In [1]:
import numpy as np
import pandas as pd
import string
import nltk

In [2]:
# Load the train and test splits from CSV files in the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

(135309, 4)

### Tokenization and data cleaning

In [4]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# Russian word list; cleaning_data() filters tokens against it.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')

[nltk_data] Downloading package stopwords to /home/espero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Shared tokenizer instance; cleaning_data() uses it for both titles and URLs.
word_tokenizer = nltk.WordPunctTokenizer()

In [6]:
def cleaning_data(data, tokenizer=None, stopwords=None):
    """Tokenize and filter the ``url`` and ``title`` columns of ``data``.

    Each value is lower-cased and tokenized; tokens made up entirely of
    ``string.punctuation`` characters and tokens found in the stop-word
    collection are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with text columns ``'title'`` and ``'url'``. Rows are read
        positionally, so any index (not only ``0..n-1``) is accepted.
        Missing values are treated as empty text.
    tokenizer : object with ``tokenize(str) -> list[str]``, optional
        Defaults to the notebook-level ``word_tokenizer``.
    stopwords : iterable of str, optional
        Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    list[list[str]]
        One list per row: the URL tokens followed by the title tokens.
    """
    if tokenizer is None:
        tokenizer = word_tokenizer
    # A set gives O(1) membership checks instead of scanning a list per token.
    excluded = set(stop_words if stopwords is None else stopwords)
    punctuation = set(string.punctuation)

    def _tokens(value):
        """Return the filtered, lower-cased tokens of a single cell value."""
        text = '' if pd.isna(value) else str(value)
        return [
            token
            for token in tokenizer.tokenize(text.lower())
            # Drop pure-punctuation runs such as '://' as well as single marks.
            if not all(char in punctuation for char in token)
            and token not in excluded
        ]

    return [
        _tokens(url) + _tokens(title)
        for url, title in zip(data['url'], data['title'])
    ]

In [7]:
# Tokenized url+title documents for each split, plus integer training labels.
X_train = cleaning_data(train_df)
X_test = cleaning_data(test_df)
y_train = train_df['target'].astype(int).values

### Stemming

In [8]:
from nltk.stem.snowball import SnowballStemmer 
# Snowball stemmer configured for Russian; stemming() applies its .stem to each token.
stemmer = SnowballStemmer("russian")

In [9]:
def stemming(input_data, stem_fn=None):
    """Stem every token and join each document back into one string.

    Parameters
    ----------
    input_data : iterable of iterable of str
        Tokenized documents (one token sequence per document).
    stem_fn : callable(str) -> str, optional
        Stemming function; defaults to the notebook-level ``stemmer.stem``.

    Returns
    -------
    list[str]
        A new list holding one space-joined string of stems per document.
        ``input_data`` itself is not modified, so the call can be repeated
        on the same input and give the same result.
    """
    if stem_fn is None:
        stem_fn = stemmer.stem
    return [' '.join(map(stem_fn, tokens)) for tokens in input_data]

In [None]:
# Stem both splits; stemming() turns each document into one space-joined string.
X_train_stem = stemming(X_train)
X_test_stem = stemming(X_test)

### LSA

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [None]:
# Fit the TF-IDF vectorizer on the training documents only, with
# max_features set to 40000.
vectorizer = TfidfVectorizer(max_features = 40000)
vectorizer.fit(X_train_stem)

In [None]:
# transform() takes the list of documents directly, exactly like fit();
# wrapping it in np.array would first copy every string into a fixed-width
# unicode array sized by the longest document.
X_train_vect = vectorizer.transform(X_train_stem)
X_test_vect = vectorizer.transform(X_test_stem)

In [None]:
# Shape of the vectorized training matrix.
X_train_vect.shape

In [None]:
from sklearn.decomposition import TruncatedSVD

# Fixed seed: the default randomized solver otherwise produces slightly
# different components (and downstream scores) on every run.
svd = TruncatedSVD(n_components=1000, random_state=42)
svd.fit(X_train_vect)

In [None]:
# Project both splits with the SVD that was fitted on the training matrix.
X_train_svd = svd.transform(X_train_vect)
X_test_svd  = svd.transform(X_test_vect)

### Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# The training labels live in `y_train`; `train_y` is never defined in this
# notebook and raised NameError on a fresh kernel.
model = LogisticRegression(solver='liblinear')
model.fit(X_train_svd, y_train)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

predict = model.predict(X_test_svd)
proba   = model.predict_proba(X_test_svd)

# The metrics need ground-truth labels for the test split, which were never
# defined (`test_y` raised NameError). They are read the same way as the
# training labels; assumes test.csv may carry a 'target' column -- TODO confirm.
if 'target' in test_df.columns:
    test_y = test_df['target'].astype(int).values
    print("ACCURACY = {}".format(accuracy_score(test_y, predict)))
    print("ROC-AUC =  {}".format(roc_auc_score(test_y, proba[:, 1])))
else:
    print("test_df has no 'target' column; skipping ACCURACY / ROC-AUC")