In [1]:
import gensim
import annoy
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from string import punctuation
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_distances

In [2]:
data = pd.read_csv('train.csv')

In [44]:
punct = punctuation+'«»—…“”*№–'

stops = set(stopwords.words('english'))

def normalize(text):
    """Принимает строку, возвращает список строк.
       Разбивает текст по пробельным символам, приводит к нижнему регистру
       и удаляет пунктуацию на концах токенов.
       
    """
    words = [word.strip(punct) for word in text.lower().split()]
    words = [word for word in words if word]


    return words


In [45]:
data['norm'] = data.question_text.apply(normalize)

In [15]:
ft = gensim.models.FastText(data.norm, size=100, max_vocab_size=100000)

In [26]:
vocab = Counter()
for sent in data.norm:
    vocab.update(sent)
vocab = {v for v,c in vocab.most_common(30000)}

In [28]:
X = np.zeros((len(vocab), 100))
id2word = {i:word for i, word in enumerate(vocab)}
word2id = {word:i for i, word in enumerate(vocab)}
for i,word in id2word.items():
    try:
        vec = ft[word]
    except (KeyError, ValueError):
        continue
    
    X[i] = vec


  


In [40]:
%%time
cosine_distances(X[1].reshape(-1,1), X.reshape(-1,1))

CPU times: user 2.94 s, sys: 1.18 s, total: 4.12 s
Wall time: 1.63 s


array([[2., 0., 2., ..., 2., 2., 0.],
       [2., 0., 2., ..., 2., 2., 0.],
       [2., 0., 2., ..., 2., 2., 0.],
       ...,
       [2., 0., 2., ..., 2., 2., 0.],
       [0., 2., 0., ..., 0., 0., 2.],
       [2., 0., 2., ..., 2., 2., 0.]])

In [45]:
index = annoy.AnnoyIndex(100)

In [46]:
for i, vec in enumerate(X):
    index.add_item(i, vec)

In [47]:
index.build(10)

True

In [55]:
%%time
index.get_nns_by_vector(vec, 10, search_k=1000)

CPU times: user 299 µs, sys: 10 µs, total: 309 µs
Wall time: 193 µs


[29999, 27304, 19971, 26637, 3986, 8951, 15476, 23461, 134, 5563]

## ELI5

In [79]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score

In [49]:
texts = data.norm.apply(' '.join)

In [71]:
vec = TfidfVectorizer(max_features=10000, min_df=10)
X = vec.fit_transform(texts)
y = data.target.values

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

In [94]:
clf = LogisticRegression(C=100)

In [95]:
clf.fit(X_train, y_train)



LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [96]:
preds = clf.predict(X_test)
print(f1_score(y_test, preds))

0.5586627370694504


In [106]:
?eli5.show_weights(clf, vec=vec, top=50, )

In [108]:
eli5.show_prediction(clf, '. '.join(texts[20:50]), vec=vec)

Contribution?,Feature
6.169,<BIAS>
-5.769,Highlighted in text (sum)


In [43]:
texts[50]

'ways shorten period risks'

## Stacking

In [110]:
from sklearn.linear_model import *
from sklearn.tree import DecisionTreeClassifier

In [None]:
vec = TfidfVectorizer(max_features=10000, min_df=10)
X = vec.fit_transform(texts)
y = data.target.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

In [157]:
clfs = [ LogisticRegression(class_weight='balanced'),
        LogisticRegression(), LogisticRegression(C=100), LogisticRegression(C=1000), 
       LogisticRegression(C=100, penalty='l1'), 
       ]

In [158]:
oof_preds = np.zeros((5, X_train.shape[0]))
test_preds = np.zeros((5, X_test.shape[0]))


In [159]:
kf = KFold(n_splits=3)
f1s = []
for i, clf in enumerate(clfs):
    test = np.zeros(X_test.shape[0])
    print('CLF - ', i)
    for j, (train_index, test_index) in enumerate(kf.split(X_train)):
        clf.fit(X_train[train_index], y_train[train_index])
        preds_prob = clf.predict_proba(X_train[test_index])[:,1]
        preds = clf.predict(X_train[test_index])
        
        oof_preds[i][test_index] = preds_prob
        f1s.append(f1_score(y_train[test_index], preds))
        test += clf.predict_proba(X_test)[:,1]
        print('Fold ', j)
        print('Error - ', f1_score(y_train[test_index], preds))
        print('Total error - ', np.mean(f1s))
    
    test /= 3
    test_preds[i] = test

CLF -  0




Fold  0
Error -  0.5083139182331726
Total error -  0.5083139182331726
Fold  1
Error -  0.5043550261938504
Total error -  0.5063344722135115
Fold  2
Error -  0.5050753712614096
Total error -  0.5059147718961442
CLF -  1
Fold  0
Error -  0.5129437998676005
Total error -  0.5076720288890082
Fold  1
Error -  0.5165562913907286
Total error -  0.5094488813893523
Fold  2
Error -  0.5101305245477132
Total error -  0.5095624885824125
CLF -  2
Fold  0
Error -  0.5454249005413161
Total error -  0.5146856902908272
Fold  1
Error -  0.5554292396141229
Total error -  0.5197786339562392
Fold  2
Error -  0.5444701738364697
Total error -  0.522522138387376
CLF -  3
Fold  0
Error -  0.5449811222497071
Total error -  0.5247680367736092
Fold  1
Error -  0.5547232221033175
Total error -  0.5274912354399462
Fold  2
Error -  0.5441942202551419
Total error -  0.5288831508412125
CLF -  4
Fold  0
Error -  0.5448563424327317
Total error -  0.5301118578867141
Fold  1
Error -  0.5549031692996399
Total error -  0.53

In [160]:
meta_clf = LogisticRegression()
meta_clf.fit(oof_preds.T, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [161]:
preds = meta_clf.predict(test_preds.T)

In [162]:
print(f1_score(y_test, preds))

0.5731079771773064
