In [36]:
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1 ms


In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict

from natasha import NamesExtractor
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

time: 9.02 ms


In [39]:
import pandas as pd
import numpy as np
import re

from collections import Counter

time: 2 ms


In [40]:
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')
sample_submission = pd.read_csv('sample_submission.csv')
train.head()

Unnamed: 0,Word,Label
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0


time: 189 ms


In [41]:
vowels = ['а',  'я', 'ё', 'у','е', 'о', 'э', 'ю', 'и', 'ы', 'Ё', 'У', 'Е', 'Ы','А', 'О', 'Э', 'Ю', 'И', 'Я']
alphabet = ['а','б','в','г','д','е','ё','ж','з','и','й','к','л','м','н','о','п','р','с','т','у','ф','х','ц','ч','ш','щ','ы','ь','ъ','э','ю','я']

time: 2 ms


In [42]:
two_let_map = {}
k = 0
for i in alphabet:
    for j in alphabet:
        two_let_map['{}{}'.format(i,j)] = k
        k+=1

time: 6.01 ms


In [43]:
def map_two_last_letters(word):
    if len(word) < 2:
        return -2
    last_two = word.lower()[-2:]
    if last_two[0] not in alphabet or last_two[1] not in alphabet:
        return -1
    return two_let_map[last_two]

time: 4.01 ms


In [44]:
def countDoubles(word):
    l = [let for let in word.lower()]
    return len([(x,y) for x,y in zip(l, l[1:]) if x == y])

time: 2.01 ms


In [45]:
def is_contain_symb(word):
    word_chars = re.sub("[^а-яА-Я]+", "", word)
    return 0 if word_chars==word else 1

time: 2.01 ms


In [46]:
def is_caps(word):
    for c in word:
        if c.islower():
            return 0
    return 1

time: 2.01 ms


In [47]:
extractor = NamesExtractor()

def has_name(text):
    matches = extractor(text).as_json
    if matches !=[]:
        for match in matches:
            if match['fact']:
                if match['fact']['last']:
                    return 1
    return 0

time: 9.02 ms


In [48]:
words_all = pd.concat([train['Word'],test['Word']])

time: 10.1 ms


In [49]:
endings_3 = []
for w in words_all:
    if len(w)>3:
        endings_3.append(w[-3:])
    else:
        continue
        
cnt = Counter(endings_3)
freq_endings_3 = dict(cnt.most_common(1000))

def get_freq_endings_3(word):
    if len(word) >= 3:
        try:
            return freq_endings_3[word[-3:]]
        except:
            return 0
    else:
        return 0

time: 128 ms


In [50]:
start_2_end_2 = []
for w in words_all:
    if len(w)>4:
        start_2_end_2.append(w[:2]+w[-2:])
    else:
        continue
        
cnt = Counter(start_2_end_2)
freq_start_2_end_2 = dict(cnt.most_common(5000))

def get_freq_start_2_end_2(word):
    if len(word) >= 4:
        try:
            return freq_start_2_end_2[word[:2]+word[-2:]]
        except:
            return 0
    else:
        return 0

time: 176 ms


In [69]:
def prepare_features(df):
    df['Lenght'] = df['Word'].apply(lambda x: len(x))
    df['Vowels'] = df['Word'].apply(lambda x: sum(1 if l in vowels else 0 for l in x))
    df['Consonants'] = df['Lenght'] - df['Vowels']
    df['Not_null_Consonants'] = df['Consonants'].apply(lambda x: 0.001 if x==0 else x)
    df['Vow/Conson'] = (df['Vowels'] / df['Not_null_Consonants']).astype(int)
    df = df.drop(columns=['Not_null_Consonants'])
    df['is_lower'] = df['Word'].apply(lambda x: 1 if x[0] == x[0].lower() else 0)
    df['Double'] = df['Word'].apply(lambda x: countDoubles(x))
    df['last_two_let'] = df['Word'].apply(lambda word : map_two_last_letters(word))
    df['contain_symb'] = df['Word'].apply(lambda word: is_contain_symb(word))
    df['is_caps'] = df['Word'].apply(lambda word: is_caps(word))
    df['freq_endings_3'] = df['Word'].apply(lambda word: get_freq_endings_3(word))
    df['freq_start_2_end_2'] = df['Word'].apply(lambda word: get_freq_start_2_end_2(word))
    df['is_name_by_nata'] = df['Word'].apply(lambda word: has_name(word))
    return df

time: 28.1 ms


In [55]:
train_all = prepare_features(train)
train_all.head()
test_all = prepare_features(test)

time: 2.84 s


In [56]:
train_all.head()

Unnamed: 0,Word,Label,Lenght,Vowels,Consonants,Vow/Conson,is_lower,Double,last_two_let,contain_symb,is_caps,freq_endings_3,freq_start_2_end_2
0,Аалтонен,1,8,4,4,1,0,1,179,0,0,0,0
1,Аар,0,3,2,1,2,0,1,17,0,0,0,0
2,Аарон,0,5,3,2,1,0,1,509,0,0,100,0
3,ААРОН,0,5,3,2,1,0,1,509,0,1,0,0
4,Аарона,0,6,4,2,2,0,1,462,0,0,1052,0


time: 11 ms


Stacking

In [26]:
class proba_logreg(LogisticRegression):
    def predict(self, X):
        return LogisticRegression.predict_proba(self, X)
    
class proba_lgbm(LGBMClassifier):
    def predict(self, X):
        return LGBMClassifier.predict_proba(self, X)
    
class proba_mnb(MultinomialNB):
    def predict(self, X):
        return MultinomialNB.predict_proba(self, X)
    
class proba_knn(KNeighborsClassifier):
    def predict(self, X):
        return KNeighborsClassifier.predict_proba(self, X)

time: 11.5 ms


In [70]:
def stack(x_train_cur,y_train,x_test,y_test):
    
    char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 3),
    max_features=50000)
        
    x_train_char = x_train_cur['Word']#.apply(lambda x: x.lower())
    x_test_char = x_test['Word']#.apply(lambda x: x.lower())
    char_vectorizer.fit(x_train_char)
    train_char_features = char_vectorizer.transform(x_train_char)
    test_char_features = char_vectorizer.transform(x_test_char)
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(x_train_cur.drop(columns=['last_two_let','Word']))
    x_test_std = scaler.transform(x_test.drop(columns=['last_two_let','Word']))
    
    
    x_train_cur = x_train_cur.drop(columns=['Word'])
    x_test = x_test.drop(columns=['Word'])
    
    gbayes = proba_mnb(alpha=100)
    y_pred_gbayes = cross_val_predict(gbayes, x_train_cur.drop(columns=['last_two_let']), y_train, cv=3)
    gbayes.fit(x_train_cur.drop(columns=['last_two_let']), y_train)
    gbayes2 = proba_mnb(alpha=1)
    y_pred_gbayes2 = cross_val_predict(gbayes2, train_char_features, y_train, cv=3)
    gbayes2.fit(train_char_features, y_train)
    lgbm = proba_lgbm(n_estimators=300)
    y_pred_lgbm = cross_val_predict(lgbm, x_train_cur, y_train, cv=3)
    lgbm.fit(x_train_cur,y_train)
    logreg_char = proba_logreg(C=0.15)
    y_pred_logreg = cross_val_predict(logreg_char, train_char_features, y_train, cv=3)
    logreg_char.fit(train_char_features,y_train)
    kn = proba_knn(40)
    y_pred_kn = cross_val_predict(kn, x_train_std, y_train, cv=3)
    kn.fit(x_train_std,y_train)

    new_x_train = pd.DataFrame()
    new_x_train['lgbm'] = y_pred_lgbm[:,1]
    new_x_train['bayes'] = y_pred_gbayes[:,1]
    new_x_train['bayes2'] = y_pred_gbayes2[:,1]
    new_x_train['logreg'] = y_pred_logreg[:,1]
    new_x_train['kn'] = y_pred_kn[:,1]
    new_x_train = pd.concat([new_x_train.reset_index(drop=True),x_train_cur.reset_index(drop=True)], axis=1, ignore_index=True)

    xgb = XGBClassifier()
    xgb.fit(new_x_train, y_train)

    y_pred_gbayes = gbayes.predict_proba(x_test.drop(columns=['last_two_let']))
    y_pred_gbayes2 = gbayes2.predict_proba(test_char_features)
    y_pred_lgbm = lgbm.predict_proba(x_test)
    y_pred_logreg = logreg_char.predict_proba(test_char_features)
    y_pred_kn = kn.predict_proba(x_test_std)
    
    new_x_test = pd.DataFrame()
    new_x_test['lgbm'] = y_pred_lgbm[:,1]
    new_x_test['bayes'] = y_pred_gbayes[:,1]
    new_x_test['bayes2'] = y_pred_gbayes2[:,1]
    new_x_test['logreg'] = y_pred_logreg[:,1]
    new_x_test['kn'] = y_pred_kn[:,1]
    new_x_test = pd.concat([new_x_test.reset_index(drop=True),x_test.reset_index(drop=True)], axis=1, ignore_index=True)
    
    y_pred = xgb.predict_proba(new_x_test)
    
    score = 0
    if y_test is not None:
        score = roc_auc_score(y_test, y_pred[:,1])
    
    return score, y_pred[:,1]


time: 133 ms


Cross validation

In [71]:
roc_auc_all = []
x_train = train_all.drop(columns=['Label'])
y_train = train_all[['Label']]
kf = StratifiedKFold(n_splits=5)
for train_idx, test_idx in kf.split(x_train, y_train):
    x_train_cur = x_train.loc[train_idx]
    y_train_cur = y_train.loc[train_idx].values.ravel()
    x_test_cur = x_train.loc[test_idx]
    y_test_cur = y_train.loc[test_idx].values.ravel()
    
    stacked = stack(x_train_cur, y_train_cur, x_test_cur, y_test_cur)
    roc_auc_all.append(stacked[0])
print(roc_auc_all)
print('Mean: {}'.format(np.mean(roc_auc_all)))

[0.9064230573058735, 0.8978151809580688, 0.8973256220174763, 0.868256740287958, 0.9056078049062455]
Mean: 0.8950856810951244
time: 3min 39s


Submition

In [31]:
x_train_cur = train_all.drop(columns=['Label'])
y_train = train_all[['Label']]
x_test = test_all

_, predict = stack(x_train, y_train.values.ravel(), x_test, None)

time: 2min 14s


In [32]:
submit = pd.DataFrame()
submit['Id'] = test.index
submit['Prediction'] = predict
submit[:5]

Unnamed: 0,Id,Prediction
0,0,0.288484
1,1,0.297076
2,2,0.221595
3,3,0.12403
4,4,0.174292


time: 21.6 ms


In [33]:
submit.to_csv('benchmark.csv', index = False)

time: 492 ms
