In [20]:
import pandas as pd
import numpy as np
import pymorphy2
from sklearn import metrics

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression # для сравнения с линейной моделью
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
import gensim
from gensim.models import word2vec
from tqdm._tqdm_notebook import tqdm_notebook
import gc

from textwrap import wrap

# Читаем dataset'ы train & test и объединяем в один для word2vec

In [2]:
fname = r'linear_train.txt'
df = pd.read_csv(fname, names=['surname', 'target'])
df_test = pd.read_csv('linear_test.txt',names=['surname', 'target'])
df_for_w2v = pd.concat([df, df_test])


# Токенизируем

In [3]:
surname_list_test = list(df_test['surname'].values)
surname_list_train = list(df['surname'].values)

## Список списков из частичек слов

In [4]:
#укажите количество символов, на которое будем разбивать слова
n = 3

surname_tokens = []
tokens_set = set()
for surname in surname_list_train+surname_list_test:
    tmp = wrap(surname, n)
    surname_tokens.append(tmp)
    for i in tmp:
        tokens_set.add(i)

# Обучаем модель word2vec на списке строк

In [5]:
size_model=200
model = gensim.models.Word2Vec(surname_tokens,
                               sg=1,
                               size=size_model,
                               workers=8,
                               min_count=1,
                               window=4)

# Затестим CountVectorizer

In [3]:
#укажите кол-во букв, по которым будем токенизировать tokenizer=lambda x: wrap(x,n_char),
n_char = 2
count_vectorizer = CountVectorizer(analyzer='char_wb',ngram_range=(1,4), max_features=200, encoding='utf8')
for_tf_idf = count_vectorizer.fit_transform(df_for_w2v.surname.apply(str.lower).tolist())
X1 = TfidfTransformer().fit_transform(for_tf_idf)

In [6]:
count_vectorizer.get_feature_names()

[' ',
 ' а',
 ' б',
 ' в',
 ' г',
 ' д',
 ' з',
 ' и',
 ' к',
 ' ко',
 ' л',
 ' м',
 ' н',
 ' о',
 ' п',
 ' по',
 ' пр',
 ' р',
 ' с',
 ' т',
 ' у',
 ' ф',
 'а',
 'а ',
 'ав',
 'ад',
 'ак',
 'ал',
 'ам',
 'ам ',
 'ами',
 'ами ',
 'ан',
 'ани',
 'ар',
 'ас',
 'ат',
 'ах',
 'ах ',
 'б',
 'ба',
 'бо',
 'в',
 'в ',
 'ва',
 'ван',
 'вани',
 'ве',
 'ви',
 'во',
 'г',
 'га',
 'го',
 'д',
 'да',
 'де',
 'ди',
 'до',
 'е',
 'е ',
 'ев',
 'ед',
 'ей',
 'ек',
 'ел',
 'ем',
 'ем ',
 'ен',
 'ени',
 'ер',
 'ес',
 'ет',
 'ж',
 'з',
 'за',
 'и',
 'и ',
 'ив',
 'ие',
 'ие ',
 'из',
 'ии',
 'ии ',
 'ик',
 'ика',
 'ил',
 'ин',
 'ир',
 'ис',
 'ит',
 'ия',
 'ия ',
 'й',
 'й ',
 'к',
 'к ',
 'ка',
 'ка ',
 'кам',
 'ке',
 'ки',
 'ки ',
 'ко',
 'ку',
 'л',
 'ла',
 'ле',
 'ли',
 'ло',
 'ль',
 'м',
 'м ',
 'ма',
 'ме',
 'ми',
 'ми ',
 'мо',
 'н',
 'на',
 'не',
 'ни',
 'ние',
 'ние ',
 'ник',
 'ния',
 'но',
 'нос',
 'ност',
 'нт',
 'о',
 'об',
 'ов',
 'ов ',
 'ова',
 'ог',
 'од',
 'ой',
 'ой ',
 'ок',
 'ол',
 'о

In [4]:
X1_arr = X1.toarray()

## Добавим полученные признаки в датасет

In [5]:
df_for_w2v= df_for_w2v.join(pd.DataFrame(X1_arr))

# Затестим TF-IDF

In [3]:
tf_idf_vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4), min_df = 1, max_features=500)
X1 = tf_idf_vectorizer.fit_transform(df_for_w2v['surname'])

In [4]:
X1_arr = X1.toarray()

In [20]:
X1_arr.shape

(290328, 500)

## Добавим полученные признаки в датасет

In [5]:
df_for_w2v= df_for_w2v.join(pd.DataFrame(X1_arr))

In [6]:
tf_idf_vectorizer.get_feature_names()

[' а',
 ' б',
 ' ба',
 ' бе',
 ' бо',
 ' в',
 ' ве',
 ' во',
 ' вы',
 ' г',
 ' го',
 ' гр',
 ' д',
 ' де',
 ' ди',
 ' до',
 ' ж',
 ' з',
 ' за',
 ' и',
 ' ин',
 ' к',
 ' ка',
 ' ко',
 ' кон',
 ' кр',
 ' л',
 ' м',
 ' ма',
 ' ме',
 ' ми',
 ' мо',
 ' н',
 ' на',
 ' не',
 ' о',
 ' об',
 ' от',
 ' п',
 ' па',
 ' пе',
 ' пер',
 ' по',
 ' под',
 ' пр',
 ' пре',
 ' при',
 ' про',
 ' р',
 ' ра',
 ' ре',
 ' с',
 ' са',
 ' се',
 ' со',
 ' ст',
 ' т',
 ' те',
 ' тр',
 ' у',
 ' ф',
 ' х',
 ' ч',
 ' ш',
 ' э',
 'а ',
 'аб',
 'ав',
 'аг',
 'ад',
 'аж',
 'аз',
 'ай',
 'ак',
 'ал',
 'али',
 'аль',
 'ам',
 'ам ',
 'ами',
 'ами ',
 'ан',
 'ана',
 'ани',
 'ание',
 'ания',
 'ант',
 'ап',
 'ар',
 'ара',
 'ари',
 'ас',
 'аст',
 'ат',
 'ате',
 'ател',
 'ато',
 'атор',
 'ах',
 'ах ',
 'ац',
 'аци',
 'ач',
 'ба',
 'бе',
 'би',
 'бл',
 'бо',
 'бр',
 'бу',
 'в ',
 'ва',
 'ва ',
 'ван',
 'вани',
 'ве',
 'вер',
 'ви',
 'вк',
 'вл',
 'во',
 'ву',
 'вы',
 'га',
 'ге',
 'ги',
 'го',
 'гр',
 'гра',
 'гу',
 'да',
 'де'

# Осредняем вектора для каждого списка в списке

In [6]:
surname_vectors = []
for sur in surname_tokens:
    summ = 0
    for i in range(len(sur)):
        summ += model[sur[i]]
    surname_vectors.append(summ/len(sur))
surname_vectors = (np.array(surname_vectors)).transpose()

# Закидываем полученные признаки в dataset

In [7]:
for col_name, obj in zip(range(size_model), surname_vectors):
    df_for_w2v[str(col_name)] = obj

In [8]:
df_for_w2v.head()

Unnamed: 0,surname,target,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
0,Аалтонен,1.0,0.130743,-0.07167,0.076392,0.040199,0.023325,0.03511,0.058669,-0.148136,...,-0.148236,0.04139,0.135459,-0.136253,-0.013069,0.072999,-0.123382,-0.082432,-0.038002,0.123685
1,Аар,0.0,0.017861,-0.008172,0.092007,0.008049,0.042702,0.002236,0.045443,-0.047784,...,-0.099428,0.029325,0.102985,-0.007652,-0.004967,0.051442,-0.032436,-0.174516,-0.010901,0.009066
2,Аарон,0.0,0.142619,-0.058156,0.108189,0.117475,-0.034743,-0.053019,0.087382,-0.115464,...,-0.121473,0.005886,0.04782,-0.037615,-0.067349,0.058381,-0.036318,-0.135929,-0.028989,0.076096
3,ААРОН,0.0,0.037458,-0.053239,0.117955,0.03019,-0.042377,0.0527,0.09985,-0.077576,...,-0.04707,0.014399,0.079604,0.046148,0.023387,-0.003161,0.038894,-0.039309,-0.053187,-0.084351
4,Аарона,0.0,0.084017,-0.171743,0.112865,0.07275,-0.039378,0.015014,0.074812,-0.174842,...,-0.202106,0.122581,0.09587,-0.111063,-0.271469,0.117214,-0.014134,-0.107673,0.041313,0.083361


# Добавим собственные признаки

In [8]:
df_for_w2v['2end'] = df_for_w2v['surname'].str[-2:].str.lower()
df_for_w2v['2end'] = df_for_w2v['surname'].str[-2:].str.lower()

In [10]:
skonca=5
def ovev(Famils):
    num=[]
    for els in Famils:    
        if ('ов' in els[-skonca:]) or ('ОВ' in els[-skonca:]) or ('ев' in els[-skonca:]) or ('ЕВ' in els[-skonca:]) or ('ёв' in els[-skonca:]) or ('ЁВ' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))
    
def inyn(Famils):
    num=[]
    for els in Famils:
        if ('ин' in els[-skonca:]) or ('ИН' in els[-skonca:]) or ('ын' in els[-skonca:]) or ('ЫН' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))
    
def skck(Famils):
    num=[]
    for els in Famils:
        if ('ск' in els[-skonca:]) or ('СК' in els[-skonca:]) or('цк' in els[-skonca:]) or ('ЦК' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def bigl(Famils):
    num=[]
    for els in Famils:
        if els[0] in 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯ':
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def glasn2(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        if j==2:
            num.append(1)
        else:
            num.append(0)
        j=0
    return(np.array(num))

def glasn3(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        if j==3:
            num.append(1)
        else:
            num.append(0)
        j=0
    return(np.array(num))

def glasn4(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        if j==4:
            num.append(1)
        else:
            num.append(0)
        j=0
    return(np.array(num))

def slogs(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        num.append(j)
        j=0
    return(np.array(num))

def lens(Famils):
    num=[]
    for els in Famils:    
        num.append(len(els))
    return(np.array(num))

def anyan(Famils):
    num=[]
    for els in Famils:
        if ('ан' in els[-skonca:]) or ('АН' in els[-skonca:]) or ('ян' in els[-skonca:]) or ('ЯН' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def nko(Famils):
    num=[]
    for els in Famils:
        if ('нко' in els[-skonca:]) or ('нк' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def ayn(Famils):
    num=[]
    for els in Famils:
        if ('айн' in els[-skonca:]) or ('АЙН' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def rs(Famils):
    num=[]
    for els in Famils:
        if ('рс' in els[-skonca:]) or ('РС' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))



In [11]:
df_for_w2v['ovev']=ovev(df_for_w2v.surname)
df_for_w2v['inyn']=inyn(df_for_w2v.surname)
df_for_w2v['skck']=skck(df_for_w2v.surname)
#df_for_w2v['bigl']=bigl(df_for_w2v.surname)
# df_sorted_cut['glasn2']=glasn2(df_sorted_cut.surname)
# df_sorted_cut['glasn3']=glasn3(df_sorted_cut.surname)
# df_sorted_cut['glasn4']=glasn4(df_sorted_cut.surname)
df_for_w2v['slogs']=slogs(df_for_w2v.surname)
#df_for_w2v['lens'] = lens(df_for_w2v.surname)
#df_for_w2v['anyan']=anyan(df_for_w2v.surname)
df_for_w2v['nko']=nko(df_for_w2v.surname)
#df_for_w2v['ayn'] = ayn(df_for_w2v.surname)
#df_for_w2v['rs']=rs(df_for_w2v.surname)

In [12]:
volwes = 'аеёиоуыэюя'
consonant = 'бвгджзйклмнпрстфхцчшщъьа'
alphabet = set(volwes) | set(consonant)

good_ends = ['цкий',  'ский', 'цкая', 'ская',
             'нен', 'ына', 'ина', 'ёва', 'ева', 'ова',
             'ым',  'на', 'ых', 'их', 'ын', 'ин', 'ёв', 'ев', 'ов']

In [13]:
def symbols(sentence , n):
    res = []
    for s in sentence:
        res.extend(list(s.lower()))
    return set(res)


all_symbols = symbols(df_for_w2v.surname.values , 1)
bad_symbols = all_symbols ^ alphabet

In [17]:
for s in tqdm_notebook(good_ends):
    df_for_w2v['last_' + s] = df_for_w2v['surname'].apply(lambda x: x[-len(s):].lower() == s).astype('int8')

for s in tqdm_notebook(alphabet):
    df_for_w2v['count_' + s] = df_for_w2v['surname'].apply(lambda x: x.lower().count(s)).astype('int8')

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




HBox(children=(IntProgress(value=0, max=33), HTML(value='')))




In [22]:
n_of_gramms = [1,2] 
index_smoof = [4,6]
for j , n in tqdm_notebook(zip(n_of_gramms , index_smoof) , total=len(n_of_gramms)):
    for i in range(n):
        col_forward = 'forward_' +str(j) + '_' + str(i)
        df_for_w2v[col_forward] = df_for_w2v['surname'].apply(str.lower).apply(lambda x: x[i:i+j] if len(x) > i + j - 1 else 'nan')
        df_for_w2v[col_forward] = LabelEncoder().fit_transform(df_for_w2v[col_forward])

        col_reverse = 'reverse_' +str(j) + '_' + str(i)
        df_for_w2v[col_reverse] = df_for_w2v['surname'].apply(str.lower).apply(lambda x: x[::-1][i:i+j]  if len(x) > i + j - 1 else 'nan')
        df_for_w2v[col_reverse] = LabelEncoder().fit_transform(df_for_w2v[col_reverse])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [15]:
from tqdm._tqdm_notebook import tqdm_notebook

def vowels_count(x):
    return len([s for s in x.lower() if s in volwes])

def consonant_count(x):
    return len([s for s in x.lower() if s in consonant])

def divide_vov_by_cons(x):
    return vowels_count(x)/(consonant_count(x) + 0.001)

def bad_count(x):
    return len([s for s in x.lower() if s in bad_symbols])

def have_good_end(x):
    if (x[-2:] in good_ends) or (x[-3:] in good_ends) or (x[-4:] in good_ends):
        return 1
    return 0

func = [str.isupper , str.istitle , len , vowels_count , consonant_count, divide_vov_by_cons, bad_count , have_good_end]
columns = ['isupper' , 'istitle' , 'len' , 'vowels_count' , 'consonant_count' , 'divide_vov_by_cons' ,'bad_count' , 'have_good_end']
for f ,col in tqdm_notebook(zip(func, columns) , total = len(func)):
    df_for_w2v[col] = df_for_w2v['surname'].apply(f).astype('int8')

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [40]:
morph = pymorphy2.MorphAnalyzer()

def name_score(word):
    for p in morph.parse(word):
        if 'Name' in p.tag:
            return p.score
    return 0

def surn_score(word):
    for p in morph.parse(word):
        if 'Surn' in p.tag:
            return p.score
    return 0

df_for_w2v['pymorphy_word_is_known'] = df_for_w2v['surname'].apply(morph.word_is_known).astype('int8')
df_for_w2v['pymorphy_count_in_tag'] = df_for_w2v['surname'].apply(lambda x: len(morph.tag(x))).astype('int8')
df_for_w2v['pymorphy_score'] = df_for_w2v['surname'].apply(lambda x: morph.parse(x)[0].score)
df_for_w2v['pymorphy'] = df_for_w2v['surname'].apply(lambda x: morph.tag(x)[0])

df_for_w2v['pymorphy_animacy'] = df_for_w2v['pymorphy'].apply(lambda x: x.animacy)
df_for_w2v['pymorphy_POS'] = df_for_w2v['pymorphy'].apply(lambda x: x.POS)
df_for_w2v['pymorphy_case'] = df_for_w2v['pymorphy'].apply(lambda x: x.case)
df_for_w2v['pymorphy_number'] = df_for_w2v['pymorphy'].apply(lambda x: x.number)
df_for_w2v['pymorphy_gender'] = df_for_w2v['pymorphy'].apply(lambda x: x.gender)

df_for_w2v['pymorphy_name_score'] = df_for_w2v['surname'].apply(name_score)
df_for_w2v['pymorphy_surn_score'] = df_for_w2v['surname'].apply(surn_score)

columns_to_one_hot = ['pymorphy' , 'pymorphy_animacy', 'pymorphy_POS', 'pymorphy_case','pymorphy_number', 'pymorphy_gender']

for col in columns_to_one_hot:
    df_for_w2v[col] = LabelEncoder().fit_transform(list(df_for_w2v[col].fillna('nan')))

# Сделаем выборку 50/50 из всех 1 и рандомных 0

In [21]:
df_1 = df_for_w2v[df_for_w2v['target']==1]
df_0 = df_for_w2v[df_for_w2v['target']==0].sample(10000, random_state=1)
df_without_1 = df_for_w2v[df_for_w2v['target'] != 1]
df_test = df_without_1[df_without_1['target'] != 0]
# объединяем два dataframe выше + перемешиваем рандомно строчки
df_sorted_cut = pd.concat([df_1, df_0],
                          ignore_index=True
                         ).sample(frac=1)

## Альтернативная версия на всю тестовую выборку 100к

In [41]:
df_1 = df_for_w2v[df_for_w2v['target']==1]
df_0 = df_for_w2v[df_for_w2v['target']==0]
df_without_1 = df_for_w2v[df_for_w2v['target'] != 1]
df_test = df_without_1[df_without_1['target'] != 0]
df_sorted_cut = pd.concat([df_1, df_0],
                          ignore_index=True).sample(frac=1)

# Выделим матрицу признаков и ответов

In [42]:
features=list(df_sorted_cut.columns)[2:]

In [43]:
train=df_sorted_cut

In [44]:
train_full = df_sorted_cut[features]

In [45]:
train_full.shape

(101408, 297)

In [46]:
test_full = df_test[features]

In [47]:
test_full.shape

(188920, 297)

In [8]:
X =[]
for name in features:
    X.append(df_sorted_cut[name].values)
X = np.array(X)
X = np.transpose(X)
y = df_sorted_cut['target'].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

In [13]:
df_sorted_cut

Unnamed: 0,surname,target,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
64171,пирсах,0.0,0.166958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
28910,дизайнеру,0.0,0.144685,0.000000,0.000000,0.000000,0.000000,0.293013,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
44856,куплетист,0.0,0.160355,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
32609,Задержкой,0.0,0.133826,0.000000,0.000000,0.000000,0.000000,0.000000,0.307261,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
73745,радиостудий,0.0,0.118986,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
100560,Эссенция,0.0,0.150140,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.334533,0.000000,0.0,0.238111,0.308555,0.0
74053,Раздачей,0.0,0.155514,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
70517,Принцу,0.0,0.181638,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
38954,кастрюле,0.0,0.171101,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.328316,0.0,0.000000,0.000000,0.0
40864,койка,0.0,0.167395,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0


# Логистическая регрессия

In [14]:
log_reg = LogisticRegression(penalty='l2', n_jobs=4, random_state=42,
                             class_weight='balanced',
                             C = 0.1, max_iter=50, solver='sag')

In [15]:
log_reg.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='warn', n_jobs=4, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
log_reg_predictions = log_reg.predict(X_test)

In [17]:
metrics.roc_auc_score(y_test, log_reg_predictions)

0.7527606860425768

# Random Forest

In [18]:
rnd_frst = RandomForestClassifier(n_estimators=50, criterion='entropy')

In [19]:
rnd_frst.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
rnd_frst_predictions = rnd_frst.predict(X_test)

In [21]:
metrics.roc_auc_score(y_test, rnd_frst_predictions)

0.587351091458601

# Градиентный бустинг

In [23]:
#lgbm_cl = LGBMClassifier(n_estimators=670, num_leaves=65, max_depth=40)
xgb_cl = XGBClassifier(n_jobs =4, max_depth = 10 , n_estimators=670 , learning_rate=0.09 , colsample_bytree=0.9 , colsample_bylevel=0.6)

In [24]:
#lgbm_cl.fit(X_train, y_train)
xgb_cl.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0,
              learning_rate=0.09, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=670, n_jobs=4,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [25]:
#lgbm_cl_predictions = lgbm_cl.predict(X_test)
xgb_cl_predictions = xgb_cl.predict(X_test)

In [26]:
#print(metrics.roc_auc_score(y_test, lgbm_cl_predictions))
print(metrics.roc_auc_score(y_test, xgb_cl_predictions))

0.6468448849319197



## Оценим с помощью вероятности и попробуем выбрать порог

In [197]:
#вероятность того, что объект принадлежит 1-му классу:
lgbm_cl_proba = lgbm_cl.predict_proba(X_test)[:,1]
xgb_cl_proba = xgb_cl.predict_proba(X_test)[:,1]

In [210]:
#укажите порог
T = 0.5
lgbm_cl_probabilities = lgbm_cl_proba > T
xgb_cl_probabilities = xgb_cl_proba > T

In [211]:
print(metrics.roc_auc_score(y_test, lgbm_cl_probabilities))
print(metrics.roc_auc_score(y_test, xgb_cl_probabilities))

0.8171836114469593
0.782119810754145


# Определим функции для каждого алгоритма, которые будут возвращать список фамилий и список предсказаний

In [29]:
def derevo(X_test,df):
    d3 = DecisionTreeClassifier()
    d3.fit(X_train, y_train)
    d3_predictions = d3.predict(X_test)
    return(list(df.surname),d3_predictions)       

In [30]:
def bagging(X_test,df):
    d3_bagging = BaggingClassifier(d3, random_state=1)
    d3_bagging.fit(X_train, y_train)
    d3_bagging_predictions = d3_bagging.predict(X_test)
    #print(metrics.roc_auc_score(y_test, d3_bagging_predictions))
    return list(df.surname),d3_bagging_predictions

In [31]:
def random_forest(X_test,df):
    rnd_frst = RandomForestClassifier()
    rnd_frst.fit(X_train, y_train)
    rnd_frst_predictions = rnd_frst.predict(X_test)
    return list(df.surname),rnd_frst_predictions

In [32]:
def log_regr(X_test,df):
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    log_reg_predictions = log_reg.predict(X_test)
    return list(df.surname),log_reg_predictions

In [33]:
def boost_xgb(X_test,df):
    xgb_cl = XGBClassifier()
    xgb_cl.fit(X_train, y_train)
    xgb_cl_predictions = xgb_cl.predict(X_test)
    return list(df.surname),xgb_cl_predictions

In [34]:
def boost_lgbm(X_test,df):
    lgbm_cl = LGBMClassifier()
    lgbm_cl.fit(X_train, y_train)
    lgbm_cl_predictions = lgbm_cl.predict(X_test)
    return list(df.surname),lgbm_cl_predictions

# Запишем файл ответов

In [27]:
features=list(df_test.columns)[2:]
X_test =[]
for name in features:
    X_test.append(df_test[name].values)
X_test = np.array(X_test)
X_test = np.transpose(X_test)

## Логистическая регрессия

In [46]:
log_reg = LogisticRegression()
log_reg.fit(X, y)
test_predictions = log_reg.predict(X_test)

## Бустинг

In [12]:
lgbm_cl.fit(X, y)
test_predictions = lgbm_cl.predict(X_test)

In [46]:
#это для вероятности
test_predictions_ans = list(map(float, test_predictions[:,0]))

In [42]:
xgb_cl.fit(X,y)
test_predictions = xgb_cl.predict_proba(X_test)

In [48]:
with open('xgb_boost_cv_proba_0.txt', 'w', encoding='utf8') as file_handler:
    file_handler.write('Id,Answer\n')
    for i in range(len(test_predictions_ans)):
        file_handler.write(str(i)+','+str(test_predictions_ans[i])+'\n')
file_handler.close()

In [35]:
def validate(x , y):
    model = XGBClassifier(n_jobs=4,max_depth = 10 , n_estimators=670 , learning_rate=0.09 , colsample_bytree=0.9 , colsample_bylevel=0.6)
    cv = StratifiedKFold(4 ,shuffle=True, random_state=99)
    score = cross_val_score(model , x , y , scoring='roc_auc' , cv=cv)
    print (score.mean() , score.std() , '\n')

In [36]:
validate(X, y)

0.8690317550939675 0.0024336891947813423 



# catboost


In [48]:
cat_features = np.where(train_full.dtypes == 'object')[0].tolist()

In [49]:
ctb = CatBoostClassifier(random_seed=17, iterations=2000, loss_function='CrossEntropy', eval_metric='AUC')
ctb.fit(train_full, train['target'], cat_features=cat_features);

Learning rate set to 0.033391
0:	total: 155ms	remaining: 5m 10s
1:	total: 311ms	remaining: 5m 10s
2:	total: 454ms	remaining: 5m 2s
3:	total: 602ms	remaining: 5m
4:	total: 746ms	remaining: 4m 57s
5:	total: 889ms	remaining: 4m 55s
6:	total: 1.03s	remaining: 4m 54s
7:	total: 1.19s	remaining: 4m 55s
8:	total: 1.32s	remaining: 4m 53s
9:	total: 1.48s	remaining: 4m 54s
10:	total: 1.63s	remaining: 4m 54s
11:	total: 1.77s	remaining: 4m 54s
12:	total: 1.92s	remaining: 4m 53s
13:	total: 2.07s	remaining: 4m 53s
14:	total: 2.2s	remaining: 4m 50s
15:	total: 2.34s	remaining: 4m 50s
16:	total: 2.49s	remaining: 4m 50s
17:	total: 2.62s	remaining: 4m 48s
18:	total: 2.75s	remaining: 4m 47s
19:	total: 2.9s	remaining: 4m 46s
20:	total: 3.03s	remaining: 4m 45s
21:	total: 3.17s	remaining: 4m 45s
22:	total: 3.3s	remaining: 4m 43s
23:	total: 3.47s	remaining: 4m 45s
24:	total: 3.61s	remaining: 4m 44s
25:	total: 3.76s	remaining: 4m 45s
26:	total: 3.89s	remaining: 4m 44s
27:	total: 4.04s	remaining: 4m 44s
28:	tota

235:	total: 31.5s	remaining: 3m 55s
236:	total: 31.6s	remaining: 3m 55s
237:	total: 31.8s	remaining: 3m 55s
238:	total: 31.9s	remaining: 3m 55s
239:	total: 32s	remaining: 3m 54s
240:	total: 32.1s	remaining: 3m 54s
241:	total: 32.3s	remaining: 3m 54s
242:	total: 32.4s	remaining: 3m 54s
243:	total: 32.5s	remaining: 3m 54s
244:	total: 32.7s	remaining: 3m 53s
245:	total: 32.8s	remaining: 3m 53s
246:	total: 32.9s	remaining: 3m 53s
247:	total: 33.1s	remaining: 3m 53s
248:	total: 33.2s	remaining: 3m 53s
249:	total: 33.3s	remaining: 3m 53s
250:	total: 33.5s	remaining: 3m 53s
251:	total: 33.6s	remaining: 3m 53s
252:	total: 33.8s	remaining: 3m 53s
253:	total: 33.9s	remaining: 3m 53s
254:	total: 34s	remaining: 3m 52s
255:	total: 34.2s	remaining: 3m 52s
256:	total: 34.3s	remaining: 3m 52s
257:	total: 34.4s	remaining: 3m 52s
258:	total: 34.6s	remaining: 3m 52s
259:	total: 34.7s	remaining: 3m 52s
260:	total: 34.8s	remaining: 3m 52s
261:	total: 35s	remaining: 3m 51s
262:	total: 35.1s	remaining: 3m 51

465:	total: 1m 1s	remaining: 3m 22s
466:	total: 1m 1s	remaining: 3m 22s
467:	total: 1m 1s	remaining: 3m 22s
468:	total: 1m 1s	remaining: 3m 22s
469:	total: 1m 2s	remaining: 3m 21s
470:	total: 1m 2s	remaining: 3m 21s
471:	total: 1m 2s	remaining: 3m 21s
472:	total: 1m 2s	remaining: 3m 21s
473:	total: 1m 2s	remaining: 3m 21s
474:	total: 1m 2s	remaining: 3m 21s
475:	total: 1m 2s	remaining: 3m 21s
476:	total: 1m 2s	remaining: 3m 20s
477:	total: 1m 3s	remaining: 3m 20s
478:	total: 1m 3s	remaining: 3m 20s
479:	total: 1m 3s	remaining: 3m 20s
480:	total: 1m 3s	remaining: 3m 20s
481:	total: 1m 3s	remaining: 3m 20s
482:	total: 1m 3s	remaining: 3m 19s
483:	total: 1m 3s	remaining: 3m 19s
484:	total: 1m 3s	remaining: 3m 19s
485:	total: 1m 4s	remaining: 3m 19s
486:	total: 1m 4s	remaining: 3m 19s
487:	total: 1m 4s	remaining: 3m 19s
488:	total: 1m 4s	remaining: 3m 19s
489:	total: 1m 4s	remaining: 3m 19s
490:	total: 1m 4s	remaining: 3m 18s
491:	total: 1m 4s	remaining: 3m 18s
492:	total: 1m 4s	remaining:

691:	total: 1m 30s	remaining: 2m 51s
692:	total: 1m 30s	remaining: 2m 51s
693:	total: 1m 30s	remaining: 2m 50s
694:	total: 1m 30s	remaining: 2m 50s
695:	total: 1m 31s	remaining: 2m 50s
696:	total: 1m 31s	remaining: 2m 50s
697:	total: 1m 31s	remaining: 2m 50s
698:	total: 1m 31s	remaining: 2m 50s
699:	total: 1m 31s	remaining: 2m 50s
700:	total: 1m 31s	remaining: 2m 49s
701:	total: 1m 31s	remaining: 2m 49s
702:	total: 1m 31s	remaining: 2m 49s
703:	total: 1m 32s	remaining: 2m 49s
704:	total: 1m 32s	remaining: 2m 49s
705:	total: 1m 32s	remaining: 2m 49s
706:	total: 1m 32s	remaining: 2m 49s
707:	total: 1m 32s	remaining: 2m 48s
708:	total: 1m 32s	remaining: 2m 48s
709:	total: 1m 32s	remaining: 2m 48s
710:	total: 1m 32s	remaining: 2m 48s
711:	total: 1m 33s	remaining: 2m 48s
712:	total: 1m 33s	remaining: 2m 48s
713:	total: 1m 33s	remaining: 2m 48s
714:	total: 1m 33s	remaining: 2m 48s
715:	total: 1m 33s	remaining: 2m 47s
716:	total: 1m 33s	remaining: 2m 47s
717:	total: 1m 33s	remaining: 2m 47s
7

914:	total: 1m 59s	remaining: 2m 21s
915:	total: 1m 59s	remaining: 2m 21s
916:	total: 1m 59s	remaining: 2m 21s
917:	total: 1m 59s	remaining: 2m 21s
918:	total: 2m	remaining: 2m 21s
919:	total: 2m	remaining: 2m 21s
920:	total: 2m	remaining: 2m 20s
921:	total: 2m	remaining: 2m 20s
922:	total: 2m	remaining: 2m 20s
923:	total: 2m	remaining: 2m 20s
924:	total: 2m	remaining: 2m 20s
925:	total: 2m	remaining: 2m 20s
926:	total: 2m 1s	remaining: 2m 20s
927:	total: 2m 1s	remaining: 2m 20s
928:	total: 2m 1s	remaining: 2m 19s
929:	total: 2m 1s	remaining: 2m 19s
930:	total: 2m 1s	remaining: 2m 19s
931:	total: 2m 1s	remaining: 2m 19s
932:	total: 2m 2s	remaining: 2m 19s
933:	total: 2m 2s	remaining: 2m 19s
934:	total: 2m 2s	remaining: 2m 19s
935:	total: 2m 2s	remaining: 2m 19s
936:	total: 2m 2s	remaining: 2m 19s
937:	total: 2m 2s	remaining: 2m 19s
938:	total: 2m 3s	remaining: 2m 19s
939:	total: 2m 3s	remaining: 2m 18s
940:	total: 2m 3s	remaining: 2m 18s
941:	total: 2m 3s	remaining: 2m 18s
942:	total: 

1139:	total: 2m 29s	remaining: 1m 52s
1140:	total: 2m 29s	remaining: 1m 52s
1141:	total: 2m 29s	remaining: 1m 52s
1142:	total: 2m 29s	remaining: 1m 52s
1143:	total: 2m 29s	remaining: 1m 52s
1144:	total: 2m 29s	remaining: 1m 51s
1145:	total: 2m 30s	remaining: 1m 51s
1146:	total: 2m 30s	remaining: 1m 51s
1147:	total: 2m 30s	remaining: 1m 51s
1148:	total: 2m 30s	remaining: 1m 51s
1149:	total: 2m 30s	remaining: 1m 51s
1150:	total: 2m 30s	remaining: 1m 51s
1151:	total: 2m 30s	remaining: 1m 51s
1152:	total: 2m 31s	remaining: 1m 50s
1153:	total: 2m 31s	remaining: 1m 50s
1154:	total: 2m 31s	remaining: 1m 50s
1155:	total: 2m 31s	remaining: 1m 50s
1156:	total: 2m 31s	remaining: 1m 50s
1157:	total: 2m 31s	remaining: 1m 50s
1158:	total: 2m 31s	remaining: 1m 50s
1159:	total: 2m 31s	remaining: 1m 50s
1160:	total: 2m 32s	remaining: 1m 49s
1161:	total: 2m 32s	remaining: 1m 49s
1162:	total: 2m 32s	remaining: 1m 49s
1163:	total: 2m 32s	remaining: 1m 49s
1164:	total: 2m 32s	remaining: 1m 49s
1165:	total:

1355:	total: 2m 57s	remaining: 1m 24s
1356:	total: 2m 57s	remaining: 1m 24s
1357:	total: 2m 57s	remaining: 1m 23s
1358:	total: 2m 57s	remaining: 1m 23s
1359:	total: 2m 57s	remaining: 1m 23s
1360:	total: 2m 57s	remaining: 1m 23s
1361:	total: 2m 58s	remaining: 1m 23s
1362:	total: 2m 58s	remaining: 1m 23s
1363:	total: 2m 58s	remaining: 1m 23s
1364:	total: 2m 58s	remaining: 1m 22s
1365:	total: 2m 58s	remaining: 1m 22s
1366:	total: 2m 58s	remaining: 1m 22s
1367:	total: 2m 58s	remaining: 1m 22s
1368:	total: 2m 58s	remaining: 1m 22s
1369:	total: 2m 59s	remaining: 1m 22s
1370:	total: 2m 59s	remaining: 1m 22s
1371:	total: 2m 59s	remaining: 1m 22s
1372:	total: 2m 59s	remaining: 1m 21s
1373:	total: 2m 59s	remaining: 1m 21s
1374:	total: 2m 59s	remaining: 1m 21s
1375:	total: 2m 59s	remaining: 1m 21s
1376:	total: 2m 59s	remaining: 1m 21s
1377:	total: 3m	remaining: 1m 21s
1378:	total: 3m	remaining: 1m 21s
1379:	total: 3m	remaining: 1m 21s
1380:	total: 3m	remaining: 1m 20s
1381:	total: 3m	remaining: 1

1579:	total: 3m 26s	remaining: 54.8s
1580:	total: 3m 26s	remaining: 54.7s
1581:	total: 3m 26s	remaining: 54.6s
1582:	total: 3m 26s	remaining: 54.5s
1583:	total: 3m 26s	remaining: 54.3s
1584:	total: 3m 26s	remaining: 54.2s
1585:	total: 3m 27s	remaining: 54.1s
1586:	total: 3m 27s	remaining: 53.9s
1587:	total: 3m 27s	remaining: 53.8s
1588:	total: 3m 27s	remaining: 53.7s
1589:	total: 3m 27s	remaining: 53.5s
1590:	total: 3m 27s	remaining: 53.4s
1591:	total: 3m 27s	remaining: 53.3s
1592:	total: 3m 28s	remaining: 53.1s
1593:	total: 3m 28s	remaining: 53s
1594:	total: 3m 28s	remaining: 52.9s
1595:	total: 3m 28s	remaining: 52.7s
1596:	total: 3m 28s	remaining: 52.6s
1597:	total: 3m 28s	remaining: 52.5s
1598:	total: 3m 28s	remaining: 52.4s
1599:	total: 3m 28s	remaining: 52.2s
1600:	total: 3m 28s	remaining: 52.1s
1601:	total: 3m 29s	remaining: 52s
1602:	total: 3m 29s	remaining: 51.8s
1603:	total: 3m 29s	remaining: 51.7s
1604:	total: 3m 29s	remaining: 51.6s
1605:	total: 3m 29s	remaining: 51.4s
1606:

1803:	total: 3m 55s	remaining: 25.5s
1804:	total: 3m 55s	remaining: 25.4s
1805:	total: 3m 55s	remaining: 25.3s
1806:	total: 3m 55s	remaining: 25.2s
1807:	total: 3m 55s	remaining: 25s
1808:	total: 3m 55s	remaining: 24.9s
1809:	total: 3m 55s	remaining: 24.8s
1810:	total: 3m 56s	remaining: 24.6s
1811:	total: 3m 56s	remaining: 24.5s
1812:	total: 3m 56s	remaining: 24.4s
1813:	total: 3m 56s	remaining: 24.2s
1814:	total: 3m 56s	remaining: 24.1s
1815:	total: 3m 56s	remaining: 24s
1816:	total: 3m 56s	remaining: 23.9s
1817:	total: 3m 57s	remaining: 23.7s
1818:	total: 3m 57s	remaining: 23.6s
1819:	total: 3m 57s	remaining: 23.5s
1820:	total: 3m 57s	remaining: 23.3s
1821:	total: 3m 57s	remaining: 23.2s
1822:	total: 3m 57s	remaining: 23.1s
1823:	total: 3m 57s	remaining: 22.9s
1824:	total: 3m 57s	remaining: 22.8s
1825:	total: 3m 58s	remaining: 22.7s
1826:	total: 3m 58s	remaining: 22.6s
1827:	total: 3m 58s	remaining: 22.4s
1828:	total: 3m 58s	remaining: 22.3s
1829:	total: 3m 58s	remaining: 22.2s
1830:

In [50]:
sub = pd.read_csv('linear_ans_example.txt')
sub = sub.reset_index(drop=True)
sub['Answer'] = ctb.predict_proba(test_full)[:,1]
sub.to_csv('n_3__w2v_200__sg_1__2k_iters_so_manyy_dop_features.txt', columns=['Id', 'Answer'], index=False)