#Решение соревнования https://www.kaggle.com/c/secondname/

Нужно построить классификатор, определяющий, является ли слово фамилией. Для оценки качества используется площадь под ROC-кривой (AUC), так как классы сильно несбалансированны.
#### Maksim Pikalov, Public - 0.95201

Поскольку проект учебный импортируем все возможные библиотеки, чтобы выбрать наиболее подходящие варианты:

In [0]:
!pip install pymorphy2
!pip install catboost

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |████████████████████████████████| 51kB 2.6MB/s 
[?25hCollecting pymorphy2-dicts<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)
[K     |████████████████████████████████| 7.1MB 6.0MB/s 
Collecting dawg-python>=0.7
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Installing collected packages: pymorphy2-dicts, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.8 pymorphy2-dicts-2.4.393442.3710985
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/b1/61/2b8106c8870601671d99ca94d8

In [0]:
import pandas as pd
import numpy as np
import pymorphy2
from sklearn import metrics

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression # для сравнения с линейной моделью
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
import gensim
from gensim.models import word2vec
from tqdm._tqdm_notebook import tqdm_notebook
import gc

from textwrap import wrap

Поскольку в данной задаче признаки не даны в условии, то попробуем сгенерировать их самостоятельно.

## Векторизуем данные с помощью библиотеки word2vec, чтобы добавить признаки

Читаем dataset'ы train & test и объединяем в один для векторизации word2vec:

In [0]:
fname = r'linear_train.txt'
df = pd.read_csv(fname, names=['surname', 'target'])
df_test = pd.read_csv('linear_test.txt',names=['surname', 'target'])
df_for_w2v = pd.concat([df, df_test])

In [0]:
surname_list_test = list(df_test['surname'].values)
surname_list_train = list(df['surname'].values)

Разобъем наши слова на n-граммы и создадим словарь токенов:

In [0]:
#укажите количество символов, на которое будем разбивать слова
n = 3

surname_tokens = []
tokens_set = set()
for surname in surname_list_train+surname_list_test:
    tmp = wrap(surname, n)
    surname_tokens.append(tmp)
    for i in tmp:
        tokens_set.add(i)

Обучим модель word2vec на полученных токенах:

In [0]:
size_model=200
model = gensim.models.Word2Vec(surname_tokens,
                               sg=1,
                               size=size_model,
                               workers=8,
                               min_count=1,
                               window=4)

Нормируем вектора каждого токена:

In [0]:
surname_vectors = []
for sur in surname_tokens:
    summ = 0
    for i in range(len(sur)):
        summ += model[sur[i]]
    surname_vectors.append(summ/len(sur))
surname_vectors = (np.array(surname_vectors)).transpose()

## Добавим полученные признаки в dataset

In [0]:
for col_name, obj in zip(range(size_model), surname_vectors):
    df_for_w2v[str(col_name)] = obj

Посмотрим на получившийся датасет:

In [0]:
df_for_w2v.head()

Unnamed: 0,surname,target,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,Аалтонен,1.0,-0.126881,0.193165,-0.140195,0.053468,-0.059329,0.094659,0.036002,0.227655,-0.102498,0.111511,-0.071624,0.000995,0.152599,-0.056503,0.221251,-0.237311,-0.015136,-0.026903,-0.033764,0.088953,0.147848,0.085745,-0.09836,0.036319,0.041965,0.033387,0.004927,-0.042142,-0.127081,-0.010161,-0.027054,-0.02494,-0.169333,-0.141313,-0.041203,-0.20652,-0.011129,0.083747,...,0.132046,-0.043649,0.250864,-0.113117,-0.093216,-0.130815,-0.040618,0.120134,-0.038645,-0.118321,0.109578,-0.000306,0.028083,-0.059882,-0.00738,0.146751,-0.082356,-0.010775,0.399233,0.13798,-0.076179,-0.115567,0.145228,-0.053926,-0.012619,-0.145737,-0.155369,0.162389,0.077197,0.025116,0.067457,0.083182,-0.028105,0.014866,-0.114338,-0.219512,0.119048,0.082883,-0.069787,0.195463
1,Аар,0.0,-0.080772,0.052016,-0.160002,0.021338,0.063651,0.011947,-0.074008,0.205771,-0.091549,-0.025624,-0.126751,0.016024,0.064543,-0.021126,0.089526,-0.105284,0.049067,0.114424,0.086479,0.019575,0.007043,0.140352,-0.068682,0.076958,0.099506,0.014101,0.039611,-0.14295,0.01105,-0.039726,0.033159,0.040547,-0.071989,-0.070351,-0.051532,-0.125,0.040993,-0.138616,...,0.014623,-0.069318,0.022084,-0.040715,-0.018921,0.047032,-0.10311,0.131538,-0.020995,-0.147414,0.005381,-0.008031,-0.061958,-0.003336,0.001869,0.119736,-0.088922,0.028619,0.154339,0.063582,-0.12127,-0.077987,0.143251,-0.029302,0.027491,-0.102904,-0.017842,0.044877,0.0365,-0.054464,-0.096489,-0.002123,-0.030909,0.110391,0.012372,-0.01243,0.157062,0.088149,-0.05374,0.140645
2,Аарон,0.0,-0.045483,0.086226,-0.138923,0.032169,-0.021871,0.06738,0.018325,0.250136,0.060061,0.039556,-0.109976,0.019396,0.218807,0.037863,0.159191,-0.158129,0.018382,-0.050888,0.041621,0.213312,-0.008611,0.111581,-0.014652,0.129909,0.149864,-0.015612,-0.006809,-0.10152,-0.089729,0.06349,-0.083011,-0.038346,-0.099085,-0.149796,-0.026407,-0.175994,-0.033388,-0.136453,...,0.146057,-0.293466,0.139456,-0.142249,0.008939,-0.109554,-0.019694,0.107862,-0.014111,-0.34238,0.057851,0.102186,-0.078472,-0.157357,-0.196779,0.061705,-0.186368,-0.101563,0.309247,0.015934,-0.089883,-0.135571,-0.003885,-0.146919,0.069419,-0.139836,-0.151164,0.004609,-0.0341,-0.08826,-0.169596,0.129118,0.016354,0.058473,0.00403,-0.117246,0.1926,0.12541,-0.093792,0.206888
3,ААРОН,0.0,-0.052202,0.036116,-0.103213,0.03184,-0.041998,0.011425,-0.048646,0.135283,-0.055702,0.000457,-0.000986,0.098825,0.033322,0.036945,0.074502,-0.119927,-0.053707,0.125588,0.013175,0.114303,0.01565,0.133331,-0.083285,0.021404,0.053206,-0.093577,-0.101801,-0.135361,-0.088055,0.061581,0.015932,-0.039066,0.017647,-0.003945,0.062306,0.009184,0.013805,-0.161712,...,-0.001432,-0.024411,0.175622,-0.01764,0.042979,-0.092446,0.035871,0.033027,0.101192,-0.040471,-0.020088,0.024735,-0.070389,0.034896,-0.0606,0.183898,-0.041574,-0.007211,0.141584,0.024289,0.067908,-0.125453,0.074177,-0.006903,-0.032807,-0.094309,0.025117,-0.136672,-0.005839,-0.014987,-0.003359,0.039486,0.01003,0.102346,0.056408,-0.04788,0.081251,0.119332,-0.066289,0.048099
4,Аарона,0.0,-0.003901,0.207774,-0.274087,-0.062856,0.072914,-0.030021,0.068625,0.305421,0.040595,0.079357,-0.168823,0.057694,0.533695,0.062513,0.243652,0.02505,0.123149,-0.051126,0.22166,0.063741,-0.122191,0.330225,0.02297,0.212448,0.222876,0.025225,0.147668,0.049546,-0.061486,-0.055336,-0.00371,0.05278,0.034546,-0.12022,-0.221384,-0.132266,-0.046504,-0.148413,...,0.2042,-0.289162,0.050188,-0.183581,-0.09915,0.076052,-0.065948,0.159439,-0.023317,-0.439599,-0.006806,0.133672,0.046212,-0.237398,-0.323297,-0.145989,-0.170266,-0.195154,0.285267,-0.141633,-0.172047,-0.258864,-0.055844,0.03522,0.204696,-0.216512,-0.224943,-0.096119,-0.124973,-0.153802,-0.227883,0.239974,0.136791,-0.027239,0.218738,0.163633,0.175529,0.087085,-0.016801,0.184302


## Вручную сгенерирум дополнительные признаки

Сформируем функции, которые будут определять:

1) начинается ли слово с заглавной буквы,

2) наличие в слове характерных для фамилий суффиксов и окончаний,

3) количество слогов и гласных/согласных букв в слове.

In [0]:
skonca=5
def ovev(Famils):
    num=[]
    for els in Famils:    
        if ('ов' in els[-skonca:]) or ('ОВ' in els[-skonca:]) or ('ев' in els[-skonca:]) or ('ЕВ' in els[-skonca:]) or ('ёв' in els[-skonca:]) or ('ЁВ' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))
    
def inyn(Famils):
    num=[]
    for els in Famils:
        if ('ин' in els[-skonca:]) or ('ИН' in els[-skonca:]) or ('ын' in els[-skonca:]) or ('ЫН' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))
    
def skck(Famils):
    num=[]
    for els in Famils:
        if ('ск' in els[-skonca:]) or ('СК' in els[-skonca:]) or('цк' in els[-skonca:]) or ('ЦК' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def bigl(Famils):
    num=[]
    for els in Famils:
        if els[0] in 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯ':
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def glasn2(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        if j==2:
            num.append(1)
        else:
            num.append(0)
        j=0
    return(np.array(num))

def glasn3(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        if j==3:
            num.append(1)
        else:
            num.append(0)
        j=0
    return(np.array(num))

def glasn4(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        if j==4:
            num.append(1)
        else:
            num.append(0)
        j=0
    return(np.array(num))

def slogs(Famils):
    num=[]
    j=0
    let='аоеёуыяэюи'
    for els in Famils:
        for lets in els:
            if lets in let or lets in let.upper() :
                j+=1
        num.append(j)
        j=0
    return(np.array(num))

def lens(Famils):
    num=[]
    for els in Famils:    
        num.append(len(els))
    return(np.array(num))

def anyan(Famils):
    num=[]
    for els in Famils:
        if ('ан' in els[-skonca:]) or ('АН' in els[-skonca:]) or ('ян' in els[-skonca:]) or ('ЯН' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def nko(Famils):
    num=[]
    for els in Famils:
        if ('нко' in els[-skonca:]) or ('нк' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def ayn(Famils):
    num=[]
    for els in Famils:
        if ('айн' in els[-skonca:]) or ('АЙН' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))

def rs(Famils):
    num=[]
    for els in Famils:
        if ('рс' in els[-skonca:]) or ('РС' in els[-skonca:]):
            num.append(1)
        else:
            num.append(0)
    return(np.array(num))



In [0]:
df_for_w2v['ovev']=ovev(df_for_w2v.surname)
df_for_w2v['inyn']=inyn(df_for_w2v.surname)
df_for_w2v['skck']=skck(df_for_w2v.surname)
#df_for_w2v['bigl']=bigl(df_for_w2v.surname)
# df_sorted_cut['glasn2']=glasn2(df_sorted_cut.surname)
# df_sorted_cut['glasn3']=glasn3(df_sorted_cut.surname)
# df_sorted_cut['glasn4']=glasn4(df_sorted_cut.surname)
df_for_w2v['slogs']=slogs(df_for_w2v.surname)
#df_for_w2v['lens'] = lens(df_for_w2v.surname)
#df_for_w2v['anyan']=anyan(df_for_w2v.surname)
df_for_w2v['nko']=nko(df_for_w2v.surname)
#df_for_w2v['ayn'] = ayn(df_for_w2v.surname)
#df_for_w2v['rs']=rs(df_for_w2v.surname)

In [0]:
volwes = 'аеёиоуыэюя'
consonant = 'бвгджзйклмнпрстфхцчшщъьа'
alphabet = set(volwes) | set(consonant)

good_ends = ['цкий',  'ский', 'цкая', 'ская',
             'нен', 'ына', 'ина', 'ёва', 'ева', 'ова',
             'ым',  'на', 'ых', 'их', 'ын', 'ин', 'ёв', 'ев', 'ов']

In [0]:
def symbols(sentence , n):
    res = []
    for s in sentence:
        res.extend(list(s.lower()))
    return set(res)


all_symbols = symbols(df_for_w2v.surname.values , 1)
bad_symbols = all_symbols ^ alphabet

In [0]:
for s in tqdm_notebook(good_ends):
    df_for_w2v['last_' + s] = df_for_w2v['surname'].apply(lambda x: x[-len(s):].lower() == s).astype('int8')

for s in tqdm_notebook(alphabet):
    df_for_w2v['count_' + s] = df_for_w2v['surname'].apply(lambda x: x.lower().count(s)).astype('int8')

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




HBox(children=(IntProgress(value=0, max=33), HTML(value='')))




In [0]:
n_of_gramms = [1,2] 
index_smoof = [4,6]
for j , n in tqdm_notebook(zip(n_of_gramms , index_smoof) , total=len(n_of_gramms)):
    for i in range(n):
        col_forward = 'forward_' +str(j) + '_' + str(i)
        df_for_w2v[col_forward] = df_for_w2v['surname'].apply(str.lower).apply(lambda x: x[i:i+j] if len(x) > i + j - 1 else 'nan')
        df_for_w2v[col_forward] = LabelEncoder().fit_transform(df_for_w2v[col_forward])

        col_reverse = 'reverse_' +str(j) + '_' + str(i)
        df_for_w2v[col_reverse] = df_for_w2v['surname'].apply(str.lower).apply(lambda x: x[::-1][i:i+j]  if len(x) > i + j - 1 else 'nan')
        df_for_w2v[col_reverse] = LabelEncoder().fit_transform(df_for_w2v[col_reverse])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [0]:
def vowels_count(x):
    return len([s for s in x.lower() if s in volwes])

def consonant_count(x):
    return len([s for s in x.lower() if s in consonant])

def divide_vov_by_cons(x):
    return vowels_count(x)/(consonant_count(x) + 0.001)

def bad_count(x):
    return len([s for s in x.lower() if s in bad_symbols])

def have_good_end(x):
    if (x[-2:] in good_ends) or (x[-3:] in good_ends) or (x[-4:] in good_ends):
        return 1
    return 0

func = [str.isupper , str.istitle , len , vowels_count , consonant_count, divide_vov_by_cons, bad_count , have_good_end]
columns = ['isupper' , 'istitle' , 'len' , 'vowels_count' , 'consonant_count' , 'divide_vov_by_cons' ,'bad_count' , 'have_good_end']
for f ,col in tqdm_notebook(zip(func, columns) , total = len(func)):
    df_for_w2v[col] = df_for_w2v['surname'].apply(f).astype('int8')

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




Попробуем добавить ещё признаков, применив библиотеку pymorphy2 и проанализировав слова на соответствие некоторым категориям.

In [0]:
morph = pymorphy2.MorphAnalyzer()

def name_score(word):
    for p in morph.parse(word):
        if 'Name' in p.tag:
            return p.score
    return 0

def surn_score(word):
    for p in morph.parse(word):
        if 'Surn' in p.tag:
            return p.score
    return 0

df_for_w2v['pymorphy_word_is_known'] = df_for_w2v['surname'].apply(morph.word_is_known).astype('int8')
df_for_w2v['pymorphy_count_in_tag'] = df_for_w2v['surname'].apply(lambda x: len(morph.tag(x))).astype('int8')
df_for_w2v['pymorphy_score'] = df_for_w2v['surname'].apply(lambda x: morph.parse(x)[0].score)
df_for_w2v['pymorphy'] = df_for_w2v['surname'].apply(lambda x: morph.tag(x)[0])

df_for_w2v['pymorphy_animacy'] = df_for_w2v['pymorphy'].apply(lambda x: x.animacy)
df_for_w2v['pymorphy_POS'] = df_for_w2v['pymorphy'].apply(lambda x: x.POS)
df_for_w2v['pymorphy_case'] = df_for_w2v['pymorphy'].apply(lambda x: x.case)
df_for_w2v['pymorphy_number'] = df_for_w2v['pymorphy'].apply(lambda x: x.number)
df_for_w2v['pymorphy_gender'] = df_for_w2v['pymorphy'].apply(lambda x: x.gender)

df_for_w2v['pymorphy_name_score'] = df_for_w2v['surname'].apply(name_score)
df_for_w2v['pymorphy_surn_score'] = df_for_w2v['surname'].apply(surn_score)

columns_to_one_hot = ['pymorphy' , 'pymorphy_animacy', 'pymorphy_POS', 'pymorphy_case','pymorphy_number', 'pymorphy_gender']

for col in columns_to_one_hot:
    df_for_w2v[col] = LabelEncoder().fit_transform(list(df_for_w2v[col].fillna('nan')))

## Признаки добавлены, теперь будем разделять выборку, чтобы строить модель.

Разделим выборку на Train и Test и перемешаем, так как при генерации признаков все объекты были в одном датасете.

In [0]:
df_1 = df_for_w2v[df_for_w2v['target']==1]
df_0 = df_for_w2v[df_for_w2v['target']==0]
df_without_1 = df_for_w2v[df_for_w2v['target'] != 1]
df_test = df_without_1[df_without_1['target'] != 0]
df_sorted_cut = pd.concat([df_1, df_0],
                          ignore_index=True).sample(frac=1)

In [0]:
features=list(df_sorted_cut.columns)[2:]

train=df_sorted_cut
train_full = df_sorted_cut[features]

test_full = df_test[features]

Выделим матрицу признаков и ответов:

In [0]:
X =[]
for name in features:
    X.append(df_sorted_cut[name].values)
X = np.array(X)
X = np.transpose(X)
y = df_sorted_cut['target'].values

Для оценки качества полученной модели разобъем Train на две подвыборки:

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

Попробуем определить базовый скор сначала при помощи простых алгоритмов машинного обучения

## Логистическая регрессия

In [0]:
log_reg = LogisticRegression(penalty='l2', n_jobs=4, random_state=42,
                             class_weight='balanced',
                             C = 0.1, max_iter=50, solver='sag')

In [0]:
log_reg.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='warn', n_jobs=4, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
log_reg_predictions = log_reg.predict(X_test)

In [0]:
metrics.roc_auc_score(y_test, log_reg_predictions)

0.771265054689698

Получили не такой уж плохой скор для логистической регрессии.

## Random Forest

In [0]:
rnd_frst = RandomForestClassifier(n_estimators=50, criterion='entropy')

In [0]:
rnd_frst.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
rnd_frst_predictions = rnd_frst.predict(X_test)

In [0]:
metrics.roc_auc_score(y_test, rnd_frst_predictions)

0.7422724018933353

Скор даже хуже, чем у лог. регрессии.

## Градиентные бустинги

Попробуем 3 самые популярные библиотеки для построения градиентного бустинга над решающими деревьями.

In [0]:
cat_features = np.where(train_full.dtypes == 'object')[0].tolist()

In [0]:
lgbm_cl = LGBMClassifier(n_estimators=670, num_leaves=65, max_depth=40)
xgb_cl = XGBClassifier(n_jobs =4, max_depth = 10 , n_estimators=670 , learning_rate=0.09 , colsample_bytree=0.9 , colsample_bylevel=0.6)
ctb = CatBoostClassifier(random_seed=17, iterations=2000, loss_function='CrossEntropy', eval_metric='AUC')

In [0]:
lgbm_cl.fit(X_train, y_train)
xgb_cl.fit(X_train, y_train)
ctb.fit(train_full, train['target'], cat_features=cat_features);

In [0]:
lgbm_cl_predictions = lgbm_cl.predict_proba(X_test)
xgb_cl_predictions = xgb_cl.predict_proba(X_test)
ctb_cl_predictions = ctb.predict_proba(X_test)

In [0]:
print(metrics.roc_auc_score(y_test, lgbm_cl_predictions[:,1]))
print(metrics.roc_auc_score(y_test, xgb_cl_predictions[:,1]))
print(metrics.roc_auc_score(y_test, ctb_cl_predictions[:,1]))

0.9455420921412435
0.9471737344833157
0.9769571956671415


Видно, что Catboost показывает лучший результат, поэтому будем далее использовать его.

## Проверим результаты на кросс-валидации

In [0]:
def validate_xgb(x , y):
    model = XGBClassifier(n_jobs=4,max_depth = 10 , n_estimators=670 , learning_rate=0.09 , colsample_bytree=0.9 , colsample_bylevel=0.6)
    cv = StratifiedKFold(4 ,shuffle=True, random_state=99)
    score = cross_val_score(model , x , y , scoring='roc_auc' , cv=cv)
    print (score.mean() , score.std() , '\n')

In [0]:
def validate_ctb(x , y):
  model = CatBoostClassifier(random_seed=17, iterations=2000, loss_function='CrossEntropy', eval_metric='AUC', silent=True)
  cv = StratifiedKFold(4 ,shuffle=True, random_state=99)
  score = cross_val_score(model , x , y , scoring='roc_auc' , cv=cv)
  print (score.mean() , score.std() , '\n')

In [0]:
validate_ctb(X, y)

0.952162849206362 0.0014979998972925233 



Результат на кросс-валидации не стал сильно хуже, что говорит о том, что модель не переобучена и всё хорошо.

# Запишем файл ответов

In [0]:
with open('xgb_boost_cv_proba_0.txt', 'w', encoding='utf8') as file_handler:
    file_handler.write('Id,Answer\n')
    for i in range(len(test_predictions_ans)):
        file_handler.write(str(i)+','+str(test_predictions_ans[i])+'\n')
file_handler.close()

In [0]:
sub = pd.read_csv('linear_ans_example.txt')
sub = sub.reset_index(drop=True)
sub['Answer'] = ctb.predict_proba(test_full)[:,1]
sub.to_csv('n_3__w2v_200__sg_1__2k_iters_so_manyy_dop_features.txt', columns=['Id', 'Answer'], index=False)