In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

Загрузим часть данных без отложенной выборки.

In [2]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [3]:
X_train.shape

(30000, 230)

In [4]:
y_train.shape

(30000, 1)

Разобьем на числовые и категориальные признаки

In [5]:
numeric_data = X_train.iloc[:, :190].fillna(0)
categorial_data = X_train.iloc[:, 190:].fillna('0')

Преобразуем категориальные данные методом LabelEncoding, т.к. различных значений категориальных признаков огромное количество, и OneHotEncoding дает около 50к новых столбцов

In [6]:
categorial_data_encode = categorial_data.apply(LabelEncoder().fit_transform)

In [7]:
categorial_data_encode.head()

Unnamed: 0,Var191,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,0,194,24,0,20,0,20,3107,307,7420,...,4,1586,2,0,1,19,2,8,1,0
1,0,123,24,0,20,0,123,2197,633,0,...,4,2458,1,0,0,18,2,8,0,0
2,0,124,0,2,20,0,113,559,2081,4529,...,4,1048,1,0,2,19,2,2,2,0
3,0,212,24,0,20,0,210,360,779,0,...,4,658,1,0,0,4,2,8,0,0
4,0,222,24,0,20,0,112,2107,324,0,...,4,770,1,0,0,10,2,8,0,0


Объединим с числовыми признаками

In [8]:
X_train_encode = pd.concat([numeric_data, categorial_data_encode], axis=1)

In [9]:
X_train_encode.shape

(30000, 230)

In [10]:
X_train_encode.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,0.0,0.0,0.0,0.0,0.0,7553.0,7.0,0.0,0.0,0.0,...,4,1586,2,0,1,19,2,8,1,0
1,0.0,0.0,0.0,0.0,0.0,273.0,0.0,0.0,0.0,0.0,...,4,2458,1,0,0,18,2,8,0,0
2,0.0,0.0,0.0,0.0,0.0,1820.0,7.0,0.0,0.0,0.0,...,4,1048,1,0,2,19,2,2,2,0
3,0.0,0.0,0.0,0.0,0.0,1001.0,0.0,0.0,0.0,0.0,...,4,658,1,0,0,4,2,8,0,0
4,0.0,0.0,0.0,0.0,0.0,49.0,0.0,0.0,0.0,0.0,...,4,770,1,0,0,10,2,8,0,0


Обучим SGDClassifier сначала по полной выборке, потом отдельно по числовым признакам, чтобы глянуть разницу.

In [11]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state = 17)

In [12]:
sgd_logit = SGDClassifier(random_state=17)

In [13]:
%%time
cv_scores = cross_val_score(sgd_logit, numeric_data, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
print('ROC-AUC', cv_scores.mean())
cv_scores = cross_val_score(sgd_logit, numeric_data, y_train, cv=cv, scoring='average_precision', n_jobs=-1)
print('average_precision', cv_scores.mean())

ROC-AUC 0.49371564938827894
average_precision 0.07752446802718835
Wall time: 4.46 s


In [14]:
%%time 
cv_scores2 = cross_val_score(sgd_logit, X_train_encode, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
print('ROC-AUC', cv_scores2.mean())
cv_scores2 = cross_val_score(sgd_logit, X_train_encode, y_train, cv=cv, scoring='average_precision', n_jobs=-1)
print('average_precision', cv_scores2.mean())

ROC-AUC 0.49743423722830704
average_precision 0.07792582489816416
Wall time: 3.8 s


Как видим, кодированные категориальные признаки пусть и незначительно, но увеличивают качество, которое конечно очень низкое.

Обучим Ridge классификатор

In [15]:
logit = Ridge()

In [16]:
%%time 
cv_scores3 = cross_val_score(logit, X_train_encode, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
print('ROC-AUC', cv_scores3.mean())
cv_scores3 = cross_val_score(logit, X_train_encode, y_train, cv=cv, scoring='average_precision', n_jobs=-1)
print('average_precision', cv_scores3.mean())

ROC-AUC 0.6528557738172904
average_precision 0.1360482890042249
Wall time: 1.12 s


Обучим случайный лес

In [17]:
tree = RandomForestClassifier()

In [18]:
%%time 
cv_scores4 = cross_val_score(tree, X_train_encode, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
print('ROC-AUC', cv_scores4.mean())
cv_scores4 = cross_val_score(tree, X_train_encode, y_train, cv=cv, scoring='average_precision', n_jobs=-1)
print('average_precision', cv_scores4.mean())

ROC-AUC 0.6653677345193806
average_precision 0.16239361502735744
Wall time: 22.1 s


In [19]:
xgboost = xgb.XGBClassifier() 

In [20]:
%%time 
cv_scores5 = cross_val_score(xgboost, X_train_encode, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
print('ROC-AUC', cv_scores5.mean())
cv_scores5 = cross_val_score(xgboost, X_train_encode, y_train, cv=cv, scoring='average_precision', n_jobs=-1)
print('average_precision', cv_scores5.mean())

ROC-AUC 0.6947377050813417
average_precision 0.17933245087280883
Wall time: 49.8 s


Как видим по результатам - стохастический градиентный спуск вообще не годится на наших данных, остальные дают примерно похожие результаты, градиентный бустинг показал себя лучше всех.

In [21]:
xgboost2 = xgb.XGBRegressor() 

In [22]:
%%time 
cv_scores5 = cross_val_score(xgboost2, X_train_encode, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
print('ROC-AUC', cv_scores5.mean())
cv_scores5 = cross_val_score(xgboost2, X_train_encode, y_train, cv=cv, scoring='average_precision', n_jobs=-1)
print('average_precision', cv_scores5.mean())

ROC-AUC 0.6748562369125879
average_precision 0.1626396315955267
Wall time: 51.8 s
