In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import *
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from load_data import get_data

In [2]:
# Pull the modelling inputs from the project-local `load_data` helper (its
# implementation is not part of this notebook). Later cells use `x_train` /
# `y_train` as the features and target for cross-validation and fitting, and
# `x_pred` as the rows that receive final probability predictions.
x_train, y_train, x_pred = get_data()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  alldf = pd.concat([train_df,test_df])


Training set size: (24584, 57)


In [3]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet, Lars, LassoLars, OrthogonalMatchingPursuit, BayesianRidge, SGDClassifier

In [4]:
def train_model(model):
    """Cross-validate `model` on the notebook's training data and report its AUC.

    Runs 5-fold cross-validation (4 parallel jobs) of `model` against the
    module-level `x_train` / `y_train`, scored with ROC AUC.

    Parameters
    ----------
    model : estimator
        An unfitted scikit-learn style estimator.

    Returns
    -------
    The mean of the per-fold test ROC AUC values. The same value is also
    printed, prefixed with the estimator's type.
    """
    fold_scores = cross_validate(
        model,
        x_train,
        y_train,
        scoring=['roc_auc'],
        cv=5,
        n_jobs=4,
    )['test_roc_auc']
    auc = fold_scores.mean()
    print(f"{type(model)}: {auc}")
    return auc

In [5]:
%%time
models = [LogisticRegression(), RidgeClassifier(), Lasso(), ElasticNet(), Lars(), LassoLars(), OrthogonalMatchingPursuit(), BayesianRidge(), SGDClassifier()]
linera_models_res = {type(model): train_model(model) for model in models}

<class 'sklearn.linear_model._logistic.LogisticRegression'>: 0.8682482852413882
<class 'sklearn.linear_model._ridge.RidgeClassifier'>: 0.8674903268400881
<class 'sklearn.linear_model._coordinate_descent.Lasso'>: 0.5
<class 'sklearn.linear_model._coordinate_descent.ElasticNet'>: 0.5
<class 'sklearn.linear_model._least_angle.Lars'>: 0.6607482397230229
<class 'sklearn.linear_model._least_angle.LassoLars'>: 0.5
<class 'sklearn.linear_model._omp.OrthogonalMatchingPursuit'>: 0.8491304245196595
<class 'sklearn.linear_model._bayes.BayesianRidge'>: 0.8674832265732944
<class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>: 0.8517903788243746
Wall time: 5.7 s


In [6]:
from sklearn.neighbors import *

In [7]:
%%time
models = [KNeighborsClassifier()]
neighbor_models_res = {type(model): train_model(model) for model in models}

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>: 0.8254580488246859
Wall time: 16.3 s


In [8]:
from sklearn.tree import *

In [9]:
%%time
models = [DecisionTreeClassifier()]
tree_models_res = {type(model): train_model(model) for model in models}

<class 'sklearn.tree._classes.DecisionTreeClassifier'>: 0.7341944761862894
Wall time: 2.25 s


In [10]:
# Combine the per-family result dicts into one {estimator class: mean AUC}
# mapping. Later dicts take precedence on a key clash, in the order listed.
all_res = {
    **linera_models_res,
    **neighbor_models_res,
    **tree_models_res,
}

In [None]:
# Persist the baseline comparison: one row per estimator class with its AUC.
names = [*all_res.keys()]
aucs = [*all_res.values()]

pd.DataFrame(dict(name=names, auc=aucs)).to_csv('overview_auc.csv', index=False)

In [None]:
# Random-forest grid search: the number of trees varies, the depth is pinned
# to a single candidate value. Candidates are ranked by 5-fold ROC AUC.
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[14],
)
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
)

# fit() returns the search object itself, so printing `clf` afterwards shows
# the same repr as printing the return value of fit().
clf.fit(x_train, y_train)
print(clf)

In [None]:
# Report the candidate with the highest mean cross-validated score.
best_i = clf.cv_results_['mean_test_score'].argmax()
print(f"Best auc: {clf.cv_results_['mean_test_score'].max()}")
print(f"Best params: {clf.cv_results_['params'][best_i]}")

In [None]:
# `cv_results_['mean_test_score']` is a flat 1-D array with one entry per
# candidate, which plt.imshow rejects as an invalid image shape. Reshape it
# onto the parameter grid first. GridSearchCV enumerates candidates with the
# parameter names sorted alphabetically and the last name varying fastest, so
# rows correspond to `max_depth` and columns to `n_estimators`.
depth_values = param_grid['max_depth']
tree_counts = param_grid['n_estimators']
score_grid = np.reshape(
    clf.cv_results_['mean_test_score'],
    (len(depth_values), len(tree_counts)),
)

fig, ax = plt.subplots()
heatmap = ax.imshow(score_grid, aspect='auto')
ax.set_xticks(np.arange(len(tree_counts)))
ax.set_xticklabels(tree_counts)
ax.set_yticks(np.arange(len(depth_values)))
ax.set_yticklabels(depth_values)
ax.set_xlabel('n_estimators')
ax.set_ylabel('max_depth')
ax.set_title('Mean cross-validated ROC AUC')
colorbar = fig.colorbar(heatmap, ax=ax)
colorbar.set_label('mean test ROC AUC')
plt.show()

In [None]:
# Display the parameter setting of every evaluated candidate, in the same
# order as the entries of cv_results_['mean_test_score'].
clf.cv_results_['params']

In [None]:
# Second-column probabilities from the fitted search object, scored against
# the training labels. These are the same rows the search was fitted on, so
# this is an in-sample AUC, not a held-out estimate.
pred = clf.predict_proba(x_train)[:, 1]
roc_auc_score(y_true=y_train, y_score=pred)

In [None]:
# Sanity check that labels and predictions have matching shapes.
# The builtin `int` replaces `np.int`, which was only an alias for it; the
# alias was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError.
y_train.to_numpy().astype(int).shape, pred.shape

In [None]:
# aucs = []
# max_depths = []
# for max_depth in range(2,16):
#     cv_results = cross_validate(RandomForestClassifier(max_depth=max_depth),
#                                 x_train, y_train,
#                                 scoring=['roc_auc', 'accuracy'], cv=5)
#     auc = cv_results['test_roc_auc'].mean()
#     aucs.append(auc)
#     max_depths.append(max_depth)
#     print(f"Fa melyseg: {max_depth}\t{auc},\t{cv_results['test_accuracy'].mean()}")

In [None]:
# 5-fold cross-validation of a single depth-14 decision tree, using every
# available core; prints the mean ROC AUC followed by the mean accuracy.
cv_results = cross_validate(
    estimator=DecisionTreeClassifier(max_depth=14),
    X=x_train,
    y=y_train,
    scoring=['roc_auc', 'accuracy'],
    cv=5,
    n_jobs=-1,
)
auc = cv_results['test_roc_auc'].mean()
print(f"\t{auc},\t{cv_results['test_accuracy'].mean()}")

In [None]:
# plt.plot(max_depths, aucs)
# plt.show()
# print(f"Best max_depth: {max_depths[np.argmax(aucs)]}")
# print(f"Best auc: {np.max(aucs)}")

In [None]:
# Final model: extra-trees ensemble fitted on the full training set, then used
# to produce class probabilities for the rows to predict.
# NOTE(review): the grid search above tuned a RandomForestClassifier with at
# most 100 trees, while this cell fits an ExtraTreesClassifier with 1000;
# confirm that this combination is the intended submission model.
model = ExtraTreesClassifier(n_estimators=1000, max_depth=14)
# fit() returns the estimator, so the prediction can be chained onto it.
pred = model.fit(x_train, y_train).predict_proba(x_pred)
# accuracy_score(ismert_df[target],model.predict(ismert_df[bemeno_valtozok]))

In [None]:
# Build the submission table: session ids of the rows flagged with
# test_or_train_flag == 1, paired with the second-column probability.
# NOTE(review): `cust_df` is not defined in any cell visible in this notebook
# (get_data() only returns x_train, y_train, x_pred), so this cell raises a
# NameError on a fresh kernel unless it is created elsewhere - confirm where
# it comes from and that its flagged rows are in the same order as `x_pred`.
res_df = pd.DataFrame(
    dict(
        session_id=cust_df.loc[cust_df['test_or_train_flag'] == 1, 'session_id'],
        prob=pred[:, 1],
    )
)
res_df.to_csv('res.csv', index=False)