In [66]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [67]:
def visualize_result(y_true, y_predict, model):
    print(classification_report(y_true, y_predict))
    print()
    print(confusion_matrix(y_true, y_predict))
    print()
    print(roc_auc_score(y_true, model.predict_proba(X_test)[:, 1]))

In [68]:
X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv').iloc[:,0]
y_test = pd.read_csv('data/y_test.csv').iloc[:,0]

In [69]:
X_train.head(1)

Unnamed: 0,device_category_desktop,device_category_mobile,device_category_tablet,device_os_(not set),device_os_Windows,device_os_iOS,device_os_nan,device_browser_Mozilla Compatible Agent,device_browser_Safari (in-app),geo_country_Armenia,...,geo_city_customised_Prineville,geo_city_customised_Pyatigorsk,geo_city_customised_Sochi,geo_city_customised_Stavropol,geo_city_customised_Tomsk,geo_city_customised_Tyumen,geo_city_customised_Vladimir,geo_city_customised_Volgograd,geo_city_customised_Voronezh,geo_city_customised_Zheleznodorozhny
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
# I tried to balance data with SMOTE, but it shows worse result

'''from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=10, k_neighbors=5)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


X_train_res.shape, X_train.shape;''';

In [71]:
log_reg_model = LogisticRegression(penalty=None, class_weight='balanced', max_iter=100)
log_reg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [72]:
visualize_result(y_test, log_reg_model.predict(X_test), log_reg_model)

              precision    recall  f1-score   support

           0       0.98      0.57      0.72    409866
           1       0.04      0.67      0.08     12374

    accuracy                           0.57    422240
   macro avg       0.51      0.62      0.40    422240
weighted avg       0.96      0.57      0.70    422240


[[232548 177318]
 [  4028   8346]]
0.6757159153280995


In [73]:
pd.DataFrame(data=log_reg_model.coef_, columns=X_train.columns.to_numpy()).T.sort_values(by=0)

Unnamed: 0,0
utm_adcontent_customised_XKsYZiUFcdkUXQpoLKyS,-2.822335
geo_country_Ireland,-2.092340
geo_city_customised_Dublin,-2.061916
utm_adcontent_customised_TuyPWsGQruPMpKvRxeBF,-1.879777
utm_campaign_customised_IZEXUFLARCUMynmHNBGo,-1.781159
...,...
utm_campaign_customised_YCKgTzTDywjcWyQudGch,1.027166
utm_campaign_customised_bgTYkDHjOsJzMUtoGhiQ,1.044312
utm_adcontent_customised_xhoenQgDQsgfEPYNPwKO,1.176644
utm_adcontent_customised_PkybGvWbaqORmxjNunqZ,1.306275


In [74]:
# I tried to use SVC, but it takes too much time

#svc_model = SVC(kernel='linear', class_weight='balanced', max_iter=2000, probability=True)#
#svc_model.fit(X_train, y_train)

In [75]:
#visualize_result(y_test, svc_model.predict(X_test), svc_model)

In [76]:
rfc_model = RandomForestClassifier(max_depth=15, random_state=42)
rfc_model.fit(X_train, y_train)

In [77]:
visualize_result(y_test, rfc_model.predict(X_test), rfc_model)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99    409866
           1       0.00      0.00      0.00     12374

    accuracy                           0.97    422240
   macro avg       0.49      0.50      0.49    422240
weighted avg       0.94      0.97      0.96    422240


[[409866      0]
 [ 12374      0]]
0.6755761189023346


In [78]:
# I tried to adjust parameters, but it takes so much time for my laptop

'''max_depth_ar = list(range(1,25))
roc_auc_scores = []


for cur_max_depth in max_depth_ar:
    dt_model = DecisionTreeClassifier(max_depth=cur_max_depth)#random_state=22,
    dt_model.fit(X_train, y_train)
    roc_auc_scores.append(roc_auc_score(y_test, dt_model.predict_proba(X_test)[:, 1]))''';

In [79]:
'''sns.lineplot(x=max_depth_ar, y=roc_auc_scores)''';

In [80]:
dt_model = DecisionTreeClassifier(max_depth=10)
dt_model.fit(X_train, y_train)

In [81]:
visualize_result(y_test, dt_model.predict(X_test), dt_model)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99    409866
           1       0.25      0.00      0.00     12374

    accuracy                           0.97    422240
   macro avg       0.61      0.50      0.49    422240
weighted avg       0.95      0.97      0.96    422240


[[409805     61]
 [ 12354     20]]
0.6646810240868807


In [82]:
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

In [83]:
visualize_result(y_test, gnb_model.predict(X_test), gnb_model)

              precision    recall  f1-score   support

           0       0.99      0.14      0.25    409866
           1       0.03      0.96      0.06     12374

    accuracy                           0.17    422240
   macro avg       0.51      0.55      0.16    422240
weighted avg       0.96      0.17      0.24    422240


[[ 58593 351273]
 [   434  11940]]
0.557259116037255


In [89]:
# I just thought that we can combine two models and maybe get better result!

_ = (log_reg_model.predict_proba(X_test)[:, 1] + rfc_model.predict_proba(X_test)[:, 1] + dt_model.predict_proba(X_test)[:, 1])

y_predict = [1 if value >= 1.2 else 0 for value in _]

print(classification_report(y_test, y_predict))
print()
print(confusion_matrix(y_test, y_predict))
print()
print(roc_auc_score(y_test, _))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99    409866
           1       0.26      0.00      0.01     12374

    accuracy                           0.97    422240
   macro avg       0.61      0.50      0.50    422240
weighted avg       0.95      0.97      0.96    422240


[[409747    119]
 [ 12333     41]]

0.6782193061736599


In [85]:
# There we go! We have 0.68 Roc_auc
# Now I'm going to save these models and make server with predictions!

In [86]:
# Maybe you have questions about 'cross_validation' and why 
# I didn't use it cuz my computer will die. I repeated Teaching process many times with new data, and approximately I get same result!