In [47]:
import pandas as pd
import numpy as np

import time

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import normalize, Normalizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def calculate_metrics(y_test, y_pred, target, model_name):
    horizons = y_test.columns.values
  
    metrics = pd.DataFrame(classification_report(y_test.squeeze(), y_pred, output_dict=True))
    index = [np.array([model_name for i in range(4)]), metrics.index]
    metrics = metrics.set_index(index)

    return metrics

In [0]:
def execute_baseline(X_train, y_train, X_test, y_test, models, target):
    
    test_metrics_global = None
    train_metrics_global = None

    for name, model in models.items():
        
        best_model = None
        last_metric = float('-inf')
        print("Training "+name+"....")

        start_time = time.time()
        rkf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2)
        for train_index, test_index in rkf.split(X_train, y_train):
            X_train_fold, X_test_fold = X_train.iloc[train_index,:], X_train.iloc[test_index,:]
            y_train_fold, y_test_fold = y_train.iloc[train_index,:], y_train.iloc[test_index,:]
            model.fit(X_train_fold,y_train_fold)
            metric = calculate_metrics(y_test_fold, model.predict(X_test_fold), target, name)["weighted avg"][2]
            if metric > last_metric:
                best_model = model
        print("--- %s seconds ---" % (time.time() - start_time))

        test_pred = best_model.predict(X_test)

        test_metrics = calculate_metrics(y_test, test_pred, target, name)
        
        if test_metrics_global is None:
            test_metrics_global = test_metrics

        else:
            test_metrics_global = test_metrics_global.append(test_metrics)
        
    return test_metrics_global
            

In [0]:
def execute(X_train, y_train, X_test, y_test, models, target="label"):

    print("============== "+target+" ==============")
    
    test_metrics= execute_baseline(X_train, y_train, X_test, y_test, models, target)
    test_metrics.to_pickle("drive/My Drive/Master/BigData/RecSysProduct/metrics/train_metrics_"+target)

    return test_metrics


In [0]:
X_train = pd.read_csv("drive/My Drive/Master/BigData/RecSysProduct/training_data/X_train.csv")
y_train = pd.read_csv("drive/My Drive/Master/BigData/RecSysProduct/training_data/y_train.csv")

X_test = pd.read_csv('drive/My Drive/Master/BigData/RecSysProduct/training_data/X_test.csv')
y_test = pd.read_csv('drive/My Drive/Master/BigData/RecSysProduct/training_data/y_test.csv')

X_train.drop(["idx", "man"], axis=1, inplace=True)
X_test.drop(["idx", "man"], axis=1, inplace=True)

y_train.drop(["idx"], axis=1, inplace=True)
y_test.drop(["idx"], axis=1, inplace=True)

X_train_sample_cut =int( X_train.shape[0]*0.1)
X_test_sample_cut =int(X_train_sample_cut*0.30)

In [0]:
# Decission tree
#distributions_dt = dict(max_depth=list(range(2,16)))
dt_model = DecisionTreeClassifier(max_depth=5)

# AdaBoost with Decission tree
#distributions_ada = {'estimator__base_estimator__max_depth':list(range(2,16))}
ada_cls_model = AdaBoostClassifier(DecisionTreeClassifier(),
                          n_estimators=300)

# Gradient boosting
grboost_model = GradientBoostingClassifier(n_estimators=300,loss='deviance', learning_rate=0.1,
                                                               max_depth=5)

# Gaussian process
kernel = 1.0 * RBF(1.0)
gausspr_model = GaussianProcessClassifier(kernel=kernel)

#Random forest
rand_forest_model = RandomForestClassifier(n_estimators=300,max_depth=5)

et_model = ExtraTreesClassifier(n_estimators=300)

neigh_model = KNeighborsClassifier(n_neighbors=5, weights="distance")

nn_model =  MLPClassifier(hidden_layer_sizes=(100,), learning_rate="adaptive", early_stopping=True)


models = {
    #"decision_tree_classifier": dt_model,
    #"adaboost_tree_classifier": ada_cls_model,
    #"gradient_boost": grboost_model,
    #"random_forest": rand_forest_model,
    "extra_trees": et_model,
    "neural_network": nn_model,
    "knn":neigh_model
}

In [0]:
et_model = ExtraTreesClassifier(n_estimators=300)
model = SelectFromModel(et_model).fit(X_train.iloc[:X_train_sample_cut,:], y_train.iloc[:X_train_sample_cut,:])

In [49]:
%%time
#selected_columns = X_train.columns[model.estimator_.feature_importances_ > model.threshold_]
#X_train_selected = X_train[selected_columns]
#X_test_selected = X_test[selected_columns]

execute(X_train.iloc[:X_train_sample_cut,:], y_train.iloc[:X_train_sample_cut,:], X_test.iloc[:X_test_sample_cut,:], y_test.iloc[:X_test_sample_cut,:], models)

Training extra_trees....




--- 55.719656229019165 seconds ---
Training neural_network....


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


--- 9.240875005722046 seconds ---
Training knn....




--- 99.68665671348572 seconds ---
CPU times: user 3min 7s, sys: 20.4 s, total: 3min 27s
Wall time: 3min


Unnamed: 0,Unnamed: 1,0,1,accuracy,macro avg,weighted avg
extra_trees,precision,0.741414,0.738979,0.740299,0.740197,0.740249
extra_trees,recall,0.770567,0.707337,0.740299,0.738952,0.740299
extra_trees,f1-score,0.755709,0.722812,0.740299,0.739261,0.739961
extra_trees,support,5239.0,4811.0,0.740299,10050.0,10050.0
neural_network,precision,0.762057,0.784022,0.771741,0.773039,0.772572
neural_network,recall,0.817332,0.722095,0.771741,0.769713,0.771741
neural_network,f1-score,0.788727,0.751785,0.771741,0.770256,0.771043
neural_network,support,5239.0,4811.0,0.771741,10050.0,10050.0
knn,precision,0.617677,0.57289,0.595025,0.595283,0.596237
knn,recall,0.585608,0.60528,0.595025,0.595444,0.595025


In [50]:
pd.read_pickle('drive/My Drive/Master/BigData/RecSysProduct/metrics/train_metrics_label_no_selection')

Unnamed: 0,Unnamed: 1,0,1,accuracy,macro avg,weighted avg
decision_tree_classifier,precision,0.643171,0.670335,0.653731,0.656753,0.656175
decision_tree_classifier,recall,0.754152,0.544377,0.653731,0.649265,0.653731
decision_tree_classifier,f1-score,0.694254,0.600826,0.653731,0.64754,0.649529
decision_tree_classifier,support,5239.0,4811.0,0.653731,10050.0,10050.0
adaboost_tree_classifier,precision,0.62296,0.590795,0.607662,0.606878,0.607562
adaboost_tree_classifier,recall,0.626646,0.586988,0.607662,0.606817,0.607662
adaboost_tree_classifier,f1-score,0.624798,0.588885,0.607662,0.606842,0.607606
adaboost_tree_classifier,support,5239.0,4811.0,0.607662,10050.0,10050.0
gradient_boost,precision,0.759933,0.751546,0.75602,0.75574,0.755918
gradient_boost,recall,0.777629,0.732488,0.75602,0.755059,0.75602


In [0]:
nn_model =  MLPClassifier(hidden_layer_sizes=(100,), learning_rate="adaptive", early_stopping=True)