In [39]:
import numpy as np
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import time
from statistics import pstdev
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import *
from sklearn.dummy import DummyClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GroupKFold

In [2]:
h5 = h5py.File('data/SG24_dataset.h5')

In [3]:
print(list(h5.keys()))
print(h5['Predictors'])
print(h5['Target'])
print(h5['User'])

['Predictors', 'Target', 'User']
<HDF5 dataset "Predictors": shape (29, 2400), type "<f8">
<HDF5 dataset "Target": shape (1, 2400), type "|i1">
<HDF5 dataset "User": shape (1, 2400), type "|i1">


In [4]:
predictors = np.asarray(h5['Predictors'])
pred = pd.DataFrame(predictors)

In [5]:
target = np.asarray(h5['Target'])
tar = pd.DataFrame(target[0])

In [6]:
users = np.asarray(h5['User'])
user = pd.DataFrame(users[0])

In [7]:
predictors = predictors[3:]
normalized_predictors = (predictors - np.mean(predictors, axis = 0))/ np.std(predictors, axis = 0)
final_predictors = normalized_predictors.transpose()
target = target[0]
print(final_predictors.shape)
print(target.shape)

(2400, 26)
(2400,)


# train aléa / test aléa

In [8]:
x_train,x_test,y_train,y_test=train_test_split(final_predictors, target, test_size=0.15, shuffle=True)

## XGBoost

In [15]:
GB = GradientBoostingClassifier(loss='deviance',max_features='log2',n_estimators=60,max_depth=8)

pgrid = {
    "learning_rate": [0.075, 0.08,0.09,0.1, 0.15],
    "min_samples_split": [0.1,0.2,0.3,0.4,0.5],
    "min_samples_leaf": [0.1,0.2,0.3,0.4,0.5]
    }

grid_search = GridSearchCV(GB, param_grid=pgrid, cv=7, scoring='accuracy')
grid_search.fit(x_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.9406981123193523
{'learning_rate': 0.15, 'min_samples_leaf': 0.1, 'min_samples_split': 0.2}


In [24]:
GB = GradientBoostingClassifier(loss='deviance',max_features='log2',n_estimators=60,max_depth=8, learning_rate=0.15, min_samples_leaf=0.1, min_samples_split=0.2)
start_time = time.time()
GB.fit(x_train, y_train)
print("train time : ",time.time() - start_time)

start_time = time.time()
yhat_rf=GB.predict(x_test)
print("test time : ",time.time() - start_time)

print("accuracy train : ",accuracy_score(y_train,GB.predict(x_train)))
print("accuracy test : ",accuracy_score(y_test,yhat_rf))

train time :  9.433748722076416
test time :  0.024954795837402344
accuracy train :  1.0
accuracy test :  0.9333333333333333


# Bagging

In [26]:
Ba = BaggingClassifier(n_estimators = 100, max_features = 0.5)

pgrid = {
    'bootstrap': [True, False],
    'bootstrap_features': [True, False], 
    'max_samples' : [0.6, 0.8, 1.0]
    }

grid_search = GridSearchCV(Ba, param_grid=pgrid, cv=7, scoring='accuracy')
grid_search.fit(x_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.9455955239036725
{'bootstrap': False, 'bootstrap_features': False, 'max_samples': 1.0}


In [29]:
Ba=BaggingClassifier(bootstrap=False, bootstrap_features=False, max_samples=1.0, n_estimators = 100, max_features = 0.5)
start_time = time.time()
Ba.fit(x_train, y_train)
print("train time : ",time.time() - start_time)

start_time = time.time()
yhat_rf=Ba.predict(x_test)
print("test time : ",time.time() - start_time)

print("accuracy train : ",accuracy_score(y_train,Ba.predict(x_train)))
print("accuracy test : ",accuracy_score(y_test,yhat_rf))

train time :  5.10634708404541
test time :  0.036031484603881836
accuracy train :  1.0
accuracy test :  0.9472222222222222


# test by users train aléa

In [43]:
def experience_by_user(user):
    l=np.where(np.array(users[0]) == user)[0]
    x_train=[final_predictors[i] for i in range(len(final_predictors)) if i not in l]
    print("taille x_train : ",len(x_train))
    y_train=[target[i] for i in range(len(final_predictors)) if i not in l]
    x_test=[final_predictors[i] for i in l]
    print("taille x_test : ",len(x_test))
    y_test=[target[i] for i in l]
    
    print("\n----------------- Boosting -----------------")
    GB = GradientBoostingClassifier(loss='deviance',max_features='log2',n_estimators=60,max_depth=8, learning_rate=0.15, min_samples_leaf=0.1, min_samples_split=0.2)
    start_time = time.time()
    GB.fit(x_train, y_train)
    print("train time : ",time.time() - start_time)
    start_time = time.time()
    yhat_rf=GB.predict(x_test)
    print("test time : ",time.time() - start_time)
    print("accuracy train : ",accuracy_score(y_train,GB.predict(x_train)))
    print("accuracy test : ",accuracy_score(y_test,yhat_rf))
    
    print("\n----------------- Bagging -----------------")
    Ba=BaggingClassifier(bootstrap=False, bootstrap_features=False, max_samples=1.0, n_estimators = 100, max_features = 0.5)
    start_time = time.time()
    Ba.fit(x_train, y_train)
    print("train time : ",time.time() - start_time)
    start_time = time.time()
    yhat_rf=Ba.predict(x_test)
    print("test time : ",time.time() - start_time)
    print("accuracy train : ",accuracy_score(y_train,Ba.predict(x_train)))
    print("accuracy test : ",accuracy_score(y_test,yhat_rf))

In [44]:
experience_by_user(1)

taille x_train :  1800
taille x_test :  600

----------------- Boosting -----------------
train time :  10.79246211051941
test time :  0.038450002670288086
accuracy train :  1.0
accuracy test :  0.85

----------------- Bagging -----------------
train time :  5.529829978942871
test time :  0.05118417739868164
accuracy train :  1.0
accuracy test :  0.8416666666666667


In [34]:
experience_by_user(8)

taille x_train :  2280
taille x_test :  120

----------------- Boosting -----------------
train time :  13.385536432266235
test time :  0.010379791259765625
accuracy train :  1.0
accuracy test :  0.8

----------------- Bagging -----------------
train time :  7.28057861328125
test time :  0.033692359924316406
accuracy train :  1.0
accuracy test :  0.8666666666666667


# test by users train by users

In [37]:
l=np.where(np.array(users[0]) == 1)[0]
x_train=[final_predictors[i] for i in range(len(final_predictors)) if i not in l]
print("taille x_train : ",len(x_train))
y_train=[target[i] for i in range(len(final_predictors)) if i not in l]
x_test=[final_predictors[i] for i in l]
print("taille x_test : ",len(x_test))
y_test=[target[i] for i in l]
user_train = [users[0][i] for i in range(len(final_predictors)) if i not in l]
user_test = [users[0][i] for i in l]

taille x_train :  1800
taille x_test :  600


In [40]:
Ba = BaggingClassifier(n_estimators = 100, max_features = 0.5)
group_kfold = GroupKFold(n_splits=7)
iterator = group_kfold.split(x_train, y_train, user_train) 

pgrid = {
    'bootstrap': [True, False],
    'bootstrap_features': [True, False], 
    'max_samples' : [0.6, 0.8, 1.0]
    }

grid_search = GridSearchCV(Ba, param_grid=pgrid, cv=7, scoring='accuracy')
grid_search.fit(x_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.9388743100171932
{'bootstrap': True, 'bootstrap_features': False, 'max_samples': 0.6}
train time :  5.701731443405151
test time :  0.050951480865478516
accuracy train :  1.0
accuracy test :  0.8033333333333333


In [41]:
Ba=BaggingClassifier(bootstrap=True, bootstrap_features=False, max_samples=0.6, n_estimators = 100, max_features = 0.5)
start_time = time.time()
Ba.fit(x_train, y_train)
print("train time : ",time.time() - start_time)

start_time = time.time()
yhat_rf=Ba.predict(x_test)
print("test time : ",time.time() - start_time)

print("accuracy train : ",accuracy_score(y_train,Ba.predict(x_train)))
print("accuracy test : ",accuracy_score(y_test,yhat_rf))

train time :  2.4116711616516113
test time :  0.0488889217376709
accuracy train :  0.9988888888888889
accuracy test :  0.8133333333333334
