### Import libraries

In [None]:
import os, sys
print("Current working dir : %s" % os.getcwd())
import pandas as pd
import joblib # save/load models
import pickle # save/load features

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn import svm, datasets
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report, accuracy_score, balanced_accuracy_score, confusion_matrix
from scipy import interp

import warnings
warnings.filterwarnings('ignore')

### Functions used for experiments

In [None]:
def preprocessing(df):
    """Drop incomplete rows and remove classes that have fewer than two samples.

    Args:
        df: DataFrame with an ``act`` column holding the class label.

    Returns:
        A new DataFrame without rows that contain missing values and without
        rows whose ``act`` class occurs fewer than two times after that
        filtering. The frame passed in is not modified.
    """
    df = df.dropna()

    # Count samples per class once instead of re-filtering the frame per class.
    # Counting rows of "act" directly avoids relying on a helper "index" column.
    class_counts = df["act"].value_counts()
    rare_labels = set(class_counts[class_counts < 2].index)

    # Walk the classes in order of first appearance so the report order is stable.
    removed = [label for label in df["act"].unique() if label in rare_labels]
    for label in removed:
        # Show the rows that are about to be discarded.
        print(df.loc[df["act"] == label, "act"])

    # Boolean-mask filtering instead of an in-place drop; copy() returns an
    # independent frame that callers can modify safely.
    return df[~df["act"].isin(removed)].copy()

In [None]:
def calculate_class_weight(y_train):
    """Return 'balanced' sample weights for the given training labels.

    Despite the function name, the result comes from scikit-learn's
    ``compute_sample_weight`` and is suitable for passing as
    ``sample_weight`` when fitting on an unbalanced dataset.
    """
    return compute_sample_weight(y=y_train, class_weight='balanced')

In [None]:
def create_model(n_classes):
    """Build an XGBoost classifier configured for multi-class probabilities.

    Args:
        n_classes: number of target classes, forwarded as ``num_class``.

    Returns:
        An unfitted ``XGBClassifier`` using the ``multi:softprob`` objective.
    """
    xgb_settings = {
        "objective": 'multi:softprob',
        "num_class": n_classes,
    }
    return XGBClassifier(**xgb_settings)

In [None]:
def create_grid_searchCV(model):
    """Wrap ``model`` in a grid search over tree depth and child weight.

    Args:
        model: the estimator to tune.

    Returns:
        An unfitted ``GridSearchCV`` with ``cv=10`` and ``n_jobs=-1``.
    """
    # Candidate values; single-element lists pin a parameter to one value.
    search_space = dict(
        max_depth=[4, 5, 6],
        min_child_weight=[4, 5, 6],
        early_stopping_rounds=[10],
        scale_pos_weight=[1],
    )

    # n_jobs=-1 asks scikit-learn to use all available processors.
    return GridSearchCV(
        estimator=model,
        param_grid=search_space,
        cv=10,
        n_jobs=-1,
    )

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Compute ROC AUC for multi-class labels by binarizing both inputs.

    Both the true labels and the predicted labels are converted to indicator
    form with a ``LabelBinarizer`` fitted on ``y_test``, then passed to
    ``roc_auc_score``. Because ``y_pred`` holds hard labels (not
    probabilities), the score is computed from 0/1 indicators.

    Args:
        y_test: true class labels.
        y_pred: predicted class labels.
        average: averaging mode forwarded to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    # Import the class directly: importing the ``preprocessing`` module here
    # would shadow the same-named function in this file within this scope.
    from sklearn.preprocessing import LabelBinarizer

    binarizer = LabelBinarizer()
    # The set of classes is defined by the true labels only.
    binarizer.fit(y_test)
    y_test_bin = binarizer.transform(y_test)
    y_pred_bin = binarizer.transform(y_pred)
    return roc_auc_score(y_test_bin, y_pred_bin, average=average)

In [None]:
def evaluate_predictions(y_test, predictions):
    """Print accuracy, balanced accuracy, ROC AUC and a classification report.

    Args:
        y_test: true class labels.
        predictions: predicted class labels.

    Returns:
        None. All results are printed to stdout.
    """
    acc_pct = accuracy_score(y_test, predictions) * 100.0
    print("Accuracy: %.2f%%" % acc_pct)

    balanced_pct = balanced_accuracy_score(y_test, predictions) * 100.0
    print("balanced_accuracy: %.2f%%" % balanced_pct)

    # Local name deliberately differs from the imported ``roc_auc_score``.
    auc_pct = multiclass_roc_auc_score(y_test, predictions) * 100.0
    print("roc_auc_score: %.2f%%" % auc_pct)

    report = classification_report(y_test, predictions)
    print(report)

### Train a model for each user

In [None]:
# User ids to process: 112 through 184 inclusive, matching the user_id values
# used in the per-user CSV file names.
# NOTE(review): the original carried a bare "40" remark here — presumably the
# number of users that actually have data in this range; TODO confirm.
users = list(range(112, 185))
# Alternative id range kept for reference: 203 through 257 inclusive.

In [None]:
# Hoisted out of the loop body: the import only needs to run once.
from collections import Counter

# Train, evaluate and persist one grid-searched model per user id.
for user in users:

    print("------------------------", user, "------------------------")

    # read data from a defined user
    user_id = user
    data = pd.read_csv("users-feats/train_data_imwut20_"+str(user_id)+".csv")
    data.time = pd.to_datetime(data.time)
    # reset_index() inserts the old row index as the first column ("index"),
    # so the positional feature slice below depends on this call.
    data = data.reset_index()

    # preprocessing: drop incomplete rows and under-represented classes
    df = preprocessing(data)

    # Compute the class list once and reuse it for the count and the log lines.
    class_names = df.act.unique()
    n_classes = len(class_names)

    print("act classes: ", class_names)
    print("number of classes: ", n_classes)

    # Features are taken by position (columns 2..13); the target is "act".
    X = df.iloc[:, 2:14]
    y = df.act

    print("X shape: ", X.shape)
    print("y shape: ", y.shape)
    # Per-class sample counts; not used below, kept available for inspection.
    counter = Counter(y)

    # shuffle and split: 10% held out for testing, then 10% of the remainder
    # held out as the validation set used for early stopping
    random_state = 42
    training_features, test_features, training_target, test_target = train_test_split(
        X, y, test_size=.1, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(
        training_features, training_target, test_size=.1, random_state=random_state)

    # create evaluation dataset
    eval_set = [(X_val, y_val)]

    # per-sample weights that compensate for class imbalance
    class_weight = calculate_class_weight(y_train)

    # create model and wrap it in the grid search
    mcl = create_model(n_classes)
    mcl = create_grid_searchCV(mcl)

    # extra arguments forwarded to the estimator's fit()
    fit_params = {"sample_weight": class_weight, "eval_set": eval_set, "verbose": False}

    mcl.fit(X_train, y_train, **fit_params)

    y_pred = mcl.best_estimator_.predict(test_features)
    # NOTE(review): predict() presumably already returns class labels, which
    # would make this rounding redundant — TODO confirm before removing.
    predictions = [round(value) for value in y_pred]

    # eval
    evaluate_predictions(test_target, predictions)

    # save the fitted grid-search object (not only the best estimator) to disk
    filename = "users-models/model_user_"+str(user_id)+".sav"
    joblib.dump(mcl, filename)

    # Save the data splits and class metadata next to the model.
    filename_objs = "users-models/model_user_"+str(user_id)+".pkl"

    with open(filename_objs, 'wb') as f:
        pickle.dump([training_features, test_features, training_target, test_target,
                     X_train, X_val, y_train, y_val,
                     class_weight, class_names, n_classes], f)

    print("------------------------------------------------")