In [1]:
# import pandas for data wrangling
import pandas as pd


# import numpy for Scientific computations
import numpy as np


# import machine learning libraries
import xgboost as xgb
from sklearn.metrics import accuracy_score


# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Any results you write to the current directory are saved as output.

In [2]:
# labels:
# 0 = failing
# 1 = stable
# 2 = growing

def get_label_numeric(data):
    """Encode the textual labels of ``data`` as integers.

    The mapping is 'growing' -> 2, 'stable' -> 1 and any other value -> 0.
    Labels are read through ``get_label(data)`` (one row per case) and the
    result is returned as a plain list in the same order.
    """
    encoded = []
    for label in get_label(data):
        if label == 'growing':
            encoded.append(2)
        elif label == 'stable':
            encoded.append(1)
        else:
            encoded.append(0)
    return encoded

def get_label(data):
    """Return the ``"label"`` entry of ``data`` unchanged."""
    label_key = "label"
    return data[label_key]



In [3]:
def read_data(file):
    """Load one dataset CSV, drop metadata columns and sort it by quarter.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's rows without the columns 'Registration_date',
        'EMTAK_by_number', 'Client_ID_by_Eesti_Pank' and (when present)
        'Unnamed: 0', sorted by 'year_quarter'. The index is not reset.

    Raises
    ------
    KeyError
        If one of the three required columns or 'year_quarter' is missing.
    """
    df_all_v1 = pd.read_csv(file)

    # These three columns are expected in every file; a missing one is a
    # schema problem and should surface as a KeyError rather than be hidden.
    df_all_v1 = df_all_v1.drop(
        columns=['Registration_date', 'EMTAK_by_number', 'Client_ID_by_Eesti_Pank'])

    # 'Unnamed: 0' is only present in some files (typically an index column
    # written by an earlier to_csv call), so it is dropped only if it exists.
    # This replaces a bare `except:` that also swallowed unrelated errors.
    df_all_v1 = df_all_v1.drop(columns=['Unnamed: 0'], errors='ignore')

    df_all_v1 = df_all_v1.sort_values(by=['year_quarter'])

    return df_all_v1

In [4]:
# CSV snapshots to load: two versions each of the full, before and during sets.
files = [
    './data/full_v1.csv',
    './data/full_v2.csv',
    './data/before_v1.csv',
    './data/before_v2.csv',
    './data/during_v1.csv',
    './data/during_v2.csv',
]

# Map each file's base name (e.g. 'during_v1.csv') to its cleaned DataFrame.
# After the loop, `file`, `name` and `df` stay bound to the last entry.
dfs = {}
for file in files:
    name = file.split('/')[-1]
    df = read_data(file)
    dfs[name] = df

In [5]:
# Prepare the 'during_v1' dataset: drop the identifier and time columns and
# encode the label as 0 = failing, 1 = stable, 2 = growing.
data = dfs['during_v1.csv'].reset_index(drop=True)
data = data.drop(['Registration_number', 'year_quarter'], axis=1)
data['label'] = get_label_numeric(data)

# Positional split: the first 30% of the rows form train_df, the rest test_df.
# The cut point is computed from `data` itself; the previous code used
# len(df), i.e. the length of whichever file the loading loop read last.
# NOTE(review): 30% train / 70% test is kept as in the original -- confirm
# that this ratio is intended.
split_at = int(0.3 * len(data))
train_df, test_df = data.iloc[:split_at], data.iloc[split_at:]
train_df.shape, test_df.shape


((164194, 15), (383122, 15))

In [None]:
# Run an XGBoost model with hyperparameters that are optimized using hyperopt.
# The output of the script is the set of best hyperparameters.
# The optimization part using hyperopt is partly inspired by the following script:
# https://github.com/bamine/Kaggle-stuff/blob/master/otto/hyperopt_xgboost.py


# Data wrangling

import pandas as pd

# Scientific 

import numpy as np


# Machine learning

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Hyperparameters tuning

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Some constants

# Random seed: default `random_state` of optimize() (forwarded to XGBoost as
# 'seed') and the random_state of the train/validation split.
SEED = 314159265
# Fraction of train_df held out as the validation set by train_test_split.
VALID_SIZE = 0.2
# Name of the target (label) column.
TARGET = 'label'

# Scoring and optimization functions


def score(params):
    """Train one XGBoost model for a hyperopt trial and return its loss.

    The model is trained on the module-level ``train_features`` / ``y_train``
    and evaluated on ``valid_features`` / ``y_valid``.

    Parameters
    ----------
    params : dict
        XGBoost parameters sampled by hyperopt. Must contain 'n_estimators',
        which is used as the number of boosting rounds. The dict is not
        modified.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}`` where ``auc`` is the
        macro-averaged one-vs-rest ROC AUC on the validation set.
    """
    print("Training with params: ")

    # Work on a copy so the caller's dict keeps 'n_estimators' and does not
    # gain a 'num_class' key as a side effect of scoring.
    xgb_params = dict(params)
    num_round = int(xgb_params.pop('n_estimators'))
    xgb_params['num_class'] = len(np.unique(y_train))

    dtrain = xgb.DMatrix(train_features, label=y_train)
    dvalid = xgb.DMatrix(valid_features, label=y_valid)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(xgb_params, dtrain, num_round,
                          evals=watchlist,
                          verbose_eval=True)

    # No early stopping is configured, so the model always holds exactly
    # `num_round` rounds and a plain predict() uses all of them. The former
    # `ntree_limit=best_iteration + 1` argument selected the same rounds, but
    # `ntree_limit` was deprecated in XGBoost 1.4 and removed in 2.0.
    predictions = gbm_model.predict(dvalid)

    print(predictions)
    print(y_valid)
    # With 'multi:softprob' the predictions are per-class probabilities, which
    # is what the multiclass ROC AUC expects.
    auc = roc_auc_score(y_valid, predictions, multi_class='ovr', average="macro")
    # TODO: Add the importance for the selected features
    print("\tScore {0}\n\n".format(auc))

    # hyperopt minimizes, so report 1 - AUC as the loss.
    loss = 1 - auc
    return {'loss': loss, 'status': STATUS_OK}


def optimize(random_state=SEED, max_evals=50):
    """Search the XGBoost hyperparameter space with hyperopt's TPE algorithm.

    Parameters
    ----------
    random_state : int
        Value passed to XGBoost as its 'seed' parameter.
        NOTE(review): it is not forwarded to ``fmin``, so the search itself
        is not seeded here.
    max_evals : int
        Number of hyperparameter settings evaluated with ``score``.

    Returns
    -------
    dict
        The best value found for every tuned hyperparameter ('n_estimators',
        'eta', 'max_depth', 'min_child_weight', 'subsample', 'gamma',
        'colsample_bytree'). 'max_depth' is the actual tree depth, not the
        position of the chosen option.
    """
    # hyperopt is already a dependency of this notebook; imported locally so
    # this function does not rely on a change to the import cell.
    from hyperopt import space_eval

    # To learn more about XGBoost parameters, head to this page:
    # https://xgboost.readthedocs.io/en/stable/parameter.html
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # hp.quniform would yield floats, so max_depth is drawn with
        # hp.choice from an integer range instead (depths 1..13).
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        # Multiclass classification error rate, calculated as
        # #(wrong cases) / #(all cases).
        'eval_metric': 'merror',
        'objective': 'multi:softprob',
        # Increase this number if you have more cores. Otherwise, remove it
        # and it will default to the maximum number.
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        # The obsolete 'silent' flag was removed: XGBoost no longer uses it
        # and only printed a "Parameters: { silent } might not be used" warning.
        'seed': random_state
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, space, algo=tpe.suggest,
                max_evals=max_evals)

    # fmin reports hp.choice parameters as the *index* of the selected option,
    # so the raw result would give max_depth as 0..12 instead of 1..13.
    # space_eval maps the assignment back to real values; only the tuned keys
    # are returned so the result has the same keys as before.
    resolved = space_eval(space, best)
    return {name: resolved[name] for name in best}

#-------------------------------------------------#


# Processed data: train_df and test_df are produced by the split in the
# previous cell and are used here as they are.

#-------------------------------------------------#

# Extract the train and valid (used for validation) dataframes from train_df.
# The split is stratified on the target so both parts keep the class balance.
train, valid = train_test_split(train_df, test_size=VALID_SIZE,
                                random_state=SEED, stratify=train_df[TARGET])

y_train = train[TARGET]
y_valid = valid[TARGET]

train_features = train.drop([TARGET], axis=1)
valid_features = valid.drop([TARGET], axis=1)

print('The training set is of length: ', len(train.index))
print('The validation set is of length: ', len(valid.index))

#-------------------------------------------------#

# Run the optimization

# Trials object where the history of search will be stored.
# For the time being, there is a bug with the following version of hyperopt.
# You can read the error message in the log file.
# For the curious, you can read more about it here: https://github.com/hyperopt/hyperopt/issues/234
# => So it is not used: optimize() is called without a Trials object.

best_hyperparams = optimize()
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)


The training set is of length:  131355
The validation set is of length:  32839
Training with params:                                 
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	eval-merror:0.23113	train-merror:0.22256          

[1]	eval-merror:0.23600	train-merror:0.22517          

[2]	eval-merror:0.22665	train-merror:0.21585          

[3]	eval-merror:0.22623	train-merror:0.21551          

[4]	eval-merror:0.22291	train-merror:0.21224          

[5]	eval-merror:0.22230	train-merror:0.20961          

[6]	eval-merror:0.22132	train-merror:0.20810          

[7]	eval-merror:0.21959	train-merror:0.20583          

[8]	eval-merror:0.21916	train-merror:0.20400          

[9]	eval-merror:0.21873	train-merror:0.20282          

[10]	eval-merror:0.21822

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd


# Train an XGBoost classifier with the tuned hyperparameters on each dataset
# and report ROC AUC, a classification report and per-class ROC curves.
for data_name in dfs.keys():
    print(data_name)
    data = dfs[data_name].reset_index(drop=True)
    data = data.drop(['Registration_number', 'year_quarter'], axis=1)
    # 0 = failing, 1 = stable, 2 = growing
    data['label'] = [2 if label == 'growing' else 1 if label == 'stable' else 0 for label in data['label']]

    # Positional split: first 30% of the rows for training, the rest for testing.
    # The cut point must come from the dataset being processed; the previous
    # code used len(df), the length of the last file read by the loading loop,
    # which is wrong for datasets of a different size.
    split_at = int(0.3 * len(data))
    train_df, test_df = data.iloc[:split_at], data.iloc[split_at:]
    print(train_df.shape, test_df.shape)

    y_train = np.array(train_df['label'])
    X_train = train_df.drop(['label'], axis=1)

    y_test = np.array(test_df['label'])
    X_test = test_df.drop(['label'], axis=1)

    # NOTE(review): verify that best_hyperparams['max_depth'] holds the depth
    # itself and not the index of an hp.choice option.
    # NOTE(review): both n_jobs=-1 and nthread=4 are passed; confirm which
    # thread setting is intended.
    clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
             gamma=best_hyperparams['gamma'], max_delta_step=0.0, min_child_weight=best_hyperparams['min_child_weight'],
             missing=None, n_jobs=-1, objective='multi:softprob', random_state=42, reg_alpha=0.0,
             reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
             colsample_bytree=best_hyperparams['colsample_bytree'],
             eta=best_hyperparams['eta'],
             max_depth=best_hyperparams['max_depth'],
             subsample=best_hyperparams['subsample'],
             n_estimators=int(best_hyperparams['n_estimators']),
             nthread=4, use_label_encoder=False)
    clf.fit(X_train, y_train)

    # Predict once; the class predictions feed the report and the
    # probabilities feed the ROC metrics.
    y_pred = clf.predict(X_test)
    y_prob_pred = clf.predict_proba(X_test)

    # roc auc score
    print(roc_auc_score(y_test, y_prob_pred, multi_class='ovo', average='weighted'))

    n_class = 3
    class_labels = np.arange(n_class)
    target_names = ['Failing: 0', 'Stabel: 1', 'Growing: 2']
    # Passing `labels` keeps target_names aligned with the classes even when a
    # class is absent from y_test / y_pred.
    print(classification_report(y_test, y_pred, labels=class_labels, target_names=target_names))

    clf_report = classification_report(y_test,
                                       y_pred,
                                       labels=class_labels,
                                       target_names=target_names,
                                       output_dict=True)

    # .iloc[:-1, :] to exclude support
    report_df = pd.DataFrame(clf_report).iloc[:-1, :].T

    # Each rendering gets its own figure; previously imshow was drawn on top
    # of the seaborn heatmap in the same axes.
    fig, ax = plt.subplots()
    sns.heatmap(report_df, annot=True, ax=ax)
    fig.tight_layout()
    plt.show()

    fig, ax = plt.subplots()
    ax.imshow(report_df, cmap='hot', interpolation='nearest', aspect='auto')
    plt.show()

    # roc curve for classes (one-vs-rest, using the probability of class i)
    fpr = {}
    tpr = {}
    thresh = {}

    for i in range(n_class):
        fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_prob_pred[:, i], pos_label=i)

    # plotting
    plt.figure()
    plt.plot(fpr[0], tpr[0], linestyle='--', color='orange', label='Failing vs Rest')
    plt.plot(fpr[1], tpr[1], linestyle='--', color='green', label='Stabel vs Rest')
    plt.plot(fpr[2], tpr[2], linestyle='--', color='blue', label='Growing vs Rest')
    plt.title('Multiclass ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.grid(True)
    plt.savefig('Multiclass ROC', dpi=300);
    print("=================================================")
    # NOTE(review): this stops after the first dataset. Remove it to evaluate
    # every entry of `dfs`, and then give each ROC image its own file name,
    # because savefig would otherwise overwrite 'Multiclass ROC' each time.
    break

In [None]:
# import numpy as np
# import seaborn as sns
# from sklearn.metrics import classification_report
# import pandas as pd
# from sklearn.metrics import classification_report

# target_names = ['Failing: 0', 'Stabel: 1', 'Growing: 2']
# y_pred = clf.predict(X_test)
# print(classification_report(y_test, y_pred, target_names=target_names))

# clf_report = classification_report(y_test,
#                                    y_pred,
#                                    labels=np.arange(3),
#                                    target_names=target_names,
#                                    output_dict=True)


# # .iloc[:-1, :] to exclude support
# sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)


In [None]:
# from matplotlib import pyplot as plt

# sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)

# import matplotlib.pyplot as plt
# import numpy as np
# plt.tight_layout()

# plt.imshow(pd.DataFrame(clf_report).iloc[:-1, :].T, cmap='hot', interpolation='nearest',aspect='auto')
# plt.show()

# # roc curve for classes
# fpr = {}
# tpr = {}
# thresh ={}

# n_class = 3

# for i in range(n_class):    
#     fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_prob_pred[:,i], pos_label=i)
    
# # plotting    
# plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Failing vs Rest')
# plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Stabel vs Rest')
# plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Growing vs Rest')
# #plt.plot(fpr[3], tpr[3], linestyle='--',color='yellow', label='Class 3 vs Rest')
# plt.title('Multiclass ROC curve')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive rate')
# plt.legend(loc='best')
# plt.grid(True)
# plt.savefig('Multiclass ROC',dpi=300); 