In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split

import sys
sys.path.append('/project/dissertation')

# Load custom modules
from utils.config import config as cf
import utils.feature_eng_utils as feature_utils

import warnings
warnings.filterwarnings(action='once')

In [14]:
df = pd.read_csv(cf.DATA_PATH+'/train.csv', parse_dates=['Date'])
df['Description'] = df['Description'].fillna('')

df = feature_utils.preprocess_description(df, True)

Total  285  rows
Total  1040  rows
Total  10  rows


In [15]:
from sklearn.preprocessing import StandardScaler

fasttext_embeddings = np.load(cf.EXPORT_PATH+'/text_embeddings/FastText.npy')
desc_embedding = pd.DataFrame(fasttext_embeddings)

scaler = StandardScaler()
df['Amount_t'] = scaler.fit_transform(df['Amount_logabs'].values.reshape(-1,1))
X = pd.concat([desc_embedding,  df[['isExpense', 'isAcctNo']]], axis=1)
y = df['label']

In [16]:
# Define a function to perform Grid search based on the given model & params
def perform_grid_search(X, y, model, param_grid, cv = cf.CV):
    kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=99)
    grid_results = GridSearchCV(estimator=model, param_grid=param_grid, cv= kfold , scoring='f1_macro', n_jobs=-1, iid = False)
    grid_results.fit(X, y)
    print('> Grid search for {}'.format(model.__class__.__name__))
    print('Best Parameters: ',grid_results.best_params_)
    print('Best Score: ',grid_results.best_score_)
    print('-' * 50)
    return grid_results

In [21]:
rf_model = RandomForestClassifier(class_weight='balanced', random_state = cf.RANDOM_ST)
param_grid_rf = {
 'n_estimators': [50, 200, 500],
 'max_features': ['auto'],
 'max_depth': [None, 20, 40, 60, 80, 100]
}
grid_randf = perform_grid_search(X, y, rf_model, param_grid_rf)



> Grid search for RandomForestClassifier
Best Parameters:  {'max_depth': 40, 'max_features': 'auto', 'n_estimators': 50}
Best Score:  0.8018934853170642
--------------------------------------------------


In [22]:
xtree_model = ExtraTreesClassifier(class_weight='balanced', random_state=cf.RANDOM_ST)
param_grid_xtree = {
 'n_estimators': [50, 200, 500, 700, 1000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [None, 20, 40, 60, 80, 100]
}
grid_xtree = perform_grid_search(X, y, xtree_model, param_grid_xtree)



> Grid search for ExtraTreesClassifier
Best Parameters:  {'max_depth': 20, 'max_features': 'auto', 'n_estimators': 200}
Best Score:  0.8014457858543315
--------------------------------------------------


In [23]:
svc_model = SVC(kernel= 'rbf', gamma='scale', class_weight='balanced', random_state = cf.RANDOM_ST)
param_grid_svc = {
    'C' : [1, 10, 20, 50, 100],
    'gamma' : ['scale', 0.01, 0.1, 1, 10],
    'kernel': ['rbf']
}
grid_svc = perform_grid_search(X, y, svc_model, param_grid_svc)

> Grid search for SVC
Best Parameters:  {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Best Score:  0.8257022383917704
--------------------------------------------------


In [None]:
logreg_model = LogisticRegression(solver='lbfgs', multi_class='auto', class_weight='balanced', random_state = cf.RANDOM_ST)
param_grid_logreg = {
    'C' : [0.1, 1, 10, 100],
    'solver' : ['newton-cg', 'sag', 'saga', 'lbfgs']
}
grid_svc = perform_grid_search(X, y, logreg_model, param_grid_logreg)

In [None]:
xgboost_model = XGBClassifier(verbosity=0, random_state = 99)
param_grid_xgb={
#     'max_depth': [2], #[3,4,5,6,7,8,9], # 5 is good but takes too long in kaggle env
#     'subsample': [0.6], #[0.4,0.5,0.6,0.7,0.8,0.9,1.0],
#     'colsample_bytree': [0.5], #[0.5,0.6,0.7,0.8],
    'learning_rate': [0.1, 0.3, 0.5, 0.7],
    'n_estimators': [100, 250, 500]
#     'reg_alpha': [0.01, 0.02, 0.03, 0.04]
}
grid_xgboost = perform_grid_search(X, y, xgboost_model, param_grid_xgb)

