In [None]:
# import sys
# !{sys.executable} -m pip install scikit-learn --user
# !{sys.executable} -m pip install sklearn_pandas --user
# !{sys.executable} -m pip install lightgbm --user
# !{sys.executable} -m pip install matplotlib --user
# !{sys.executable} -m pip install hyperopt --user

In [None]:
from time import time
from hyperopt import tpe, fmin, hp, STATUS_OK, Trials
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from copy import deepcopy
from sklearn.neural_network import MLPClassifier
import numpy as np

# Data science imports
import pandas as pd
from enum import Enum
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn_pandas import DataFrameMapper

# Visualisation imports
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# File IO: read every line of the training file into memory.
# The context manager closes the handle even if reading raises.
with open('train_data.txt', mode = 'r') as file:
    lines = file.readlines()

In [None]:
# Check for completeness: display how many raw lines were read from the file.
len(lines)

In [None]:
# Pre-processing: turn the raw lines into one dict per record.
# A record spans RECORD_SIZE consecutive lines. The first line of each record
# is not read; each of the following seven lines contributes one field, taken
# as the second space-separated token with surrounding whitespace stripped.
RECORD_SIZE = 8
FIELD_NAMES = ['id', 'durationOfStay', 'gender', 'Age', 'kids', 'destinationCode', 'AcomType']

data = []
# Only iterate over complete records so indexing can never run past the end
# of `lines` when the file has an incomplete trailing record.
for start in range(0, len(lines) - RECORD_SIZE + 1, RECORD_SIZE):
    data.append({
        field: lines[start + offset].split(" ")[1].strip()
        for offset, field in enumerate(FIELD_NAMES, start=1)
    })

# Surface leftover non-blank lines instead of dropping them silently.
leftover = [line for line in lines[len(data) * RECORD_SIZE:] if line.strip()]
if leftover:
    print('Ignored {} trailing line(s) that do not form a complete record'.format(len(leftover)))

In [None]:
# Structure the data
def structure_data_to_df(data, test_data=False):
    """Convert the parsed record dicts into a typed DataFrame.

    Parameters
    ----------
    data : list of dict
        Records keyed by field name, all values still strings.
    test_data : bool, default False
        When True the frame keeps 'id' and has no 'AcomType' column;
        when False it keeps 'AcomType' and drops 'id'.

    Returns
    -------
    pandas.DataFrame
        String columns cast with ``astype(str)``, the markers '<NA>', 'NA'
        and 'nan' replaced by NaN, and 'durationOfStay' / 'Age' converted
        with ``pd.to_numeric``.
    """
    feature_cols = ['durationOfStay', 'gender', 'Age', 'kids', 'destinationCode']
    if test_data:
        selected = ['id'] + feature_cols
        text_cols = ['id']
    else:
        selected = feature_cols + ['AcomType']
        text_cols = ['AcomType']
    text_cols = text_cols + ['gender', 'kids', 'destinationCode']

    df = pd.DataFrame(data, columns=selected)
    for name in text_cols:
        df[name] = df[name].astype(str)

    # astype(str) turns real missing values into the text 'nan', so that
    # marker is mapped back to NaN together with the file's own NA markers.
    for marker in ('<NA>', 'NA', 'nan'):
        df = df.replace(marker, np.nan)

    numeric_cols = ['durationOfStay', 'Age']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

    return df
    
# Build the training frame; with the default test_data=False the 'AcomType'
# column is kept and 'id' is dropped.
df = structure_data_to_df(data)

In [None]:
# utility variables: column groups consumed by build_imputer_and_transformer.
# The "missing_*" groups are the ones that get an imputation step before
# encoding/scaling; the other groups are encoded/scaled directly.
categorical_cols = ['gender']
missing_categorical_cols = ['kids', 'destinationCode']
numerical_cols = ['durationOfStay']
missing_numerical_cols = ['Age']

# transform utility function
def build_imputer_and_transformer(missing_categorical_cols, categorical_cols, 
                                  missing_numerical_cols, numerical_cols, constant=True):
    """Assemble the DataFrameMapper that imputes, encodes and scales the features.

    Parameters
    ----------
    missing_categorical_cols : list of str
        Categorical columns to impute and then binarize, one mapper entry each.
    categorical_cols : list of str
        Categorical columns that are binarized without imputation.
    missing_numerical_cols : list of str
        Numerical columns that are mean-imputed and standardized together.
    numerical_cols : list of str
        Numerical columns that are standardized together.
    constant : bool, default True
        True fills missing categoricals with the constant 'N/A';
        False fills them with the most frequent value.

    Returns
    -------
    DataFrameMapper
        Unfitted mapper; entries are ordered as missing categoricals,
        categoricals, missing numericals, numericals.
    """
    def _categorical_imputer():
        # A fresh imputer per column so no fitted state is shared.
        if constant:
            return SimpleImputer(strategy='constant', fill_value='N/A')
        return SimpleImputer(strategy='most_frequent')

    # The imputer is given a one-element column list, while LabelBinarizer on
    # its own is given the bare column name.
    steps = [([name], [_categorical_imputer(), LabelBinarizer()])
             for name in missing_categorical_cols]
    steps += [(name, LabelBinarizer()) for name in categorical_cols]
    steps += [
        (missing_numerical_cols, [SimpleImputer(strategy='mean'), StandardScaler()]),
        (numerical_cols, StandardScaler()),
    ]

    return DataFrameMapper(steps)

In [None]:
# Separate the target column from the feature columns.
y = df['AcomType'].copy()
X = df.drop(['AcomType'], axis=1)
    
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)
    
transformer = build_imputer_and_transformer(missing_categorical_cols, categorical_cols,
                                             missing_numerical_cols, numerical_cols, constant=True)
    
# Fit the transformer on the training split only, then apply the fitted
# transformer to the held-out split. From here on X_train / X_test refer to
# the transformer outputs, while X still holds the raw feature frame.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [None]:
# Option lists for the categorical SVC hyperparameters explored by hyperopt.
# They are also used after the search to map the returned choice indices back
# to their values.
kernel_list = ['rbf', 'poly']
# kernel_list = ['linear']
class_weights = ['balanced']

In [None]:
def optimize(evals, trials, optimizer=tpe.suggest):
    """Run the hyperopt search over the SVC hyperparameters.

    Parameters
    ----------
    evals : int
        Maximum number of objective evaluations (passed as ``max_evals``).
    trials : hyperopt.Trials
        Trials object that records every evaluation.
    optimizer : callable, default tpe.suggest
        Search algorithm handed to ``fmin`` as ``algo``.

    Returns
    -------
    dict
        The result of ``fmin``. The 'kernel' and 'class_weight' entries are
        indices into ``kernel_list`` / ``class_weights``, not the values.

    Notes
    -----
    Relies on the module-level ``score`` objective and ``pbar`` progress bar.
    """
    space = {
        'kernel': hp.choice('kernel', kernel_list),
        'class_weight': hp.choice('class_weight', class_weights),
        'C': hp.uniform('C', 0, 20),
        'gamma': hp.uniform('gamma',0.01,10)
    }
    try:
        best = fmin(score, space, algo=optimizer, max_evals=evals, trials=trials)
    finally:
        # Close the progress bar even when the search raises or is interrupted.
        pbar.close()
    return best

In [None]:
def score(params):
    """Hyperopt objective: cross-validated balanced accuracy of an SVC.

    Parameters
    ----------
    params : dict
        Must contain 'C', 'gamma', 'kernel' and 'class_weight' with values
        accepted by ``SVC``.

    Returns
    -------
    dict
        ``{'loss': 1 - mean balanced accuracy, 'status': STATUS_OK}``.

    Notes
    -----
    Reads the module-level ``X_train`` / ``y_train`` and advances the
    module-level ``pbar`` progress bar by one step per call.
    """
    model = SVC(kernel=params['kernel'],
                C=params['C'],
                random_state=42,
                decision_function_shape='ovr',
                class_weight=params['class_weight'],
                gamma=params['gamma'])

    # Cross-validate on the transformed training matrix. The raw feature
    # frame X still contains string categories and missing values, which SVC
    # cannot fit, and using the training split keeps the held-out split out
    # of the hyperparameter search.
    scores = cross_val_score(model, X_train, y_train, cv=5,
                             scoring='balanced_accuracy', n_jobs=20)
    scores_mean = scores.mean()

    print('Parameters with this training score {} :'.format(scores_mean))
    print(params)
    pbar.update()
    # hyperopt minimises the loss, so the score is inverted.
    return {'loss': 1-scores_mean, 'status': STATUS_OK}

In [None]:
# NOTE: this rebinds the name `time` to the module; the imports cell bound it
# to the function via `from time import time`.
import time
from tqdm import tqdm

# Trials object that collects the history of every evaluation.
trials = Trials()
cores = 20
start = time.time()
# Evaluation budget for the search; also the total shown by the progress bar.
evaluations = 1000
# Progress bar shared with score() (one update per evaluation) and closed by optimize().
pbar = tqdm(total=evaluations, desc="Hyperopt")
best_param = optimize(evals=evaluations,
                      optimizer=tpe.suggest,
                      trials=trials)
print("------------------------------------")
print("The best hyperparameters are: ", "\n")
print(best_param)
end = time.time()
print('Time elapsed to optimize {0} executions: {1}'.format(evaluations, end - start))

In [None]:
# fmin reports hp.choice parameters as indices into their option lists, so map
# them back to the actual values. The membership guards make this cell safe to
# re-run: an already-decoded value is left alone instead of being used as an index.
if best_param['kernel'] not in kernel_list:
    best_param['kernel'] = kernel_list[best_param['kernel']]
if best_param['class_weight'] not in class_weights:
    best_param['class_weight'] = class_weights[best_param['class_weight']]
print('\n Best score:')
score(best_param)

In [None]:
# Train the final model with the best hyperparameters found by the search.
# probability=True is required for predict_proba, which is called on this
# fitted model afterwards; without it SVC does not provide that method.
SVC_opt = SVC(kernel=best_param['kernel'],
            C=best_param['C'],
            random_state=42,
            decision_function_shape='ovr',
            class_weight=best_param['class_weight'],
            gamma=best_param['gamma'],
            probability=True)

fitted_model = SVC_opt.fit(X=X_train,y=y_train)

In [None]:
# Per-class probability estimates for the held-out split.
# NOTE(review): SVC only provides predict_proba when the estimator was
# constructed with probability=True — confirm the fitted model enables it.
y_score = fitted_model.predict_proba(X_test)
y_score

In [None]:
# Plain accuracy of the fitted model's predictions on the held-out split.
test_acc = accuracy_score(y_test, fitted_model.predict(X_test))
test_acc