In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
try:
    # joblib is distributed as a standalone package; the copy vendored under
    # sklearn.externals was removed in scikit-learn 0.23.
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

In [None]:
# Read the cleaned training set. `data_type` names the feature variant and is
# also the file-name prefix of the score files written and read further down.
data_type = 'clean_enriched'
data = pd.read_csv(filepath_or_buffer=data_type + '_train.csv', decimal='.', sep=';')

In [None]:
# Split the data into the target vector 'y' and the feature matrix 'X'.
y = data['EXTRA_BAGGAGE']

# Drop the target column from the features. `axis` is passed by keyword because
# pandas 2.0 no longer accepts it as a positional argument of DataFrame.drop.
X = data.drop('EXTRA_BAGGAGE', axis=1)

In [None]:
# We apply nested cross-validation to avoid the optimistic bias of reporting the
# same cross-validation score that was used by the grid-search to tune parameters.

# Nested cross-validation is run over several classification methods to obtain
# their generalization score and pick the one with the highest score. After this,
# a simple cross-validation with grid-search gives the best hyper-parameter
# configuration for that algorithm. Finally the model is trained with all the
# training data and the hyper-parameter configuration that yields the best results.

# variance: removes useless variables whose variance is equal to zero.
# scaler: standardizes each column to have mean equal to zero and variance equal to 1,
# in order to give the same importance to variables with different measurement units.
lr_pipe = Pipeline([
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(class_weight='balanced', random_state=621473)),
])

# Inverse regularization strengths explored by the inner grid-search.
param_grid = {'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# shuffle keeps its default (False), so the folds are deterministic and a
# random_state would have no effect; recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True.
inner_cv = StratifiedKFold(n_splits=10)
outer_cv = StratifiedKFold(n_splits=10)

# Inner loop: hyper-parameter tuning scored with weighted F1.
gs_cv = GridSearchCV(lr_pipe, param_grid=param_grid, n_jobs=-1, scoring='f1_weighted', cv=inner_cv, verbose=10)

# Outer loop: no `scoring` is passed, so each outer fold is scored with
# GridSearchCV.score, which reuses the 'f1_weighted' scorer configured above.
lr_nested_cv_f1_scores = cross_val_score(gs_cv, X, y, cv=outer_cv, verbose=10)

# Persist the per-fold scores so they can be plotted later without re-running the search.
joblib.dump(lr_nested_cv_f1_scores, data_type + '_lr_nested_cv_f1_scores.pkl', compress=1)

print("##### Generalization F1 Score: mean =", np.mean(lr_nested_cv_f1_scores), "std =", np.std(lr_nested_cv_f1_scores))
print()

##### Encoded:   Generalization F1 Score: mean = 0.663232566525 std = 0.00577941758843
##### Binarized  Generalization F1 Score: mean = 0.662300707325 std = 0.00639698194352
##### Enriched:  Generalization F1 Score: mean = 0.668949634197 std = 0.00667454322097
##### Enriched2: Generalization F1 Score: mean = 0.668859982719 std = 0.00605384828548

In [None]:
# Nested cross-validation of an L1-penalized linear SVM behind the same
# zero-variance filter and standardization steps.
linear_svm_pipe = Pipeline([
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('linear_svm', LinearSVC(penalty='l1', dual=False, random_state=123456, class_weight='balanced')),
])

# Regularization strengths explored by the inner grid-search.
param_grid = {'linear_svm__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# shuffle keeps its default (False), so the folds are deterministic and a
# random_state would have no effect; recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True.
inner_cv = StratifiedKFold(n_splits=10)
outer_cv = StratifiedKFold(n_splits=10)

# Inner loop: hyper-parameter tuning scored with weighted F1.
gs_cv = GridSearchCV(linear_svm_pipe, param_grid=param_grid, n_jobs=-1, scoring='f1_weighted', cv=inner_cv, verbose=10)

# Outer loop: each fold is scored through GridSearchCV.score, i.e. with the
# same 'f1_weighted' scorer.
linear_svm_nested_cv_f1_scores = cross_val_score(gs_cv, X, y, cv=outer_cv, verbose=10)

# Persist the per-fold scores for later inspection.
joblib.dump(linear_svm_nested_cv_f1_scores, data_type + '_linear_svm_nested_cv_f1_scores.pkl', compress=1)

print("##### Generalization F1 Score: mean =", np.mean(linear_svm_nested_cv_f1_scores), "std =", np.std(linear_svm_nested_cv_f1_scores))
print()

##### Enriched: Generalization F1 Score: mean = 0.667330992787 std = 0.00671589397452

In [None]:
# Nested cross-validation of a class-weighted random forest behind the same
# zero-variance filter and standardization steps.
rf_pipe = Pipeline([
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=573146, class_weight='balanced')),
])

# Only the number of trees is tuned; the depth / leaf-size grids are currently disabled.
param_grid = {'rf__n_estimators': list(range(200, 2300, 300))}
# param_grid['rf__max_depth'] = [10, 20, 30]
# param_grid['rf__min_samples_leaf'] = [50, 60, 70, 80, 90, 100]

# shuffle keeps its default (False), so the folds are deterministic and a
# random_state would have no effect; recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True.
inner_cv = StratifiedKFold(n_splits=10)
outer_cv = StratifiedKFold(n_splits=10)

# Inner loop: hyper-parameter tuning scored with weighted F1.
gs_cv = GridSearchCV(rf_pipe, param_grid=param_grid, n_jobs=-1, scoring='f1_weighted', cv=inner_cv, verbose=10)

# Outer loop: each fold is scored through GridSearchCV.score, i.e. with the
# same 'f1_weighted' scorer.
rf_nested_cv_f1_scores = cross_val_score(gs_cv, X, y, cv=outer_cv, verbose=10)

# Persist the per-fold scores for later inspection.
joblib.dump(rf_nested_cv_f1_scores, data_type + '_rf_nested_cv_f1_scores.pkl', compress=1)

print("##### Generalization F1 Score: mean =", np.mean(rf_nested_cv_f1_scores), "std =", np.std(rf_nested_cv_f1_scores))
print()

##### Encoded: Generalization F1 Score: mean = 0.687527970451 std = 0.00504951880128
##### Enriched: Generalization F1 Score: mean = 0.786092676427 std = 0.0114704698622

In [None]:
# Function to plot boxplot of nested cross-validation scores.
def plot_cv_scores(cv_scores):
    """Print the cross-validation scores and show them as a vertical boxplot.

    Parameters
    ----------
    cv_scores : array-like of float
        One score per outer cross-validation fold.

    The value axis is fixed to [0, 1] so that plots of different models can be
    compared visually.
    """
    print(cv_scores)
    print()

    sns.set_style("whitegrid")

    # Pass the scores as `y` so the box is vertical on every seaborn version:
    # with only `x` given, current seaborn ignores orient="v" and draws a
    # horizontal box, which would make the y-limits below hit the wrong axis.
    ax = sns.boxplot(y=cv_scores)

    # Set the limits on the axes seaborn actually drew on.
    ax.set_ylim(0, 1)

    plt.show()

In [None]:
# Reload the stored logistic-regression nested-CV scores and plot their spread.
lr_nested_cv_f1_scores = joblib.load(
    filename=data_type + '_lr_nested_cv_f1_scores.pkl'
)

plot_cv_scores(cv_scores=lr_nested_cv_f1_scores)

In [None]:
def plot_variable_ranking(model, model_type, column_names):
    """Show a bar chart with one bar per feature ranking its influence on `model`.

    Parameters
    ----------
    model : fitted estimator
        For ``model_type == "rf"`` it must expose ``feature_importances_`` and
        ``estimators_`` (each member exposing ``feature_importances_``).
        For ``model_type == "lr"`` it must expose ``coef_``.
    model_type : str
        ``"rf"`` plots the importances with the standard deviation across the
        ensemble members as error bars; ``"lr"`` plots the raw coefficients.
        Any other value draws nothing.
    column_names : sequence of str
        Feature names, one per importance value; used as tick labels.
    """
    if model_type == "rf":
        importance = pd.DataFrame(model.feature_importances_, index=column_names, columns=["Importance"])
        # Spread of each feature's importance across the individual ensemble members.
        importance["Std"] = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)

        positions = range(importance.shape[0])
        # Columns are selected by label; the `.ix` indexer no longer exists in pandas >= 1.0.
        heights = importance["Importance"]
        errors = importance["Std"]

        plt.bar(positions, heights, yerr=errors, align="center")
        plt.xticks(range(len(column_names)), column_names)
        plt.show()
    elif model_type == "lr":
        coefficients = np.asarray(model.coef_)
        # A binary model stores coef_ with shape (1, n_features); take that single
        # row so there is one value per column name.
        if coefficients.ndim == 2 and coefficients.shape[0] == 1:
            coefficients = coefficients[0]
        importance = pd.DataFrame(coefficients, index=column_names, columns=["Importance"])

        positions = range(importance.shape[0])
        heights = importance["Importance"]

        plt.bar(positions, heights, align="center")
        plt.xticks(range(len(column_names)), column_names)
        plt.show()

In [None]:
# Nested cross-validation of logistic regression fitted on PCA components
# computed from the filtered, standardized features.
pca_lr_pipe = Pipeline([
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=554197)),
    ('lr', LogisticRegression(class_weight='balanced', random_state=621473)),
])

# Jointly tune the regularization strength and the number of retained components.
param_grid = {
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'pca__n_components': list(range(2, 14, 2)),
}

# shuffle keeps its default (False), so the folds are deterministic and a
# random_state would have no effect; recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True.
inner_cv = StratifiedKFold(n_splits=10)
outer_cv = StratifiedKFold(n_splits=10)

# Inner loop: hyper-parameter tuning scored with weighted F1.
gs_cv = GridSearchCV(pca_lr_pipe, param_grid=param_grid, n_jobs=-1, scoring='f1_weighted', cv=inner_cv, verbose=10)

# Outer loop: each fold is scored through GridSearchCV.score, i.e. with the
# same 'f1_weighted' scorer.
pca_lr_nested_cv_f1_scores = cross_val_score(gs_cv, X, y, cv=outer_cv, verbose=10)

# Persist the per-fold scores for later inspection.
joblib.dump(pca_lr_nested_cv_f1_scores, data_type + '_pca_lr_nested_cv_f1_scores.pkl', compress=1)

print("##### Generalization F1 Score: mean =", np.mean(pca_lr_nested_cv_f1_scores), "std =", np.std(pca_lr_nested_cv_f1_scores))
# A bare `print` is a no-op expression in Python 3; call it to emit the blank line.
print()

##### Encoded:   Generalization F1 Score: mean = 0.660731584574 std = 0.00561434867445
##### Binarized: Generalization F1 Score: mean = 0.655650138449 std = 0.00599611460643
##### Enriched:  Generalization F1 Score: mean = 0.667960467816 std = 0.00757189854774

In [None]:
# Reload the stored PCA + logistic-regression nested-CV scores and plot their spread.
pca_lr_nested_cv_f1_scores = joblib.load(
    filename=data_type + '_pca_lr_nested_cv_f1_scores.pkl'
)

plot_cv_scores(cv_scores=pca_lr_nested_cv_f1_scores)

In [None]:
# Nested cross-validation of a class-weighted random forest fitted on PCA
# components computed from the filtered, standardized features.
pca_rf_pipe = Pipeline([
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=554197)),
    ('rf', RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=573146, class_weight='balanced')),
])

# Jointly tune the number of trees and the number of retained components.
param_grid = {
    'rf__n_estimators': list(range(200, 2300, 300)),
    'pca__n_components': list(range(2, 14, 2)),
}

# shuffle keeps its default (False), so the folds are deterministic and a
# random_state would have no effect; recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True.
inner_cv = StratifiedKFold(n_splits=10)
outer_cv = StratifiedKFold(n_splits=10)

# Inner loop: hyper-parameter tuning scored with weighted F1.
gs_cv = GridSearchCV(pca_rf_pipe, param_grid=param_grid, n_jobs=-1, scoring='f1_weighted', cv=inner_cv, verbose=10)

# Outer loop: each fold is scored through GridSearchCV.score, i.e. with the
# same 'f1_weighted' scorer.
pca_rf_nested_cv_f1_scores = cross_val_score(gs_cv, X, y, cv=outer_cv, verbose=10)

# Persist the per-fold scores for later inspection.
joblib.dump(pca_rf_nested_cv_f1_scores, data_type + '_pca_rf_nested_cv_f1_scores.pkl', compress=1)

print("##### Generalization F1 Score: mean =", np.mean(pca_rf_nested_cv_f1_scores), "std =", np.std(pca_rf_nested_cv_f1_scores))

##### Generalization F1 Score: mean = 0.774613129297 std = 0.00935455148606

In [None]:
# Reload the stored PCA + random-forest nested-CV scores and plot their spread.
pca_rf_nested_cv_f1_scores = joblib.load(
    filename=data_type + '_pca_rf_nested_cv_f1_scores.pkl'
)

plot_cv_scores(cv_scores=pca_rf_nested_cv_f1_scores)

In [None]:
# Nested cross-validation of logistic regression fitted on the features kept by
# recursive feature elimination (RFE), itself driven by a logistic regression.
rfe_lr_pipe = Pipeline([
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('rfe', RFE(LogisticRegression(class_weight='balanced', random_state=348744), step=4)),
    ('lr', LogisticRegression(class_weight='balanced', random_state=621473)),
])

# Tune the final classifier, the number of kept features and the
# regularization of the estimator that ranks features inside RFE.
param_grid = {
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'rfe__n_features_to_select': list(range(5, 18, 5)),
    'rfe__estimator__C': [0.1, 1, 10],
}

# shuffle keeps its default (False), so the folds are deterministic and a
# random_state would have no effect; recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True.
inner_cv = StratifiedKFold(n_splits=10)
outer_cv = StratifiedKFold(n_splits=10)

# Inner loop: hyper-parameter tuning scored with weighted F1.
gs_cv = GridSearchCV(rfe_lr_pipe, param_grid=param_grid, n_jobs=-1, scoring='f1_weighted', cv=inner_cv, verbose=10)

# Outer loop: each fold is scored through GridSearchCV.score, i.e. with the
# same 'f1_weighted' scorer.
rfe_lr_nested_cv_f1_scores = cross_val_score(gs_cv, X, y, cv=outer_cv, verbose=10)

# Persist the per-fold scores for later inspection.
joblib.dump(rfe_lr_nested_cv_f1_scores, data_type + '_rfe_lr_nested_cv_f1_scores.pkl', compress=1)

print("##### Generalization F1 Score: mean =", np.mean(rfe_lr_nested_cv_f1_scores), "std =", np.std(rfe_lr_nested_cv_f1_scores))
print()

##### Encoded:   Generalization F1 Score: mean = 0.681933347276 std = 0.0138063133181
##### Binarized: Generalization F1 Score: mean = 0.661206281182 std = 0.00693543250238
##### Enriched:  Generalization F1 Score: mean = 0.685667061666 std = 0.00621622857699

In [None]:
# Reload the stored RFE + logistic-regression nested-CV scores and plot their spread.
rfe_lr_nested_cv_f1_scores = joblib.load(
    filename=data_type + '_rfe_lr_nested_cv_f1_scores.pkl'
)

plot_cv_scores(cv_scores=rfe_lr_nested_cv_f1_scores)

In [None]:
# Nested cross-validation of a class-weighted random forest fitted on the
# features kept by recursive feature elimination (RFE) driven by a logistic regression.
rfe_rf_pipe = Pipeline([
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('rfe', RFE(LogisticRegression(class_weight='balanced', random_state=348744), step=4)),
    ('rf', RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=573146, class_weight='balanced')),
])

# Tune the number of trees, the number of kept features and the
# regularization of the estimator that ranks features inside RFE.
param_grid = {
    'rf__n_estimators': list(range(200, 2300, 300)),
    'rfe__n_features_to_select': list(range(5, 18, 5)),
    'rfe__estimator__C': [0.1, 1, 10],
}

# shuffle keeps its default (False), so the folds are deterministic and a
# random_state would have no effect; recent scikit-learn releases raise a
# ValueError when random_state is given without shuffle=True.
inner_cv = StratifiedKFold(n_splits=10)
outer_cv = StratifiedKFold(n_splits=10)

# Inner loop: the search must wrap rfe_rf_pipe (not rfe_lr_pipe), since the
# grid above addresses the 'rf' step that only this pipeline contains.
gs_cv = GridSearchCV(rfe_rf_pipe, param_grid=param_grid, n_jobs=-1, scoring='f1_weighted', cv=inner_cv, verbose=10)

# Outer loop: each fold is scored through GridSearchCV.score, i.e. with the
# same 'f1_weighted' scorer.
rfe_rf_nested_cv_f1_scores = cross_val_score(gs_cv, X, y, cv=outer_cv, verbose=10)

# Persist the per-fold scores for later inspection.
joblib.dump(rfe_rf_nested_cv_f1_scores, data_type + '_rfe_rf_nested_cv_f1_scores.pkl', compress=1)

print("##### Generalization F1 Score: mean =", np.mean(rfe_rf_nested_cv_f1_scores), "std =", np.std(rfe_rf_nested_cv_f1_scores))
print()