In [6]:
import warnings
warnings.filterwarnings('ignore') # suppresses all warnings from here on; comment this line out to see them again

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# Read the train and test splits (in that order) from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../datasets/classification/train.csv",
        "../../datasets/classification/test.csv",
    )
)

In [7]:
# --- train split -----------------------------------------------------------
# Columns holding at least one missing value; rows missing any of them are dropped.
na_cols = [name for name in train.columns if train[name].isna().any()]
train = train.dropna(axis=0, subset=na_cols)

# Separate the features from the "satisfaction" target.
X_train, y_train = train.drop("satisfaction", axis = 1), train["satisfaction"]

# --- test split ------------------------------------------------------------
# Same treatment; na_cols is recomputed because the test split may be missing
# values in different columns.
na_cols = [name for name in test.columns if test[name].isna().any()]
test = test.dropna(axis=0, subset=na_cols)

X_test, y_test = test.drop("satisfaction", axis = 1), test["satisfaction"]

In [8]:
# Integer code assigned to each label of the categorical feature columns.
# Series.map turns any label that is not a key of its mapping into NaN.
category_codes = {
    "Gender": {"Male": 0, "Female": 1},
    "Customer Type": {"Loyal Customer": 0, "disloyal Customer": 1},
    "Type of Travel": {"Personal Travel": 0, "Business travel": 1},
    "Class": {"Eco Plus": 0, "Business": 1, "Eco": 2},
}

# Encode both splits with the same table so their columns stay comparable.
# NOTE: run once per fresh kernel -- a second run would map the already
# encoded integers (which are not keys of the tables) to NaN.
for split_features in (X_train, X_test):
    for column_name, label_to_code in category_codes.items():
        split_features[column_name] = split_features[column_name].map(label_to_code)

In [9]:
def print_results(model):
    """Print a text report for a fitted parameter search.

    The report contains, in order: the best parameter combination, one line
    per combination tried (mean cross-validation score and twice its standard
    deviation), and a classification report for the predictions made on the
    notebook-level ``X_test`` compared against ``y_test``.

    Parameters
    ----------
    model : fitted search object
        Must expose ``best_params_``, ``cv_results_`` (with the keys
        ``mean_test_score``, ``std_test_score`` and ``params``) and
        ``predict``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    gap = "\n\n"  # end the current line, then leave one empty line

    print("Best parameters set found on train set:", end=gap)
    # if best is linear there is no gamma parameter
    print(model.best_params_, end=gap)

    print("Grid scores on train set:", end=gap)
    cv = model.cv_results_
    rows = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg, spread, combo in rows:
        # the spread is doubled, so the printed band is two standard deviations
        print("%0.3f (+/-%0.03f) for %r" % (avg, spread * 2, combo))
    print()

    print("Detailed classification report for the best parameter set:", end=gap)
    print("The model is trained on the full train set.")
    print("The scores are computed on the full test set.", end=gap)

    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions), end=gap)

In [10]:
# Labels of the classifiers to fit, in the order they are run.
model_lbls = ['dt', 'nb', 'lp', 'svc', 'knn']

# Parameter grids explored by the search, one per classifier.
# Several grids are pinned to a single value; the wider, currently disabled
# alternatives are recorded in the comments next to them.
tuned_param_dt = [{'max_depth': [*range(1, 20)]}]

# disabled alternative: [10, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-07, 1e-8, 1e-9, 1e-10]
tuned_param_nb = [{'var_smoothing': [1e-5]}]

tuned_param_lp = [{'early_stopping': [True]}]

# disabled alternatives:
#   an extra grid {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
#   'C': [1, 10, 100, 1000] for the linear kernel
tuned_param_svc = [{'kernel': ['linear'], 'C': [100]}]

# disabled alternative: 2, 3, 4, 5, 6, 7, 8, 9, 10 in addition to 5
tuned_param_knn = [{'n_neighbors': [5]}]

# For every label: the display name (padded with trailing spaces), the
# unfitted estimator, and the grid to search.
models = {
    label: {'name': display_name, 'estimator': estimator, 'param': grid}
    for label, display_name, estimator, grid in (
        ('dt', 'Decision Tree       ', DecisionTreeClassifier(), tuned_param_dt),
        ('nb', 'Gaussian Naive Bayes', GaussianNB(), tuned_param_nb),
        ('lp', 'Linear Perceptron   ', Perceptron(), tuned_param_lp),
        ('svc', 'Support Vector      ', SVC(), tuned_param_svc),
        ('knn', 'K Nearest Neighbor ', KNeighborsClassifier(), tuned_param_knn),
    )
}

# Metrics to optimise, one full search round per entry.
scores = ["precision", "recall", "f1", "accuracy"]

In [11]:
# Best cross-validation score per model label; every entry is overwritten on
# each metric round, so it always reflects the metric printed in the summary.
results_short = {}

for score in scores:
    # Every metric except accuracy is requested in its macro-averaged form,
    # e.g. "precision" -> "precision_macro".
    scoring_name = score if score == "accuracy" else '%s_macro' % score

    print('='*40)
    print("# Tuning hyper-parameters for %s" % score)
    print()

    for m in model_lbls:
        spec = models[m]
        print('-'*40)
        print("Trying model {}".format(spec['name']))
        clf = GridSearchCV(
            spec['estimator'],
            spec['param'],
            cv=5,
            scoring=scoring_name,
            return_train_score=False,
            n_jobs=2,  # run the search on two workers in parallel
        )
        clf.fit(X_train, y_train)
        print_results(clf)
        results_short[m] = clf.best_score_

    # Per-metric summary: best cross-validation score of each model, as a percentage.
    print("Summary of results for {}".format(score))
    print("Estimator")
    for m, best_score in results_short.items():
        print("{}\t - score: {:5.2f}%".format(models[m]['name'], best_score*100))

# Tuning hyper-parameters for precision

----------------------------------------
Trying model Decision Tree       
Best parameters set found on train set:

{'max_depth': 14}

Grid scores on train set:

0.787 (+/-0.003) for {'max_depth': 1}
0.857 (+/-0.006) for {'max_depth': 2}
0.882 (+/-0.003) for {'max_depth': 3}
0.899 (+/-0.004) for {'max_depth': 4}
0.906 (+/-0.002) for {'max_depth': 5}
0.910 (+/-0.005) for {'max_depth': 6}
0.928 (+/-0.004) for {'max_depth': 7}
0.938 (+/-0.002) for {'max_depth': 8}
0.938 (+/-0.010) for {'max_depth': 9}
0.941 (+/-0.012) for {'max_depth': 10}
0.944 (+/-0.015) for {'max_depth': 11}
0.945 (+/-0.021) for {'max_depth': 12}
0.945 (+/-0.026) for {'max_depth': 13}
0.946 (+/-0.021) for {'max_depth': 14}
0.945 (+/-0.024) for {'max_depth': 15}
0.942 (+/-0.028) for {'max_depth': 16}
0.942 (+/-0.026) for {'max_depth': 17}
0.937 (+/-0.040) for {'max_depth': 18}
0.938 (+/-0.034) for {'max_depth': 19}

Detailed classification report for the best parameter set:

The 