In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
btvote = pd.read_pickle('../data/btvote.pkl')
btvote.head()

Unnamed: 0,party,vote_19001,vote_19002,vote_19003,vote_19004,vote_19005,vote_19006,vote_19007,vote_19008,vote_19009,...,vote_19235,vote_19236,vote_19237,vote_19238,vote_19239,vote_19240,vote_19241,vote_19242,vote_19243,vote_19244
0,CDU,yes,yes,yes,yes,yes,yes,yes,yes,no,...,yes,yes,yes,yes,yes,yes,no,yes,yes,yes
1,SPD,,,,,,,,,,...,yes,yes,yes,yes,yes,yes,no,absence,absence,absence
2,Linke,no,no,no,no,no,no,no,no,yes,...,no,no,no,no,no,no,no,abstain,no,no
3,CDU,yes,yes,yes,yes,yes,yes,yes,yes,no,...,yes,yes,yes,yes,yes,yes,no,yes,yes,absence
4,Linke,absence,absence,absence,absence,absence,absence,absence,absence,absence,...,no,no,no,no,no,no,no,abstain,no,absence


# Split data and encode target variable

In [3]:
# Split dataframe in 'data' and 'target'
btvote_data = btvote.drop('party', axis=1)
btvote_target = btvote['party']

# Encode the target variable
label_encoder = preprocessing.LabelEncoder()
btvote_target = label_encoder.fit_transform(btvote_target)

# Pipeline and GridSearch setup

In the pipeline we include the SimpleImputer with the strategy 'most_frequent' as a result of the experiments in the previous notebook.\
As an estimator, we consider:
- k-NearestNeighbor
- NearestCentroid
- Decision Tree
- Naive Bayes

In [4]:
from imblearn.pipeline import Pipeline
# normalisation
from sklearn.preprocessing import OneHotEncoder
# imputer
from sklearn.impute import SimpleImputer
# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# create Pipeline
pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)), ('estimator', None)])

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# define the scoring function
# Note: We use 'macro' as average as we want to evaluate the performance of each class equally, regardless of the class size
f1 = make_scorer(f1_score, average='macro')

# k-NN vs. Nearest Centroid

In [37]:
# set parameter grid
parameters = [
    {
        'estimator': [KNeighborsClassifier()],
        'estimator__n_neighbors': range(2, 10),
        'estimator__weights': ['uniform', 'distance'],
        'estimator__p': [1, 2]  # For Minkowski distance (1 for Manhattan, 2 for Euclidean)'
    }, {
        'estimator': [NearestCentroid()],
        'estimator__metric': ['manhattan', 'euclidean']
    }
]

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__n_neighbors,param_estimator__p,param_estimator__weights,param_estimator__metric,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.07178,0.007276,0.038879,0.004747,KNeighborsClassifier(),2.0,1.0,uniform,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.677551,0.660556,0.67876,0.66416,0.743878,0.63663,0.695054,0.675808,0.028871,21
1,0.090343,0.0093,0.040789,0.002175,KNeighborsClassifier(),2.0,1.0,distance,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.696179,0.668992,0.725647,0.699891,0.694843,0.666064,0.711872,0.696597,0.021862,5
2,0.070742,0.007214,0.024451,0.002589,KNeighborsClassifier(),2.0,2.0,uniform,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.677551,0.660556,0.67876,0.66416,0.743878,0.63663,0.695054,0.675808,0.028871,21
3,0.089886,0.008203,0.023809,0.002067,KNeighborsClassifier(),2.0,2.0,distance,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.696179,0.668992,0.725647,0.699891,0.694843,0.666064,0.711872,0.696597,0.021862,5
4,0.074293,0.008953,0.03719,0.002213,KNeighborsClassifier(),3.0,1.0,uniform,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.716298,0.682216,0.633272,0.708248,0.68029,0.71396,0.665393,0.695739,0.030688,8
5,0.089113,0.010044,0.040131,0.002359,KNeighborsClassifier(),3.0,1.0,distance,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.711048,0.682458,0.626863,0.712058,0.711872,0.701936,0.640655,0.692635,0.033384,11
6,0.074971,0.013734,0.024108,0.001982,KNeighborsClassifier(),3.0,2.0,uniform,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.716298,0.682216,0.633272,0.708248,0.68029,0.71396,0.665393,0.695739,0.030688,8
7,0.090523,0.008418,0.023875,0.002983,KNeighborsClassifier(),3.0,2.0,distance,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.711048,0.682458,0.626863,0.712058,0.711872,0.701936,0.640655,0.692635,0.033384,11
8,0.078557,0.006481,0.041842,0.003526,KNeighborsClassifier(),4.0,1.0,uniform,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.661579,0.64502,0.669437,0.669587,0.699621,0.659211,0.681232,0.672698,0.01736,29
9,0.08988,0.004968,0.04093,0.002572,KNeighborsClassifier(),4.0,1.0,distance,,"{'estimator': KNeighborsClassifier(), 'estimat...",...,0.709447,0.672381,0.656436,0.69011,0.700886,0.671177,0.702138,0.68791,0.015785,13


From this table it is clearly visible that NearestCentroid works better than NearestNeighbor. Out of 34 combinations, the two combinations with NearestCentroid() are the two best. The mean test score is on average 0.02-0.03 better. We see, that the mean test score for NearestCentroid is better when using euclidean distance rather than manhatten distance. 

It makes sense that NearestCentroid performs better than k-NN as our dataset includes some label noise. In that case k-NN loses performance while Nearest Centroid is stable.

From now on, NearestNeighbor will be neglected. The NearestCentroid approach with distance euclidean will be considered in later experiments.

# DecisionTree analysis

In [34]:
# set parameter grid
parameters = {
    'estimator': [DecisionTreeClassifier()],
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': [None, 5, 10, 15],
    'estimator__min_samples_split': [2, 5, 10],
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['criterion'] = results['param_estimator__criterion'].astype(str)
results['max_depth'] = results['param_estimator__max_depth'].astype(str)
results['min_samples_split'] = results['param_estimator__min_samples_split']
pivoted_results = results.pivot(index=['criterion','max_depth'], columns='min_samples_split', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc[('Average',''),:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index=['criterion','max_depth'], columns='min_samples_split', values='mean_test_score')


Unnamed: 0_level_0,min_samples_split,2,5,10,Average
criterion,max_depth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
entropy,10.0,0.70196,0.695841,0.704829,0.700877
entropy,15.0,0.689195,0.691164,0.69262,0.690993
entropy,5.0,0.681942,0.686822,0.680722,0.683162
entropy,,0.698142,0.69453,0.701588,0.698087
gini,10.0,0.696995,0.692721,0.687541,0.692419
gini,15.0,0.711291,0.714955,0.69676,0.707669
gini,5.0,0.66704,0.669602,0.658522,0.665054
gini,,0.702608,0.701699,0.694282,0.69953
Average,,0.693647,0.693417,0.689608,0.692224


Observations:
- max_depth: choosing a max_depth of 5 works rather badly. The results of all other approaches are similar
- min_samples_split: On average, there are no real differences, 10 works slightly less good.
- criterion: the gini index is more prone to smaller max_depth. For larger max_depth the average is similar, the top values can be achieved for the gini index.

We choose criterion='gini', max_depth=15 and min_samples_split=5 as the best parameter setting for a DecisionTreeClassifier.

# NearestCentroid vs. DecisionTree vs. NaiveBayes

In [10]:
# set parameter grid
parameters = {
    'estimator': [NearestCentroid(metric='euclidean'),
                  DecisionTreeClassifier(criterion='gini', max_depth=15, min_samples_split=5),
                  GaussianNB()
                  ]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results[['param_estimator','mean_test_score','std_test_score','rank_test_score']])

Unnamed: 0,param_estimator,mean_test_score,std_test_score,rank_test_score
0,NearestCentroid(),0.717709,0.024862,1
1,"DecisionTreeClassifier(max_depth=15, min_sampl...",0.706027,0.023926,2
2,GaussianNB(),0.705059,0.036818,3


The Grid Search cross validation shows a slight advantage for the NearestCentroid estimator, while the DecisionTreeClassifier and the NaiveBayes approach are equally good.
We will validate the results by performing a *nested* cross validation for each of the appraoches. 

In [11]:
from sklearn.model_selection import cross_val_score

# We use stratified 10 fold cross validation for the inner and the outer fold.
print('Nested Cross Validation Mean scores:')

# NearestCentroid
pipeline.set_params(estimator=NearestCentroid(metric='euclidean'))
print('- NearestCentroid: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# NearestCentroid
pipeline.set_params(estimator=DecisionTreeClassifier(criterion='gini', max_depth=15, min_samples_split=5))
print('- DecisionTree: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# NearestCentroid
pipeline.set_params(estimator=GaussianNB())
print('- NaiveBayes: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

Nested Cross Validation Mean scores:
- NearestCentroid: 0.7177086380839047
- DecisionTree: 0.705414996108032
- NaiveBayes: 0.7050591514452207


We see our results from before validated.

# Summary

Until now, we evaluated:
- k-NearestNeighbor
- NearestCentroid
- Decision Tree
- Naive Bayes

We've seen that out of these four model families, the model NearestCentroid with the metric 'euclidean' performs the best with a F1-score of approx. 0.7177

# Extended model evaluation

We will now compare the NearestCentroid to some more complex approaches, that aren't dealt with in detail in the lecture slides or the exercises.

## Neural network: Multi-Layer Perceptron

In [22]:
from sklearn.neural_network import MLPClassifier

# set parameter grid
parameters = {
    'estimator': [MLPClassifier()],
    'estimator__hidden_layer_sizes': [(50,), (50, 50), (100, 50, 25),], # Specify the architecture of hidden layers
    'estimator__alpha': [0.0001, 0.001, 0.01], # L2 regularization term
    'estimator__random_state': [42],  # Random seed for reproducibility
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['hidden_layer_sizes'] = results['param_estimator__hidden_layer_sizes'].astype(str)
results['alpha'] = results['param_estimator__alpha']
pivoted_results = results.pivot(index='hidden_layer_sizes', columns='alpha', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc['Average',:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index='hidden_layer_sizes', columns='alpha', values='mean_test_score')


alpha,0.0001,0.001,0.01,Average
hidden_layer_sizes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(100, 50, 25)",0.727441,0.730985,0.727006,0.728477
"(50, 50)",0.722445,0.727712,0.72219,0.724116
"(50,)",0.72601,0.724334,0.721371,0.723905
Average,0.725299,0.727677,0.723522,0.725499


We see that using three (or generally more) hidden layers leeds to a higher F1-Score. We now try some larger and wider hidden layers and increase the maximum iterations, as we've occured some warnings, that the maximum number of iterations was reached without convergence. The alpha parameter doesn't effectively change the results, so we will neglect it in the next experiment and just use the default value. 

In [24]:
# set parameter grid
parameters = {
    'estimator': [MLPClassifier()],
    'estimator__hidden_layer_sizes': [(100, 50, 25),(100, 50, 25, 10),(150, 100, 50, 25, 10)], # Specify the architecture of hidden layers
    'estimator__max_iter': [200,300], # Maximum number of gradient steps.
    'estimator__random_state': [42],  # Random seed for reproducibility
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['hidden_layer_sizes'] = results['param_estimator__hidden_layer_sizes'].astype(str)
results['max_iter'] = results['param_estimator__max_iter']
pivoted_results = results.pivot(index='hidden_layer_sizes', columns='max_iter', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
display(pivoted_results)

max_iter,200,300,Average
hidden_layer_sizes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(100, 50, 25)",0.727441,0.727441,0.727441
"(100, 50, 25, 10)",0.743029,0.743029,0.743029
"(150, 100, 50, 25, 10)",0.73289,0.73289,0.73289


We see that a MLP with four hidden layers results in the highest F1-score. In that case, increasing the maximum number of iterations is neither needed for convergence nor beneficial for the overall result.

## RandomForestClassifier

At first, we analyse the criterion and how the number of trees affects the result:

In [33]:
from sklearn.ensemble import RandomForestClassifier

# set parameter grid
parameters = {
    'estimator': [RandomForestClassifier()],
    'estimator__criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'estimator__n_estimators': [10, 50, 100, 150, 200],  # Number of trees in the forest
    'estimator__random_state': [42]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['criterion'] = results['param_estimator__criterion'].astype(str)
results['n_estimators'] = results['param_estimator__n_estimators']
pivoted_results = results.pivot(index='criterion', columns='n_estimators', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc['Average',:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index='criterion', columns='n_estimators', values='mean_test_score')


n_estimators,10,50,100,150,200,Average
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
entropy,0.713631,0.715985,0.720828,0.720625,0.725383,0.71929
gini,0.715285,0.721629,0.713958,0.716208,0.721429,0.717702
Average,0.714458,0.718807,0.717393,0.718417,0.723406,0.718496


The criterion 'entropy' works slightly better than 'gini'. Increasing the n_estimators above 100 (which is the default value) doesn't significantly improve the F1-score. So, we will use the pair ('entropy',100) to evaluate the effect of max_depth and min_samples_split.

In [34]:
# set parameter grid
parameters = {
    'estimator': [RandomForestClassifier()],
    'estimator__criterion': ['entropy'],
    'estimator__max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'estimator__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'estimator__random_state': [42]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['max_depth'] = results['param_estimator__max_depth']
results['min_samples_split'] = results['param_estimator__min_samples_split']
pivoted_results = results.pivot(index='max_depth', columns='min_samples_split', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc['Average',:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index='max_depth', columns='min_samples_split', values='mean_test_score')


min_samples_split,2,5,10,Average
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.720828,0.715816,0.723665,0.720103
10.0,0.726481,0.71797,0.719913,0.721455
20.0,0.732263,0.727962,0.728672,0.729632
30.0,0.739289,0.729779,0.730795,0.733287
Average,0.729715,0.722882,0.725761,0.726119


The results don't differ much for different min_samples_split values, so we will keep the default value 2. A larger max_depth seams to increase the F1-score to a certain extent, but if the depth is to large (max_depth=None) the results are worse again. We will further analyse this now.

In [35]:
# set parameter grid
parameters = {
    'estimator': [RandomForestClassifier()],
    'estimator__criterion': ['entropy'],
    'estimator__max_depth': [10, 20, 30, 40, 50, 60, None],  # Maximum depth of the trees
    'estimator__random_state': [42]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results[['param_estimator__max_depth','mean_test_score','std_test_score','rank_test_score']])

Unnamed: 0,param_estimator__max_depth,mean_test_score,std_test_score,rank_test_score
0,10.0,0.726481,0.026108,4
1,20.0,0.732263,0.025497,2
2,30.0,0.739289,0.019331,1
3,40.0,0.731884,0.011489,3
4,50.0,0.7239,0.017138,5
5,60.0,0.721387,0.014123,6
6,,0.720828,0.014265,7


The maximum test score is achieved for max_depth=30.

## Summary

We finally perform a nested cross validation for MLP and RandomForestClassifier to compare it to NearestCentroid.

In [36]:
# We use stratified 10 fold cross validation for the inner and the outer fold.
print('Nested Cross Validation Mean scores:')

# NearestCentroid
pipeline.set_params(estimator=NearestCentroid(metric='euclidean'))
print('- NearestCentroid: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# MLPClassifier
pipeline.set_params(estimator=MLPClassifier(hidden_layer_sizes=(100,50,25,10), random_state=42))
print('- MLPClassifier: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# RandomForestClassifier
pipeline.set_params(estimator=RandomForestClassifier(criterion='entropy', max_depth=30, random_state=42))
print('- RandomForestClassifier: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

Nested Cross Validation Mean scores:
- NearestCentroid: 0.7177086380839047
- MLPClassifier: 0.7430289957523376
- RandomForestClassifier: 0.7392888391424595


The MLP with hidden layers (100, 50, 25, 10) can achieve the highest F1-Score with 0.74. Close behind we have the RandomForestClassifier and lastly the Nearest Centroid.