In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
btvote = pd.read_pickle('../data/btvote_encoded.pkl')
btvote.head()

Unnamed: 0,party,vote_19001,vote_19002,vote_19003,vote_19004,vote_19005,vote_19006,vote_19007,vote_19008,vote_19009,...,vote_19235,vote_19236,vote_19237,vote_19238,vote_19239,vote_19240,vote_19241,vote_19242,vote_19243,vote_19244
0,CDU/CSU,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
1,SPD,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,,,
2,Linke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
3,CDU/CSU,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,
4,Linke,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,


# Split data and encode target variable

In [3]:
# Split dataframe in 'data' and 'target'
btvote_data = btvote.drop('party', axis=1)
btvote_target = btvote['party']

# Encode the target variable
label_encoder = preprocessing.LabelEncoder()
btvote_target = label_encoder.fit_transform(btvote_target)

# Pipeline and GridSearch setup

In the pipeline we include the SimpleImputer with the strategy 'most_frequent' as a result of the experiments in the previous notebook.\
As an estimator, we consider:
- k-NearestNeighbor
- NearestCentroid
- Decision Tree
- Naive Bayes

In [4]:
from sklearn.pipeline import Pipeline
# imputer
from sklearn.impute import KNNImputer
# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# create Pipeline
pipeline = Pipeline([('imputer', KNNImputer(n_neighbors=1)), ('estimator', None)])

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# define the scoring function
# Note: We use 'macro' as average as we want to evaluate the performance of each class equally, regardless of the class size
f1 = make_scorer(f1_score, average='macro')

# k-NN vs. Nearest Centroid

In [28]:
# set parameter grid
parameters = [
    {
        'estimator': [KNeighborsClassifier()],
        'estimator__n_neighbors': range(2, 10),
        'estimator__weights': ['uniform', 'distance'],
        'estimator__p': [1, 2]  # For Minkowski distance (1 for Manhattan, 2 for Euclidean)'
    }, {
        'estimator': [NearestCentroid()],
        'estimator__metric': ['manhattan', 'euclidean']
    }
]

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['classifier'] = results['param_estimator'].astype(str)
results['n_neighbors'] = results['param_estimator__n_neighbors'].astype(str)
results['weights'] = results['param_estimator__weights'].astype(str)
results.loc[results['classifier']=='KNeighborsClassifier()', 'metric'] = results['param_estimator__p'].replace({1:'manhattan', 2:'euclidean'})
results.loc[results['classifier']=='NearestCentroid()', 'metric'] = results['param_estimator__metric']
results['ranking'] = results['rank_test_score'].astype(int)
pivoted_results = results.pivot(index=['classifier','n_neighbors','weights'], columns='metric', values=['mean_test_score', 'ranking'])
pivoted_results['ranking'] = pivoted_results['ranking'].astype(int)
display(pivoted_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean_test_score,mean_test_score,ranking,ranking
Unnamed: 0_level_1,Unnamed: 1_level_1,metric,euclidean,manhattan,euclidean,manhattan
classifier,n_neighbors,weights,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
KNeighborsClassifier(),2.0,distance,0.846567,0.846942,30,29
KNeighborsClassifier(),2.0,uniform,0.828451,0.830673,34,33
KNeighborsClassifier(),3.0,distance,0.860202,0.862164,21,17
KNeighborsClassifier(),3.0,uniform,0.860202,0.861208,21,20
KNeighborsClassifier(),4.0,distance,0.856111,0.858738,25,24
KNeighborsClassifier(),4.0,uniform,0.838223,0.838472,32,31
KNeighborsClassifier(),5.0,distance,0.872565,0.869718,1,5
KNeighborsClassifier(),5.0,uniform,0.870608,0.867872,4,6
KNeighborsClassifier(),6.0,distance,0.872324,0.872436,3,2
KNeighborsClassifier(),6.0,uniform,0.86736,0.865852,7,8


The F1-score is the highest for the k-NN with n_neighbors between 5 and 6. We will further analyse this special case. The NearestCentroid results lie in somewhere in between all k-NN results, so we will neglect the NearestCentroid classifier from now on. 

In [29]:
# set parameter grid
parameters = {
    'estimator': [KNeighborsClassifier()],
    'estimator__n_neighbors': range(5, 7),
    'estimator__weights': ['uniform', 'distance'],
    'estimator__p': [1, 2]  # For Minkowski distance (1 for Manhattan, 2 for Euclidean)
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['n_neighbors'] = results['param_estimator__n_neighbors'].astype(str)
results['weights'] = results['param_estimator__weights'].astype(str)
results['metric'] = results['param_estimator__p'].replace({1:'manhattan', 2:'euclidean'})
pivoted_results = results.pivot(index=['n_neighbors','weights'], columns='metric', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc[('Average',''),:] = pivoted_results.mean()
display(pivoted_results)

Unnamed: 0_level_0,metric,euclidean,manhattan,Average
n_neighbors,weights,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,distance,0.872565,0.869718,0.871142
5,uniform,0.870608,0.867872,0.86924
6,distance,0.872324,0.872436,0.87238
6,uniform,0.86736,0.865852,0.866606
Average,,0.870714,0.868969,0.869842


The results for the k-NN Classifier are better when using distance-based weights. Also, the euclidean distance performs at lest as good as the manhatten distance in all cases. For that specific case we will do a nested cross validation for the two options of n_neighbors to receive more validated results:

In [30]:
from sklearn.model_selection import cross_val_score

# We use stratified 10 fold cross validation for the inner and the outer fold.
print('Nested Cross Validation Mean scores:')

# 5-NN
pipeline.set_params(estimator=KNeighborsClassifier(n_neighbors=5))
print('- 5-NN: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# 6-NN
pipeline.set_params(estimator=KNeighborsClassifier(n_neighbors=6))
print('- 6-NN: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

Nested Cross Validation Mean scores:
- 5-NN: 0.8706082253630099
- 6-NN: 0.8673598388248982


The 5-NN claassifier seems to perform slightly better than the one with n_neighbors=6.

# DecisionTree analysis

In [7]:
# set parameter grid
parameters = {
    'estimator': [DecisionTreeClassifier()],
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': [None, 5, 10, 15],
    'estimator__min_samples_split': [2, 5, 10],
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['criterion'] = results['param_estimator__criterion'].astype(str)
results['max_depth'] = results['param_estimator__max_depth'].astype(str)
results['min_samples_split'] = results['param_estimator__min_samples_split']
pivoted_results = results.pivot(index=['criterion','max_depth'], columns='min_samples_split', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc[('Average',''),:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index=['criterion','max_depth'], columns='min_samples_split', values='mean_test_score')


Unnamed: 0_level_0,min_samples_split,2,5,10,Average
criterion,max_depth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
entropy,10.0,0.84072,0.847304,0.83834,0.842121
entropy,15.0,0.84959,0.846925,0.848481,0.848332
entropy,5.0,0.818769,0.822421,0.830701,0.823964
entropy,,0.848427,0.843824,0.839804,0.844018
gini,10.0,0.838776,0.833981,0.840979,0.837912
gini,15.0,0.855983,0.852236,0.844397,0.850872
gini,5.0,0.834397,0.838238,0.831217,0.834617
gini,,0.851603,0.851667,0.851221,0.851497
Average,,0.842283,0.842075,0.840643,0.841667


Observations:
- max_depth: choosing a max_depth of 5 works rather badly. The results of all other approaches are similar
- min_samples_split: On average, there are no real differences, 10 works slightly less good.
- criterion: For a larger max_depth, the top values can be achieved for the gini index.

We choose criterion='gini' (default), max_depth=15 and min_samples_split=2 (default) as the best parameter setting for a DecisionTreeClassifier.

# NearestCentroid vs. DecisionTree vs. NaiveBayes

In [32]:
# set parameter grid
parameters = {
    'estimator': [KNeighborsClassifier(n_neighbors=5),
                  DecisionTreeClassifier(max_depth=15),
                  GaussianNB()
                  ]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results[['param_estimator','mean_test_score','std_test_score','rank_test_score']])

Unnamed: 0,param_estimator,mean_test_score,std_test_score,rank_test_score
0,KNeighborsClassifier(),0.870608,0.025072,1
1,DecisionTreeClassifier(max_depth=15),0.855539,0.034747,2
2,GaussianNB(),0.84453,0.020799,3


The Grid Search cross validation shows a slight advantage for the 5-NN classifier, while the DecisionTreeClassifier and the NaiveBayes approach are second and third.
We will validate the results by performing a *nested* cross validation for each of the appraoches. 

In [33]:
# We use stratified 10 fold cross validation for the inner and the outer fold.
print('Nested Cross Validation Mean scores:')

# NearestCentroid
pipeline.set_params(estimator=KNeighborsClassifier(n_neighbors=5))
print('- 5-NN: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# NearestCentroid
pipeline.set_params(estimator=DecisionTreeClassifier(max_depth=15))
print('- DecisionTree: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# NearestCentroid
pipeline.set_params(estimator=GaussianNB())
print('- NaiveBayes: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

Nested Cross Validation Mean scores:
- 5-NN: 0.8706082253630099
- DecisionTree: 0.8499036019245818
- NaiveBayes: 0.8445295234345498


We see our results from before validated.

# Summary

Until now, we evaluated:
- k-NearestNeighbor
- NearestCentroid
- Decision Tree
- Naive Bayes

We've seen that out of these four model families, the k-NN Classifier with n_neighbors=5 performs the best with a F1-score of approx. 0.87

# Extended model evaluation

We will now compare the 5-NN Classifier to some more complex approaches, that aren't dealt with in detail in the lecture slides or the exercises.

## Neural network: Multi-Layer Perceptron

In [42]:
from sklearn.neural_network import MLPClassifier

# set parameter grid
parameters = {
    'estimator': [MLPClassifier()],
    'estimator__hidden_layer_sizes': [(50), (50, 50), (100, 50, 25), (100, 50, 25, 10)], # Specify the architecture of hidden layers
    'estimator__alpha': [0.0001, 0.001, 0.01], # L2 regularization term
    'estimator__max_iter': [300], # Increase number of gradient steps so that all combinations convert
    'estimator__random_state': [42],  # Random seed for reproducibility
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['hidden_layer_sizes'] = results['param_estimator__hidden_layer_sizes'].astype(str)
results['alpha'] = results['param_estimator__alpha']
pivoted_results = results.pivot(index='hidden_layer_sizes', columns='alpha', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc['Average',:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index='hidden_layer_sizes', columns='alpha', values='mean_test_score')


alpha,0.0001,0.001,0.01,Average
hidden_layer_sizes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(100, 50, 25)",0.853177,0.854997,0.855232,0.854469
"(100, 50, 25, 10)",0.844302,0.837158,0.838918,0.840126
"(50, 50)",0.840825,0.835053,0.841822,0.839233
50,0.855353,0.86954,0.864456,0.863116
Average,0.848414,0.849187,0.850107,0.849236


We see that using three (or generally more) hidden layers doesn't lead to a higher F1-Score. The best results can be achieved for a single hidden layer of size 50. In that case, the alpha parameter 0.001 produces the best results with a score of approx. 0.869.

## RandomForestClassifier

At first, we analyse the criterion and how the number of trees affects the result:

In [38]:
from sklearn.ensemble import RandomForestClassifier

# set parameter grid
parameters = {
    'estimator': [RandomForestClassifier()],
    'estimator__criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'estimator__n_estimators': [10, 50, 100, 150, 200],  # Number of trees in the forest
    'estimator__random_state': [42]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['criterion'] = results['param_estimator__criterion'].astype(str)
results['n_estimators'] = results['param_estimator__n_estimators']
pivoted_results = results.pivot(index='criterion', columns='n_estimators', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc['Average',:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index='criterion', columns='n_estimators', values='mean_test_score')


n_estimators,10,50,100,150,200,Average
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
entropy,0.861362,0.859515,0.858778,0.864078,0.864681,0.861683
gini,0.858932,0.867344,0.862367,0.863366,0.861971,0.862796
Average,0.860147,0.86343,0.860572,0.863722,0.863326,0.862239


We see that neither the criterion nor the n_estimators effectively changes the result. According to Occam's Razor we will take the combination ('gini', 50) as it produces the best result and is relatively simple. Now, we will use this pair to evaluate the effect of max_depth and min_samples_split.

In [39]:
# set parameter grid
parameters = {
    'estimator': [RandomForestClassifier()],
    'estimator__criterion': ['gini'],
    'estimator__n_estimators': [50],
    'estimator__max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'estimator__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'estimator__random_state': [42]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# transform the results for better visualization
results['max_depth'] = results['param_estimator__max_depth']
results['min_samples_split'] = results['param_estimator__min_samples_split']
pivoted_results = results.pivot(index='max_depth', columns='min_samples_split', values='mean_test_score')
pivoted_results['Average'] = pivoted_results.mean(axis=1)
pivoted_results.loc['Average',:] = pivoted_results.mean()
display(pivoted_results)

  pivoted_results = results.pivot(index='max_depth', columns='min_samples_split', values='mean_test_score')


min_samples_split,2,5,10,Average
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.867344,0.867731,0.864299,0.866458
10.0,0.854969,0.856061,0.852554,0.854528
20.0,0.861069,0.860036,0.856868,0.859324
30.0,0.864439,0.863327,0.863582,0.863782
Average,0.861955,0.861789,0.859326,0.861023


The results don't differ much for different min_samples_split values, so we will keep the default value 2. A larger max_depth seams to increase the F1-score, but we will further analyse this now.

In [41]:
# set parameter grid
parameters = {
    'estimator': [RandomForestClassifier()],
    'estimator__criterion': ['gini'],
    'estimator__n_estimators': [50],
    'estimator__max_depth': [10, 20, 30, 40, 50, 60, None],  # Maximum depth of the trees
    'estimator__random_state': [42]
}

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results[['param_estimator__max_depth','mean_test_score','std_test_score','rank_test_score']])

Unnamed: 0,param_estimator__max_depth,mean_test_score,std_test_score,rank_test_score
0,10.0,0.854969,0.033631,7
1,20.0,0.861069,0.035544,6
2,30.0,0.864439,0.031431,5
3,40.0,0.867873,0.034423,1
4,50.0,0.867344,0.034928,2
5,60.0,0.867344,0.034928,2
6,,0.867344,0.034928,2


Above a max_depth of 40 the result cannot be improved any more. So, we will take max_depth=4 as the best parameter.

## Summary

We finally perform a nested cross validation for MLP and RandomForestClassifier to compare it to the best k-NN Classifier.

In [43]:
# We use stratified 10 fold cross validation for the inner and the outer fold.
print('Nested Cross Validation Mean scores:')

# 5-NN
pipeline.set_params(estimator=KNeighborsClassifier(n_neighbors=5))
print('- 5-NN: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# MLPClassifier
pipeline.set_params(estimator=MLPClassifier(hidden_layer_sizes=(50), alpha=0.001, max_iter=300, random_state=42))
print('- MLPClassifier: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

# RandomForestClassifier
pipeline.set_params(estimator=RandomForestClassifier(criterion='gini', n_estimators=50, max_depth=40, random_state=42))
print('- RandomForestClassifier: {}'.format(cross_val_score(pipeline, btvote_data, btvote_target, cv=stratified_10_fold_cv, scoring=f1).mean()))

Nested Cross Validation Mean scores:
- 5-NN: 0.8706082253630099
- MLPClassifier: 0.8695400521875598
- RandomForestClassifier: 0.8678727630920872


The 5-NN Classifier achieves the highest F1-Score with 0.87. Close behind we have the MLPClassifier and lastly the RandomForestClassifier.