In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
btvote = pd.read_pickle('../data/btvote.pkl')
btvote.head()

Unnamed: 0,party,vote_19001,vote_19002,vote_19003,vote_19004,vote_19005,vote_19006,vote_19007,vote_19008,vote_19009,...,vote_19235,vote_19236,vote_19237,vote_19238,vote_19239,vote_19240,vote_19241,vote_19242,vote_19243,vote_19244
0,CDU,yes,yes,yes,yes,yes,yes,yes,yes,no,...,yes,yes,yes,yes,yes,yes,no,yes,yes,yes
1,SPD,,,,,,,,,,...,yes,yes,yes,yes,yes,yes,no,absence,absence,absence
2,Linke,no,no,no,no,no,no,no,no,yes,...,no,no,no,no,no,no,no,abstain,no,no
3,CDU,yes,yes,yes,yes,yes,yes,yes,yes,no,...,yes,yes,yes,yes,yes,yes,no,yes,yes,absence
4,Linke,absence,absence,absence,absence,absence,absence,absence,absence,absence,...,no,no,no,no,no,no,no,abstain,no,absence


# Split data and encode target variable

In [3]:
# Split dataframe in 'data' and 'target'
btvote_data = btvote.drop('party', axis=1)
btvote_target = btvote['party']

# Encode the target variable
label_encoder = preprocessing.LabelEncoder()
btvote_target = label_encoder.fit_transform(btvote_target)

# Pipeline and GridSearch setup

In the pipeline we include the SimpleImputer with the strategy 'most_frequent' as a result of the experiments in the previous notebook.\
As an estimator, we consider:
- nearest-neighbor
- Decision Tree
- Naive Bayes
- NearestCentroid
- KMeans

In [4]:
from imblearn.pipeline import Pipeline
# normalisation
from sklearn.preprocessing import OneHotEncoder
# imputer
from sklearn.impute import SimpleImputer
# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.cluster import KMeans

# Pipeline
pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)), ('estimator', None)])

# set parameter grid
parameters = [
    {
        'estimator': [KNeighborsClassifier()],
        'estimator__n_neighbors': range(2, 8),
    }, {
        'estimator': [DecisionTreeClassifier()],
        'estimator__max_depth': [3,6],
    }, {
        'estimator': [KMeans()],
        'estimator__n_clusters': [7],
        'estimator__n_init': [15],
    }, {
        'estimator': [NearestCentroid()],
        'estimator__metric': ['euclidean'],
    }, {
        'estimator': [GaussianNB()],
    }
]

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# define the scoring function
# Note: We use 'macro' as average as we want to evaluate the performance of each class equally, regardless of the class size
f1 = make_scorer(f1_score, average='macro')

# Model evaluation

In [7]:
# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__n_neighbors,param_estimator__max_depth,param_estimator__n_clusters,param_estimator__n_init,param_estimator__metric,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.062098,0.005351,0.022003,0.000448,KNeighborsClassifier(),2.0,,,,,...,0.677551,0.660556,0.67876,0.66416,0.743878,0.63663,0.695054,0.675808,0.028871,6
1,0.060795,0.001169,0.021999,0.000772,KNeighborsClassifier(),3.0,,,,,...,0.716298,0.682216,0.633272,0.708248,0.68029,0.71396,0.665393,0.695739,0.030688,4
2,0.0618,0.002749,0.0219,0.000545,KNeighborsClassifier(),4.0,,,,,...,0.661579,0.64502,0.669437,0.669587,0.699621,0.659211,0.681232,0.672698,0.01736,8
3,0.06624,0.005632,0.023151,0.002214,KNeighborsClassifier(),5.0,,,,,...,0.681679,0.661173,0.68736,0.72197,0.744898,0.692279,0.734074,0.697228,0.027202,3
4,0.076944,0.007644,0.02474,0.002207,KNeighborsClassifier(),6.0,,,,,...,0.677921,0.655311,0.687441,0.675824,0.657889,0.689549,0.675406,0.67196,0.014154,9
5,0.064202,0.008906,0.022496,0.001193,KNeighborsClassifier(),7.0,,,,,...,0.649924,0.651852,0.713415,0.68355,0.685123,0.659211,0.698001,0.673475,0.021963,7
6,0.0742,0.006426,0.013996,0.0011,DecisionTreeClassifier(),,3.0,,,,...,0.45553,0.472383,0.457506,0.452103,0.506376,0.476897,0.464894,0.471459,0.018908,10
7,0.078183,0.003465,0.015081,0.003048,DecisionTreeClassifier(),,6.0,,,,...,0.676036,0.667993,0.687499,0.676347,0.675914,0.655532,0.686373,0.682709,0.019031,5
8,0.242243,0.021643,0.023744,0.00138,KMeans(),,,7.0,15.0,,...,0.487275,0.010582,0.090703,0.014652,0.096931,0.312925,0.246753,0.181674,0.145994,11
9,0.063616,0.008602,0.015194,0.002443,NearestCentroid(),,,,,euclidean,...,0.719579,0.721761,0.689852,0.695558,0.699585,0.752337,0.77087,0.717709,0.024862,1
