In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
btvote = pd.read_pickle('../data/btvote.pkl')
btvote.head()

Unnamed: 0,party,vote_19001,vote_19002,vote_19003,vote_19004,vote_19005,vote_19006,vote_19007,vote_19008,vote_19009,...,vote_19235,vote_19236,vote_19237,vote_19238,vote_19239,vote_19240,vote_19241,vote_19242,vote_19243,vote_19244
0,CDU,yes,yes,yes,yes,yes,yes,yes,yes,no,...,yes,yes,yes,yes,yes,yes,no,yes,yes,yes
1,SPD,,,,,,,,,,...,yes,yes,yes,yes,yes,yes,no,absence,absence,absence
2,Linke,no,no,no,no,no,no,no,no,yes,...,no,no,no,no,no,no,no,abstain,no,no
3,CDU,yes,yes,yes,yes,yes,yes,yes,yes,no,...,yes,yes,yes,yes,yes,yes,no,yes,yes,absence
4,Linke,absence,absence,absence,absence,absence,absence,absence,absence,absence,...,no,no,no,no,no,no,no,abstain,no,absence


# Split data and encode target variable

In [3]:
# Split dataframe in 'data' and 'target'
btvote_data = btvote.drop('party', axis=1)
btvote_target = btvote['party']

# Encode the target variable
label_encoder = preprocessing.LabelEncoder()
btvote_target = label_encoder.fit_transform(btvote_target)

# Pipeline and GridSearch setup

In the pipeline we include the SimpleImputer with different strategies as well as the KNNImputer with number of neighbors between 2 and 9.\
For Balancing, we just the RandomOverSampler at the moment. In a later notebook, the Balancing will be evaluated in detail.\
As an estimator, we consider nearest-neighbor, Decision Tree and Naive Bayes. Again, later more models will be analysed in detail. For the moment, this selection of estimators should just ensure well-founded results for the different Imputing methods

In [4]:
from imblearn.pipeline import Pipeline
# normalisation
from sklearn.preprocessing import OneHotEncoder
# imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
# balancing
from imblearn.over_sampling import RandomOverSampler
# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Pipeline
pipeline = Pipeline([('imputer', None), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)), ('balancing', RandomOverSampler()), ('estimator', None)])

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# define the scoring function
# Note: As we use Balancing the micro average will equal the macro average
f1 = make_scorer(f1_score, average='micro')

# NaN handling

### Data Enrichment
Definition of NaN
1. Only actual NaNs are considered as NaNs, 'abstain' and 'absence' both as separate values
2. Consider actual NaNs and 'absence' as NaN, keep 'abstain' as possible value\
    a. encode 'no', 'abstain' and 'yes' using OneHotEncoder in pipeline\
    b. encode 'no', 'abstain' and 'yes' ordinal
3. Consider only 'yes' and 'no' as allowed values

Imputing of missing values (NaN)

- sklearn [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer) with strategies *mean*, *most_frequent* and *constant*
- sklearn [KNNImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html#sklearn.impute.KNNImputer) with different *n_neighbors*

### 1. 'abstain' and 'absence' as separate values

In this case, the SimpleImputer with strategy 'mean' can't be used, because is requires numeric data.
We cannot encode the voting behavior into numeric format as there is no order between 'yes', 'no', 'abstain' and 'absence'

In [None]:
# define parameter grid
parameters = [
    {
        'imputer': [SimpleImputer(strategy='most_frequent'), SimpleImputer(strategy='constant')],
        'estimator': [KNeighborsClassifier(n_neighbors=7), DecisionTreeClassifier(max_depth=5), GaussianNB()],
    }, {
        'imputer': [KNNImputer()],
        'imputer__n_neighbors': range(2,10),
        'estimator': [KNeighborsClassifier(n_neighbors=7), DecisionTreeClassifier(max_depth=5), GaussianNB()],
    }
]

In [76]:
btvote_data_1 = btvote_data.copy()

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data_1, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# pivot the results for better visualization
results['param_imputer'] = results['param_imputer'].astype(str)
results['param_estimator'] = results['param_estimator'].astype(str)
pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')
pivoted_results['Average'] = pivoted_results[['DecisionTreeClassifier(max_depth=5)','GaussianNB()','KNeighborsClassifier(n_neighbors=7)']].mean(axis=1)
display(pivoted_results)

  pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')


Unnamed: 0_level_0,param_estimator,DecisionTreeClassifier(max_depth=5),GaussianNB(),KNeighborsClassifier(n_neighbors=7),Mean
param_imputer,param_imputer__n_neighbors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNImputer(n_neighbors=2),2.0,0.569946,0.681874,0.616126,0.622649
KNNImputer(n_neighbors=2),3.0,0.592288,0.679171,0.62155,0.631003
KNNImputer(n_neighbors=2),4.0,0.60418,0.675171,0.617586,0.632312
KNNImputer(n_neighbors=2),5.0,0.602523,0.679189,0.616126,0.632613
KNNImputer(n_neighbors=2),6.0,0.592,0.679171,0.622937,0.631369
KNNImputer(n_neighbors=2),7.0,0.557027,0.675135,0.61218,0.614781
KNNImputer(n_neighbors=2),8.0,0.586667,0.679171,0.62164,0.629159
KNNImputer(n_neighbors=2),9.0,0.65391,0.677802,0.617532,0.649748
SimpleImputer(strategy='constant'),,0.57845,0.677838,0.622973,0.62642
SimpleImputer(strategy='most_frequent'),,0.651243,0.679171,0.618757,0.649724


We see, that there is no real difference in performance between the imputing methods. If anything, the KNNImputer with n_neighbors between 3 and 4 works a bit better.

### 2. 'absence' is NaN, 'abstain' as allowed values

#### a. encode 'no', 'abstain' and 'yes' using OneHotEncoder in pipeline

The evaluation works the same as for the case before. We just convert all 'absence' values to NaN before executing the grid search.

In [78]:
btvote_data_2 = btvote_data.replace('absence', np.nan)

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data_2, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# pivot the results for better visualization
results['param_imputer'] = results['param_imputer'].astype(str)
results['param_estimator'] = results['param_estimator'].astype(str)
pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')
pivoted_results['Average'] = pivoted_results[['DecisionTreeClassifier(max_depth=5)','GaussianNB()','KNeighborsClassifier(n_neighbors=7)']].mean(axis=1)
display(pivoted_results)

  pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')


Unnamed: 0_level_0,param_estimator,DecisionTreeClassifier(max_depth=5),GaussianNB(),KNeighborsClassifier(n_neighbors=7),Mean
param_imputer,param_imputer__n_neighbors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNImputer(n_neighbors=2),2.0,0.596306,0.634811,0.626847,0.619321
KNNImputer(n_neighbors=2),3.0,0.576757,0.634793,0.622793,0.611447
KNNImputer(n_neighbors=2),4.0,0.581261,0.633459,0.625532,0.613417
KNNImputer(n_neighbors=2),5.0,0.570973,0.633441,0.613351,0.605922
KNNImputer(n_neighbors=2),6.0,0.563964,0.633459,0.618811,0.605411
KNNImputer(n_neighbors=2),7.0,0.586631,0.633459,0.626919,0.61567
KNNImputer(n_neighbors=2),8.0,0.597045,0.632108,0.618757,0.61597
KNNImputer(n_neighbors=2),9.0,0.622973,0.633441,0.617441,0.624619
SimpleImputer(),,0.629892,0.634793,0.626901,0.630529
SimpleImputer(strategy='constant'),,0.563694,0.633459,0.618811,0.605321


We see again, that there is no clear difference in performance between the different imputing methods. Overall, the mean results are a bit lower than in experiment 01. where we included both 'abstain' and 'absence'.

#### b. encode 'no', 'abstain' and 'yes' ordinal

Now we define a new pipeline without the OneHotEncoder. Instead we encode the values manually before executing the grid search: {'no':0, 'abstain':0.5, 'yes':1}

In this case, all imputers listed above can be used, as the voting behavior is in numeric format. That is why we redefine the parameter grid.

In [11]:
# redefine parameter grid
parameters = [
    {
        'imputer': [SimpleImputer(strategy='mean'), SimpleImputer(strategy='most_frequent'), SimpleImputer(strategy='constant')],
        'estimator': [KNeighborsClassifier(n_neighbors=7), DecisionTreeClassifier(max_depth=5), GaussianNB()],
    }, {
        'imputer': [KNNImputer()],
        'imputer__n_neighbors': range(2,10),
        'estimator': [KNeighborsClassifier(n_neighbors=7), DecisionTreeClassifier(max_depth=5), GaussianNB()],
    }
]

In [13]:
# ordinal pipeline without OneHotEncoder
ordinal_pipeline = Pipeline([('imputer', None), ('balancing', RandomOverSampler()), ('estimator', None)])

# transform data: 'abstain' ordinal positioned between yes and no
btvote_data_2b = btvote_data.replace('absence', np.nan)
btvote_data_2b = btvote_data_2b.replace('no', 0)
btvote_data_2b = btvote_data_2b.replace('abstain', 0.5)
btvote_data_2b = btvote_data_2b.replace('yes', 1)

# create the grid search instance
grid_search_estimator = GridSearchCV(ordinal_pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data_2b, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# pivot the results for better visualization
results['param_imputer'] = results['param_imputer'].astype(str)
results['param_estimator'] = results['param_estimator'].astype(str)
pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')
pivoted_results['Average'] = pivoted_results[['DecisionTreeClassifier(max_depth=5)','GaussianNB()','KNeighborsClassifier(n_neighbors=7)']].mean(axis=1)
display(pivoted_results)

  pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')


Unnamed: 0_level_0,param_estimator,DecisionTreeClassifier(max_depth=5),GaussianNB(),KNeighborsClassifier(n_neighbors=7),Average
param_imputer,param_imputer__n_neighbors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNImputer(n_neighbors=2),2.0,0.586396,0.582486,0.724757,0.631213
KNNImputer(n_neighbors=2),3.0,0.570468,0.551676,0.716739,0.612961
KNNImputer(n_neighbors=2),4.0,0.597063,0.585261,0.710108,0.630811
KNNImputer(n_neighbors=2),5.0,0.566685,0.582595,0.703387,0.617556
KNNImputer(n_neighbors=2),6.0,0.542432,0.575928,0.704685,0.607682
KNNImputer(n_neighbors=2),7.0,0.564883,0.571892,0.711514,0.616096
KNNImputer(n_neighbors=2),8.0,0.549081,0.578703,0.708703,0.612162
KNNImputer(n_neighbors=2),9.0,0.56227,0.574649,0.707351,0.614757
SimpleImputer(),,0.577063,0.650901,0.619964,0.615976
SimpleImputer(strategy='constant'),,0.569009,0.528973,0.632198,0.576727


The performance isn't much different to 2a. The SimpleImputer with strategy 'most_frequent' performs relatively good.

### 3. only 'yes' and 'no' are allowed values

Again, all imputers listed above can be used, as the voting behavior is in numeric format.

In [81]:
btvote_data_3 = btvote_data.replace(['absence','abstain'], np.nan)

# create the grid search instance
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring=f1, cv=stratified_10_fold_cv, error_score='raise')

# run the grid search
grid_search_estimator.fit(btvote_data_3, btvote_target)

# results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)

# pivot the results for better visualization
results['param_imputer'] = results['param_imputer'].astype(str)
results['param_estimator'] = results['param_estimator'].astype(str)
pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')
pivoted_results['Average'] = pivoted_results[['DecisionTreeClassifier(max_depth=5)','GaussianNB()','KNeighborsClassifier(n_neighbors=7)']].mean(axis=1)
display(pivoted_results)

  pivoted_results = results.pivot(index=['param_imputer','param_imputer__n_neighbors'], columns='param_estimator', values='mean_test_score')


Unnamed: 0_level_0,param_estimator,DecisionTreeClassifier(max_depth=5),GaussianNB(),KNeighborsClassifier(n_neighbors=7),Average
param_imputer,param_imputer__n_neighbors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNImputer(n_neighbors=6),2.0,0.574324,0.579838,0.622865,0.592342
KNNImputer(n_neighbors=6),3.0,0.626937,0.574468,0.612144,0.604517
KNNImputer(n_neighbors=6),4.0,0.60964,0.575838,0.621459,0.602312
KNNImputer(n_neighbors=6),5.0,0.609279,0.574486,0.62018,0.601315
KNNImputer(n_neighbors=6),6.0,0.644018,0.577171,0.62818,0.616456
KNNImputer(n_neighbors=6),7.0,0.617946,0.57582,0.618721,0.604162
KNNImputer(n_neighbors=6),8.0,0.586901,0.577189,0.620144,0.594745
KNNImputer(n_neighbors=6),9.0,0.593063,0.577207,0.625622,0.598631
SimpleImputer(),,0.544,0.57582,0.618793,0.579538
SimpleImputer(strategy='constant'),,0.643045,0.577171,0.629459,0.616559


There is no clear difference in performance between the different imputing methods, but the average results are even lower than in 2a and 2b.

# Conclusion
We've seen that the performance of the classification doesn't really differ for the imputing methods from sklearn. The KNNImputer gave a different ranking between the n_neighbors considered when reexecuting the grid search. The SimpleImputer with the strategy 'most_frequent' seemed the most stable for several reexecutions. So from now on we will use this as our imputer.

We simultaniously evaluated different input formats. We got a clear result, that using 'abstain' and 'absence' as possible values for the input data generally leads to a higher F1-Score. Thus, we will use this input format in the future experiments. 

# Archive
GroupImputer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a custom Imputer that imputes based on the party
class GroupImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='mean'):
        self.strategy = strategy

    def fit(self, X, y):
        # store the y series to find the correct index in the transform function
        self.y = y
        # self.group_values becomes an attribute that is the map of group_specific values
        if self.strategy == 'mode':
            self.group_values = X.groupby(self.y).agg(lambda x: pd.Series.mode(x)[0])
        else:
            self.group_values = X.groupby(self.y).mean()
        return self

    def transform(self, X):
        for column in X.columns:
            if X[column].isna().any():
                # apply to all columns that contain any NaN values:
                # if the current cell in the DataFrame is a NaN value, return the group-specific value for that cell
                # othwerwise return the original value
                X[column] = X.apply(
                    lambda row: self.group_values.loc[self.y[row.name],column] if np.isnan(row[column])
                                   else row[column], axis=1)
        return X.to_numpy()