In [1]:
# mount data from google drive
from google.colab import drive
drive.mount('/content/drive')
%cd 'drive/MyDrive/VoteBrain'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/VoteBrain


In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [3]:
# Load the voting table from a pickle file (path is relative to the current working directory).
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
btvote = pd.read_pickle('data/btvote.pkl')
# Show the first rows as the cell output.
btvote.head()

Unnamed: 0,party,vote_19001,vote_19002,vote_19003,vote_19004,vote_19005,vote_19006,vote_19007,vote_19008,vote_19009,...,vote_19235,vote_19236,vote_19237,vote_19238,vote_19239,vote_19240,vote_19241,vote_19242,vote_19243,vote_19244
0,CDU,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
1,SPD,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,,,
2,Linke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
3,CDU,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,
4,Linke,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,


# NaN handling

### Delete full NaN groups
Drop every vote (column) for which all members of at least one party have no recorded value (NaN).

In [4]:
# Walk over the parties one by one and remove every column in which that
# party has no value at all. Columns removed for one party are already gone
# when the next party is inspected.
unique_groups = btvote['party'].unique()
for group_value in unique_groups:
    # rows that belong to the current party
    group_df = btvote.loc[btvote['party'] == group_value]
    # columns whose entries are NaN for every row of this party
    is_all_nan = group_df.isna().all(axis=0)
    nan_columns = group_df.columns[is_all_nan]
    # log the removed columns, space separated, on one running line
    print(''.join(f'{column} ' for column in nan_columns), end='')
    btvote = btvote.drop(columns=nan_columns)

vote_19042 vote_19047 vote_19048 vote_19051 vote_19054 vote_19060 vote_19072 vote_19080 vote_19097 vote_19106 vote_19132 vote_19151 vote_19152 vote_19162 vote_19194 vote_19207 vote_19219 vote_19061 vote_19077 vote_19095 vote_19100 vote_19103 vote_19200 vote_19221 vote_19224 vote_19228 vote_19079 vote_19085 vote_19123 vote_19143 vote_19144 vote_19211 vote_19182 

### Data Enrichment
Imputation of missing values (NaN). The following imputers are compared:

- sklearn [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer) with strategies *mean* and *mode/most_frequent*
- sklearn [KNNImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html#sklearn.impute.KNNImputer) with different *n_neighbors*
- **custom** sklearn-compatible `GroupImputer` (defined in the next cell) with strategies *mean* and *mode*

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a custom Imputer that imputes based on the party
class GroupImputer(BaseEstimator, TransformerMixin):
    """Impute NaN values with a per-group (here: per-party) column statistic.

    During ``fit`` the group label of every training row is remembered under
    that row's index label, together with one fill value per (group, column)
    and one global fill value per column.

    During ``transform`` a missing cell is filled with

    1. the value of the row's group, if the row's index label was seen in
       ``fit`` and that group has a value for the column, otherwise
    2. the global value of the column computed over all training rows.

    Rows that were not part of ``fit`` (e.g. a validation fold) therefore never
    use their own target label. Cells stay NaN only if the column had no value
    at all during ``fit``.

    Rows are matched by index label. This assumes that unseen data does not
    reuse index labels of the training data, which holds when all subsets are
    sliced from one DataFrame (as cross-validation does).

    Parameters
    ----------
    strategy : str, default='mean'
        ``'mode'`` fills with the most frequent value (the smallest one on
        ties); any other value fills with the mean.
    """

    def __init__(self, strategy='mean'):
        self.strategy = strategy

    @staticmethod
    def _first_mode(values):
        """Return the first mode of a Series, or NaN if it has no non-NaN value."""
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    def fit(self, X, y):
        """Learn the group-specific and global fill values.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : array-like of length ``len(X)``
            Group label of every row of ``X``, aligned by position.

        Returns
        -------
        self
        """
        group_labels = np.asarray(y)
        # remember the group of every training row under its index label, so
        # transform can look it up by label instead of by position
        self.y = pd.Series(group_labels, index=X.index)
        # self.group_values maps (group, column) -> fill value,
        # self.fallback_values maps column -> fill value over all rows
        if self.strategy == 'mode':
            self.group_values = X.groupby(group_labels).agg(self._first_mode)
            self.fallback_values = X.apply(self._first_mode, axis=0)
        else:
            self.group_values = X.groupby(group_labels).mean()
            self.fallback_values = X.mean()
        return self

    def transform(self, X):
        """Return ``X`` as a numpy array with missing cells filled.

        The passed DataFrame itself is left unchanged.
        """
        nan_columns = list(X.columns[X.isna().any()])
        if not nan_columns:
            return X.to_numpy()

        block = X[nan_columns]

        # start with the global value of each column for every row ...
        global_fill = self.fallback_values.reindex(nan_columns).to_numpy()
        fill = np.repeat(global_fill[np.newaxis, :], len(X), axis=0)

        # ... and prefer the group-specific value for rows known from fit
        lookup = self.y[~self.y.index.duplicated(keep='first')]
        positions = lookup.index.get_indexer(X.index)  # -1 marks unseen rows
        known = positions >= 0
        if known.any():
            row_groups = lookup.to_numpy()[positions[known]]
            group_fill = self.group_values.reindex(
                index=row_groups, columns=nan_columns).to_numpy()
            # a group without any value in a column keeps the global value
            fill[known] = np.where(pd.isna(group_fill), fill[known], group_fill)

        # only the missing cells are replaced, everything else is kept as is
        X_imputed = X.copy()
        X_imputed[nan_columns] = np.where(block.isna().to_numpy(), fill, block.to_numpy())
        return X_imputed.to_numpy()

# Split data and encode target variable

In [6]:
# Features: every column except the party label
btvote_data = btvote.drop(columns='party')

# Target: the party column, turned into integer class labels
label_encoder = preprocessing.LabelEncoder()
btvote_target = label_encoder.fit_transform(btvote['party'])

# Pipeline and GridSearch setup

In [22]:
from sklearn.pipeline import Pipeline
# imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Pipeline stages
#   1. impute NaN values
#   2. balancing? (open question, not part of the pipeline yet)
#   3. classification
# Both steps start as None placeholders; the parameter grid below fills them in.
pipeline = Pipeline(steps=[('imputer', None), ('estimator', None)])  # do we need balancing?


def _imputer_candidates():
    """Return a fresh list with one instance of every imputer to compare."""
    return [
        SimpleImputer(),
        KNNImputer(),
        GroupImputer(strategy='mean'),
        GroupImputer(strategy='mode'),
    ]


# set parameters
# TODO: extend parameter field by Classifiers. This is just an example of how the pipeline and GridSearch work on our data
parameters = [
    # k-nearest-neighbours with 2..7 neighbours
    {
        'imputer': _imputer_candidates(),
        'estimator': [KNeighborsClassifier()],
        'estimator__n_neighbors': range(2, 8),
    },
    # decision tree with two depth limits
    {
        'imputer': _imputer_candidates(),
        'estimator': [DecisionTreeClassifier()],
        'estimator__max_depth': [3, 6],
    },
]

In [23]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Cross-validation scheme: 10 stratified folds, shuffled with a fixed seed
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid search over all combinations in `parameters`, scored by accuracy
grid_search_estimator = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=stratified_10_fold_cv,
)

# Fit every combination on every fold
# TODO: maybe use nested cross validation in the future
grid_search_estimator.fit(btvote_data, btvote_target)

# Tabulate the outcome: one row per hyper-parameter combination
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__n_neighbors,param_imputer,param_estimator__max_depth,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006246,0.001641,0.008732,0.004052,KNeighborsClassifier(),2.0,SimpleImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.813333,0.76,0.810811,0.864865,0.851351,0.851351,0.864865,0.838991,0.032425,9
1,0.006297,0.00058,0.008949,0.001698,KNeighborsClassifier(),2.0,KNNImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.813333,0.76,0.810811,0.864865,0.851351,0.851351,0.864865,0.838991,0.032425,9
2,0.023244,0.001338,0.034061,0.024198,KNeighborsClassifier(),2.0,GroupImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.813333,0.76,0.810811,0.864865,0.851351,0.851351,0.864865,0.838991,0.032425,9
3,0.296059,0.013845,0.026944,0.002313,KNeighborsClassifier(),2.0,GroupImputer(strategy='mode'),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.813333,0.76,0.810811,0.864865,0.851351,0.851351,0.864865,0.838991,0.032425,9
4,0.006309,0.001995,0.007177,0.000406,KNeighborsClassifier(),3.0,SimpleImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.8,0.8,0.810811,0.891892,0.824324,0.810811,0.891892,0.84164,0.035304,5
5,0.00621,0.00077,0.009925,0.006809,KNeighborsClassifier(),3.0,KNNImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.8,0.8,0.810811,0.891892,0.824324,0.810811,0.891892,0.84164,0.035304,5
6,0.031068,0.021815,0.025669,0.00094,KNeighborsClassifier(),3.0,GroupImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.8,0.8,0.810811,0.891892,0.824324,0.810811,0.891892,0.84164,0.035304,5
7,0.296772,0.025778,0.042842,0.033148,KNeighborsClassifier(),3.0,GroupImputer(strategy='mode'),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.853333,...,0.8,0.8,0.810811,0.891892,0.824324,0.810811,0.891892,0.84164,0.035304,5
8,0.006174,0.000568,0.0076,0.000456,KNeighborsClassifier(),4.0,SimpleImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.866667,...,0.813333,0.84,0.797297,0.824324,0.797297,0.810811,0.905405,0.834847,0.035342,13
9,0.005906,0.000156,0.007471,0.000194,KNeighborsClassifier(),4.0,KNNImputer(),,"{'estimator': KNeighborsClassifier(), 'estimat...",0.866667,...,0.813333,0.84,0.797297,0.824324,0.797297,0.810811,0.905405,0.834847,0.035342,13


# Archive

In [10]:

# ARCHIVED EXPERIMENT - nothing in this cell is executed: the code is disabled
# by wrapping it in two triple-quoted string literals. The value of the last
# string literal is what the cell shows as its output.
# The archived code refers to `transformed`, which is not defined in the
# visible part of this notebook, so it would not run as-is if re-enabled.
#
# Idea: impute the values based on the most frequent value
# TODO: group most frequent value per party
"""grouped_transformed = transformed.groupby(['party_text'], group_keys = False)
#display(grouped_transformed)

imputer = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')

def impute_most_frequent(vote):
    most_frequent_value = grouped_transformed[vote].mode().iloc[0]  # Calculate the most frequent value within the group
    grouped_transformed[vote].fillna(most_frequent_value, inplace=True)  # Fill missing values with the most frequent value
    return grouped_transformed

# Apply the impute_most_frequent function to each group and concatenate the results
parties = transformed['party_text'].unique()
print(parties)

df = transformed['party_text'] == parties[0]

display(df)

for vote in transformed.columns:
    if vote != 'mp_id' and vote != 'party_text':
        most_frequent_value = transformed[vote].mode().iloc[0]
        print(most_frequent_value)
#grouped_transformed.apply(impute_most_frequent)
"""
"""imp = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
idf = pd.DataFrame(imp.fit_transform(transformed))
idf.columns = transformed.columns
idf.index = transformed.index
display(idf)"""



# Open question: make use of a pipeline? Maybe for balancing and encoding?
# Scaling shouldn't be required, at least for the vote results

"imp = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')\nidf = pd.DataFrame(imp.fit_transform(transformed))\nidf.columns = transformed.columns\nidf.index = transformed.index\ndisplay(idf)"

In [11]:
# ARCHIVED EXPERIMENT - not executed: the snippet below is disabled by wrapping
# it in a string literal, whose value is what the cell shows as its output.
# It tries the GroupbyImputer from the third-party `mlmachine` package and refers
# to `transformed`, which is not defined in the visible part of this notebook.
"""import mlmachine as mlm
from mlmachine.features.preprocessing import GroupbyImputer

# instantiate GroupbyImputer to fill "Age" mean, grouped by "SibSp"
impute = GroupbyImputer(null_column="vote_19001", groupby_column="party_text", strategy="most_frequent")
impute.fit_transform(transformed[["vote_19001","party_text"]])
display(impute.train_value)"""

# Alternative (disabled): replace NaN in vote_19001 with the mean vote_19001 of the same party_text group
#transformed['vote_19001'] = transformed.groupby('party_text').vote_19001.transform(lambda x: x.fillna(x.mean()))

'import mlmachine as mlm\nfrom mlmachine.features.preprocessing import GroupbyImputer\n\n# instantiate GroupbyImputer to fill "Age" mean, grouped by "SibSp"\nimpute = GroupbyImputer(null_column="vote_19001", groupby_column="party_text", strategy="most_frequent")\nimpute.fit_transform(transformed[["vote_19001","party_text"]])\ndisplay(impute.train_value)'