In [112]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

## US Congressional Voting Records Dataset

In [94]:
# Loading voting dataset
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', \
                    names=['party', 'infants', 'water', 'budget', 'physician', 'salvador', \
                           'religious', 'satellite', 'aid', 'missile', 'immigration', 'synfuels', \
                           'education', 'superfund', 'crime', 'duty_free_exports', 'eaa_rsa' \
                          ])
df_copy = df.copy()

# Binarize votes
df.replace(['n', '?'], 0, inplace=True)
df.replace('y', 1, inplace=True)
df.head()

Unnamed: 0,party,infants,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free_exports,eaa_rsa
0,republican,0,1,0,1,1,1,0,0,0,1,0,1,1,1,0,1
1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,0
2,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
3,democrat,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,1
4,democrat,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1


## KNN

In [33]:
# Create arrays for the features and the response variable
y = df['party'].values
X = df.drop('party', axis=1).values

In [34]:
# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [35]:
# Create a k-NN classifier with 6 neighbors
knn = KNeighborsClassifier(n_neighbors=6)

In [39]:
# Fit the classifier to the data
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [40]:
# Predict the labels for the training data X
y_pred = knn.predict(X_train)

In [41]:
# Predict and print the label for the new data point X_new
new_prediction = knn.predict(X_test)

## Accuracy

In [42]:
# Accuracy on training set
accuracy_score(y_pred, y_train)

0.93965517241379315

In [43]:
# Accuracy on testing set
accuracy_score(new_prediction, y_test)

0.96551724137931039

## Dropping missing data

In [95]:
# Restore original dataframe
df = df_copy
df.head()

Unnamed: 0,party,infants,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free_exports,eaa_rsa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [97]:
# Datasets encode missing data in different ways. It may be a '9999', other times a 0 or '?'
# Convert '?' to NaN
df[df == '?'] = np.nan
df.head()

Unnamed: 0,party,infants,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free_exports,eaa_rsa
0,republican,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y


In [98]:
# Binarize votes
df[df == 'n'] = 0
df[df == 'y'] = 1
df.head()

Unnamed: 0,party,infants,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free_exports,eaa_rsa
0,republican,0.0,1,0,1.0,1.0,1,0,0,0,1,,1.0,1,1,0,1.0
1,republican,0.0,1,0,1.0,1.0,1,0,0,0,0,0.0,1.0,1,1,0,
2,democrat,,1,1,,1.0,1,0,0,0,0,1.0,0.0,1,1,0,0.0
3,democrat,0.0,1,1,0.0,,1,0,0,0,0,1.0,0.0,1,0,0,1.0
4,democrat,1.0,1,1,0.0,1.0,1,0,0,0,0,1.0,,1,1,1,1.0


In [99]:
# Print the number of NaNs
df.isnull().sum()

party                  0
infants               12
water                 48
budget                11
physician             11
salvador              15
religious             11
satellite             14
aid                   15
missile               22
immigration            7
synfuels              21
education             31
superfund             25
crime                 17
duty_free_exports     28
eaa_rsa              104
dtype: int64

In [100]:
# Print shape of original DataFrame
df.shape

(435, 17)

In [101]:
# Drop missing values and print shape of new DataFrame
df = df.dropna()

In [102]:
# Shape of DataFrame After Dropping All Rows with Missing Values
df.shape

(232, 17)

## Imputing missing data in Pipeline

In [103]:
# Setup the Imputation transformer
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

In [115]:
# Instantiate the KNN classifier
knn = KNeighborsClassifier()

In [116]:
# Setup the pipeline with the required steps
steps = [('imputation', imp),
        ('KNN', knn)]

In [117]:
# Create the pipeline
pipeline = Pipeline(steps)

In [118]:
# Create arrays for the features and the response variable
y = df['party']
X = df.drop('party', axis=1)

In [123]:
# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

In [124]:
# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

Pipeline(steps=[('imputation', Imputer(axis=0, copy=True, missing_values='NaN', strategy='most_frequent',
    verbose=0)), ('KNN', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [125]:
# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

In [126]:
# Compute metrics
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

   democrat       0.97      0.85      0.90        33
 republican       0.88      0.97      0.92        37

avg / total       0.92      0.91      0.91        70

