# Predicting Presidents
### UTK ML, Fall 2020
Building machine learning models to predict results from the 2016 US Presidential election.



In [15]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, neighbors, metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Preprocess data: load the CSV, build a gap-free feature matrix, scale it,
# and hold out 30% of the rows for testing.
Train = pd.read_csv("presidential_election_2016.csv")
# Forward-fill gaps in every column (the label column included).
# DataFrame.ffill() replaces the deprecated fillna(method = "ffill") spelling.
Train = Train.ffill()

# 0 = democrats, 1 = republicans

# Features: everything after the first 4 columns, minus the "Winner" label.
# drop() returns a new frame here, so the slice of Train is never mutated in
# place (an in-place drop on a slice can raise SettingWithCopyWarning).
trim = Train.iloc[:, 4:].drop(columns = "Winner")

# Fill in NA values: forward-fill first, then back-fill so that gaps at the
# very top of a column (which have no earlier value to copy) are also covered.
trim = trim.ffill().bfill()

# Normalize the data:
# NOTE(review): the scaling statistics are computed over all rows before the
# split below, so test rows influence the training features. Consider fitting
# the scaler on the training rows only.
data = preprocessing.scale(trim)

#Get labels list:
y = Train.loc[:, "Winner"]

# A fixed random_state keeps the 70/30 split (and therefore every score
# reported below) identical across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    data, y, test_size = 0.3, random_state = 42
)

## kNN Classifier
- Classifying points based on nearest neighbors
- The "neighborhood" determined by how many points you want to consider (k)
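- Non-parametric model: no equation to learn per se; the training data itself is the model
- Computationally expensive at prediction time, since each query is compared against the stored training points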

In [21]:
# kNN: build a 3-nearest-neighbour classifier and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = neighbors.KNeighborsClassifier(n_neighbors = 3).fit(x_train, y_train)

# Verify the model: predict the held-out rows, then compare with the true labels
y_pred = model.predict(x_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred)

# Optional per-class breakdown:
#metrics.confusion_matrix(y_test, y_pred, labels = [0, 1])
print("kNN Score:", knn_accuracy)

kNN Score: 0.9260450160771704


## Decision Tree
- Decision splits classify data
- Training systematically improves the questions being asked
- Works easily with categorical data; can also work with numerical features (threshold splits) and numerical targets (regression trees)

In [22]:
# Decision Tree Classifier
# random_state is fixed so the fitted tree, and the score printed below,
# are the same on every run of this cell.
dt_model = DecisionTreeClassifier(random_state = 42)
dt_model.fit(x_train, y_train)

# score() reports mean accuracy on the held-out test split
print("Decision Tree Score:", dt_model.score(x_test, y_test))

Decision Tree Score: 0.9088960342979635


## Random Forest
- Modification of decision trees - a more sophisticated ensemble built from many trees
- Algorithm:
    1. Create n random samples of the dataset (bootstrap samples, drawn with replacement)
    2. Train n separate trees, one per sample
    3. When predicting, predict with all trees and take the majority vote
- Variation among the trees produces better results than a single tree

In [23]:
# Random Forest Classifier
# max_depth = 5 caps the depth of every tree in the forest.
# random_state is fixed so the forest's internal random sampling, and hence
# the score printed below, are reproducible across runs.
rf_model = RandomForestClassifier(max_depth = 5, random_state = 42)
rf_model.fit(x_train, y_train)

# score() reports mean accuracy on the held-out test split
print("Random Forest Score:", rf_model.score(x_test, y_test))

Random Forest Score: 0.92497320471597
