In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.figure_factory._county_choropleth import create_choropleth

In [None]:
# Load the training CSV into `data` and show its first rows as the cell output.
data = pd.read_csv(filepath_or_buffer="merged_train.csv")
data.head()

In [None]:
# task 1
# Hold-out partition of the dataset: 75% for training, 25% for validation.

# Yvariables: the three target columns (Party, Democratic, Republican).
# Xvariables: the 13 predictor columns, i.e. everything except State, County,
# FIPS and the three targets.
Yvariables = ['Party', 'Democratic', 'Republican']
Xvariables = [
    'Total Population',
    'Percent White, not Hispanic or Latino',
    'Percent Black, not Hispanic or Latino',
    'Percent Hispanic or Latino',
    'Percent Foreign Born',
    'Percent Female',
    'Percent Age 29 and Under',
    'Percent Age 65 and Older',
    'Median Household Income',
    'Percent Unemployed',
    'Percent Less than High School Degree',
    "Percent Less than Bachelor's Degree",
    'Percent Rural',
]

# random_state is fixed so the split is reproducible across runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    data[Xvariables],
    data[Yvariables],
    test_size=0.25,
    random_state=1,
)
X_train.head()

In [None]:
# task 2
# Standardize the features. The per-column mean and standard deviation are
# estimated on X_train only, and that same transform is then applied to both
# the training and the validation partition.
scaler = StandardScaler().fit(X_train)
x_train_scaled, x_val_scaled = (scaler.transform(part) for part in (X_train, X_val))
x_train_scaled

In [None]:
# task 3
# For each party's vote count, fit a two-predictor linear regression and a
# LASSO regression on all predictors, and print the validation R squared of both.
two_predictors = ['Percent Foreign Born', 'Median Household Income']

for target in ('Democratic', 'Republican'):
    # Two-predictor linear model: print its fitted values on the training rows.
    model = linear_model.LinearRegression()
    fitted_model = model.fit(X=X_train[two_predictors], y=Y_train[target])
    predicted = fitted_model.predict(X_train[two_predictors])
    print(predicted)

    # get_dummies indicator-encodes any non-numeric predictor columns.
    X_train_dummy = pd.get_dummies(X_train, drop_first=True)
    X_val_dummy = pd.get_dummies(X_val, drop_first=True)

    # Two-predictor linear regression: R squared on the validation set.
    model = linear_model.LinearRegression().fit(
        X=X_train_dummy[two_predictors],
        y=Y_train[target],
    )
    score_val = model.score(X=X_val_dummy[two_predictors], y=Y_val[target])
    print(score_val)

    # LASSO regression on every predictor: R squared on the validation set.
    model = linear_model.Lasso(alpha=1).fit(X=X_train_dummy, y=Y_train[target])
    score_val = model.score(X=X_val_dummy, y=Y_val[target])
    print(score_val)

# Author's observation: the LASSO model appears to perform better.

In [None]:
# task 4
# Re-split the data for classification, keeping only the predictors that were
# judged to be the stronger ones.
X = [
    'Percent White, not Hispanic or Latino',
    'Percent Black, not Hispanic or Latino',
    'Percent Hispanic or Latino',
    'Percent Age 29 and Under',
    'Percent Age 65 and Older',
    'Percent Less than High School Degree',
    "Percent Less than Bachelor's Degree",
    'Percent Rural',
]
Y = ['Party']
X_train_class, X_val_class, Y_train_class, Y_val_class = train_test_split(
    data[X], data[Y], test_size=0.25, random_state=0
)

# The existing `scaler` object is re-fitted here on the classification training
# features (replacing whatever it was fitted on before), then applied to both
# partitions.
x_train_scaled_class = scaler.fit_transform(X_train_class)
x_val_scaled_class = scaler.transform(X_val_class)

In [None]:
# task 4 - continuation
# Classification model labelling each county as Democratic or Republican.
# kNN classifier: 5 nearest neighbours, all weighted equally.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
)
# The one-column label frame is flattened to the 1-D array fit() expects.
classifier.fit(x_train_scaled_class, Y_train_class.to_numpy().ravel())

In [None]:
# Predict the party label for every row of the standardized validation set.
y_pred_class = classifier.predict(X=x_val_scaled_class)

In [None]:
# Confusion matrix: rows are the true validation labels, columns the predictions.
conf_matrix = metrics.confusion_matrix(y_true=Y_val_class, y_pred=y_pred_class)
print(conf_matrix)

In [None]:
# Render the confusion matrix as an annotated heat map.
sns.heatmap(
    data=conf_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Validation metrics for the kNN classifier. With average=None, precision,
# recall and F1 are returned per class rather than as a single number.
accuracy = metrics.accuracy_score(Y_val_class, y_pred_class)
error = 1 - accuracy
precision, recall, F1_score = (
    scorer(Y_val_class, y_pred_class, average=None)
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
print("Results of kNN classifier\n")
print("\n".join([
    "Accuracy: " + str(accuracy),
    "Error: " + str(error),
    "Precision: " + str(precision),
    "Recall: " + str(recall),
    "F1 score: " + str(F1_score),
]))

In [None]:
# task 4 - continuation
# SVM classifier with an RBF kernel, trained on the standardized training split.
classifier = SVC(kernel='rbf')

# fit() returns the estimator itself, so `svm_class` is the same fitted object
# as `classifier`; it is kept under its own name for later cells.
svm_class = classifier.fit(x_train_scaled_class, Y_train_class.to_numpy().ravel())

In [None]:
# Predict the party label for every row of the standardized validation set.
y_pred_class = classifier.predict(X=x_val_scaled_class)

In [None]:
# Confusion matrix: rows are the true validation labels, columns the predictions.
conf_matrix = metrics.confusion_matrix(y_true=Y_val_class, y_pred=y_pred_class)
print(conf_matrix)

In [None]:
# Render the confusion matrix as an annotated heat map.
sns.heatmap(
    data=conf_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Validation metrics for the SVM classifier. With average=None, precision,
# recall and F1 are returned per class rather than as a single number.
accuracy = metrics.accuracy_score(Y_val_class, y_pred_class)
error = 1 - accuracy
precision, recall, F1_score = (
    scorer(Y_val_class, y_pred_class, average=None)
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
print("Results of SVM classifier\n")
print("\n".join([
    "Accuracy: " + str(accuracy),
    "Error: " + str(error),
    "Precision: " + str(precision),
    "Recall: " + str(recall),
    "F1 score: " + str(F1_score),
]))

In [None]:
# task 4
# Decision tree classifier using the entropy split criterion; random_state is
# fixed so the fitted tree is reproducible.
classifier = DecisionTreeClassifier(
    criterion="entropy",
    random_state=0,
)
# The one-column label frame is flattened to the 1-D array fit() expects.
classifier.fit(x_train_scaled_class, Y_train_class.to_numpy().ravel())

In [None]:
# Number of nodes in the decision tree.
# `tree_.node_count` is the public attribute holding this value; reading it
# avoids reaching into the pickling state returned by `tree_.__getstate__()`.
print("Number of nodes in the decision tree: " + str(classifier.tree_.node_count))

In [None]:
# Predict the party label for every row of the standardized validation set.
y_pred_class = classifier.predict(X=x_val_scaled_class)

In [None]:
# Confusion matrix (rows: true labels, columns: predictions) drawn as an
# annotated heat map.
conf_matrix = metrics.confusion_matrix(y_true=Y_val_class, y_pred=y_pred_class)
sns.heatmap(
    data=conf_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Validation metrics for the decision tree classifier. With average=None,
# precision, recall and F1 are returned per class rather than as a single number.
accuracy = metrics.accuracy_score(Y_val_class, y_pred_class)
error = 1 - accuracy
precision, recall, F1_score = (
    scorer(Y_val_class, y_pred_class, average=None)
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
print("Results of Decision trees classifier\n")
print("\n".join([
    "Accuracy: " + str(accuracy),
    "Error: " + str(error),
    "Precision: " + str(precision),
    "Recall: " + str(recall),
    "F1 score: " + str(F1_score),
]))

In [None]:
# task 4
# Gaussian Naive Bayes classifier, trained on the standardized training split.
classifier = GaussianNB()
# The one-column label frame is flattened to the 1-D array fit() expects.
classifier.fit(
    x_train_scaled_class,
    Y_train_class.to_numpy().ravel(),
)

In [None]:
# Predict the party label for every row of the standardized validation set.
y_pred_class = classifier.predict(X=x_val_scaled_class)

In [None]:
# Confusion matrix (rows: true labels, columns: predictions) drawn as an
# annotated heat map.
conf_matrix = metrics.confusion_matrix(y_true=Y_val_class, y_pred=y_pred_class)
sns.heatmap(
    data=conf_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Validation metrics for the Naive Bayes classifier. With average=None,
# precision, recall and F1 are returned per class rather than as a single number.
accuracy = metrics.accuracy_score(Y_val_class, y_pred_class)
error = 1 - accuracy
precision, recall, F1_score = (
    scorer(Y_val_class, y_pred_class, average=None)
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
print("Results of Naive Bayes classifier\n")
print("\n".join([
    "Accuracy: " + str(accuracy),
    "Error: " + str(error),
    "Precision: " + str(precision),
    "Recall: " + str(recall),
    "F1 score: " + str(F1_score),
]))

In [None]:
# The SVM classifier has the highest accuracy of the four classification methods for the selected predictors.

In [None]:
# Task 5: start with K-means (2 clusters) on all variables

In [None]:
# K-means with 2 clusters and random initialisation, on all 13 predictors.
X = [
    'Total Population',
    'Percent White, not Hispanic or Latino',
    'Percent Black, not Hispanic or Latino',
    'Percent Hispanic or Latino',
    'Percent Foreign Born',
    'Percent Female',
    'Percent Age 29 and Under',
    'Percent Age 65 and Older',
    'Median Household Income',
    'Percent Unemployed',
    'Percent Less than High School Degree',
    "Percent Less than Bachelor's Degree",
    'Percent Rural',
]
Y = ['Party']

# The whole dataset is clustered; the party column is kept only as the
# reference labelling for the external metrics below.
train_class = data[X]
lables = data[Y].to_numpy().ravel()

scaler = StandardScaler().fit(train_class)
train_scaled_class = scaler.transform(train_class)

clustering = KMeans(n_clusters=2, init='random', random_state=0)
clustering.fit(train_scaled_class)
clusters = clustering.labels_

# Contingency matrix: party labels (rows) against cluster ids (columns).
cont_matrix = metrics.cluster.contingency_matrix(labels_true=lables, labels_pred=clusters)
sns.heatmap(
    data=cont_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Adjusted Rand index (agreement with the party labels) and the mean of the
# per-sample silhouette coefficients.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=lables, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_samples(
    train_scaled_class, clusters, metric='euclidean'
).mean()
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# DBSCAN with eps = 2 and min_samples = 5 on the standardized features.
clustering = DBSCAN(eps=2, min_samples=5, metric='euclidean')
clustering.fit(train_scaled_class)
# NOTE: DBSCAN labels noise points as -1; the metrics below treat that label
# like any other cluster id.
clusters = clustering.labels_

# Contingency matrix: party labels (rows) against cluster ids (columns).
cont_matrix = metrics.cluster.contingency_matrix(labels_true=lables, labels_pred=clusters)
sns.heatmap(
    data=cont_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Adjusted Rand index and the mean of the per-sample silhouette coefficients.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=lables, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_samples(
    train_scaled_class, clusters, metric='euclidean'
).mean()
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# K-means (init = 'k-means++', n_init = 15) restricted to four predictors:
# the three race/ethnicity percentages and percent foreign born.
X = [
    'Percent White, not Hispanic or Latino',
    'Percent Black, not Hispanic or Latino',
    'Percent Hispanic or Latino',
    'Percent Foreign Born',
]
Y = ['Party']

# The whole dataset is clustered; the party column is kept only as the
# reference labelling for the external metrics below.
train_class = data[X]
lables = data[Y].to_numpy().ravel()

scaler = StandardScaler().fit(train_class)
train_scaled_class = scaler.transform(train_class)

clustering = KMeans(n_clusters=2, n_init=15, init='k-means++', random_state=0)
clustering.fit(train_scaled_class)
clusters = clustering.labels_

# Contingency matrix: party labels (rows) against cluster ids (columns).
cont_matrix = metrics.cluster.contingency_matrix(labels_true=lables, labels_pred=clusters)
sns.heatmap(
    data=cont_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Adjusted Rand index and the mean of the per-sample silhouette coefficients.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=lables, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_samples(
    train_scaled_class, clusters, metric='euclidean'
).mean()
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# DBSCAN with eps = 2 and min_samples = 8 on the standardized features.
clustering = DBSCAN(eps=2, min_samples=8, metric='euclidean')
clustering.fit(train_scaled_class)
# NOTE: DBSCAN labels noise points as -1; the metrics below treat that label
# like any other cluster id.
clusters = clustering.labels_

# Contingency matrix: party labels (rows) against cluster ids (columns).
cont_matrix = metrics.cluster.contingency_matrix(labels_true=lables, labels_pred=clusters)
sns.heatmap(
    data=cont_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Adjusted Rand index and the mean of the per-sample silhouette coefficients.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=lables, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_samples(
    train_scaled_class, clusters, metric='euclidean'
).mean()
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# K-means (init = 'k-means++', n_init = 15) on all 13 predictors.
X = [
    'Total Population',
    'Percent White, not Hispanic or Latino',
    'Percent Black, not Hispanic or Latino',
    'Percent Hispanic or Latino',
    'Percent Foreign Born',
    'Percent Female',
    'Percent Age 29 and Under',
    'Percent Age 65 and Older',
    'Median Household Income',
    'Percent Unemployed',
    'Percent Less than High School Degree',
    "Percent Less than Bachelor's Degree",
    'Percent Rural',
]

# `Y` is not redefined in this cell; it is taken from an earlier cell.
train_class = data[X]
lables = data[Y].to_numpy().ravel()

scaler = StandardScaler().fit(train_class)
train_scaled_class = scaler.transform(train_class)

clustering = KMeans(n_clusters=2, n_init=15, init='k-means++', random_state=0)
clustering.fit(train_scaled_class)
clusters = clustering.labels_

# Contingency matrix: party labels (rows) against cluster ids (columns).
cont_matrix = metrics.cluster.contingency_matrix(labels_true=lables, labels_pred=clusters)
sns.heatmap(
    data=cont_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Adjusted Rand index and the mean of the per-sample silhouette coefficients.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=lables, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_samples(
    train_scaled_class, clusters, metric='euclidean'
).mean()
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# DBSCAN with eps = 2 and min_samples = 8 on the standardized features.
clustering = DBSCAN(eps=2, min_samples=8, metric='euclidean')
clustering.fit(train_scaled_class)
# NOTE: DBSCAN labels noise points as -1; the metrics below treat that label
# like any other cluster id.
clusters = clustering.labels_

# Contingency matrix: party labels (rows) against cluster ids (columns).
cont_matrix = metrics.cluster.contingency_matrix(labels_true=lables, labels_pred=clusters)
sns.heatmap(
    data=cont_matrix,
    cmap=plt.cm.Blues,
    square=True,
    annot=True,
    fmt=".3f",
)
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Adjusted Rand index and the mean of the per-sample silhouette coefficients.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=lables, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_samples(
    train_scaled_class, clusters, metric='euclidean'
).mean()
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# task 6
# Draw a county-level map of the party predicted by the SVM classifier for
# every county in the training file.
X = [
    'Percent White, not Hispanic or Latino',
    'Percent Black, not Hispanic or Latino',
    'Percent Hispanic or Latino',
    'Percent Age 29 and Under',
    'Percent Age 65 and Older',
    'Percent Less than High School Degree',
    "Percent Less than Bachelor's Degree",
    'Percent Rural',
]

_data = data[X]
_fips = data[['FIPS']]

# Standardize with the statistics of the training split the SVM was fitted on
# (X_train_class). Fitting the scaler on the full dataset instead would feed
# the classifier features on a different scale from the one it learned.
scaler = StandardScaler()
scaler.fit(X_train_class[X])
scaled_data = scaler.transform(_data)

party = svm_class.predict(scaled_data).tolist()
# Pass a flat list of FIPS codes: `.values` on the one-column frame yields an
# (n, 1) array, whose rows would each have to be coerced to a scalar.
fips = _fips['FIPS'].tolist()

# Two colours, one per predicted class.
# NOTE(review): presumably red = Republican and blue = Democratic; confirm
# against how 'Party' is encoded in the data.
colorscale = ["#ff0000","#0015bc"]
fig = create_choropleth(
    fips=fips,
    values=party,
    colorscale=colorscale,
    county_outline={'color': 'rgb(105,105,105)', 'width': 0.25},
    state_outline={'color': 'rgb(192,192,192)','width': 1},
)
fig.layout.template = None
fig.show()

In [None]:
# task 7
# Predict Democratic votes, Republican votes and the party label for the
# counties in the test file, and write the result to 'output.csv'.
test_data = pd.read_csv("demographics_test.csv")

# The LASSO models are fitted on the unscaled `X_train_dummy`, so the test
# features must be prepared the same way. Predicting on standardized features
# with a model trained on raw features produces meaningless values.
X_test_dummy = pd.get_dummies(test_data[Xvariables], drop_first=True)
# Keep exactly the training columns, in the training order.
X_test_dummy = X_test_dummy.reindex(columns=X_train_dummy.columns, fill_value=0)

model = linear_model.Lasso(alpha=1).fit(X=X_train_dummy, y=Y_train['Democratic'])
dem_predicted = model.predict(X_test_dummy).astype(int)

model = linear_model.Lasso(alpha=1).fit(X=X_train_dummy, y=Y_train['Republican'])
rep_predicted = model.predict(X_test_dummy).astype(int)

# Classification features are selected by name, in the order the SVM was
# trained on, rather than by hard-coded column positions.
class_test = test_data[list(X_train_class.columns)]
# Standardize with the training-split statistics the SVM was fitted with;
# re-fitting the scaler on the test data would shift the features.
scaler = StandardScaler().fit(X_train_class)
class_test_scaled = scaler.transform(class_test)

test_pred = svm_class.predict(class_test_scaled)

# One row per county: State, County, Democratic, Republican, Party.
a = np.array([test_data['State'], test_data['County'], dem_predicted, rep_predicted, test_pred])
a = np.column_stack(a)
np.savetxt("output.csv", a, delimiter=',', header="State,County,Democratic,Republican,Party", fmt="%s", comments="")

# the output of this task is stored in the file 'output.csv'