<a href="https://colab.research.google.com/github/munavarhs/USElectionsDataAnalysis/blob/main/USElectionsDataAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import numpy as np
import pandas as pd
import string

#ML-models
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


#using sklearn model to split our testing and training data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import precision_score, recall_score, f1_score


import warnings

In [19]:
election_data = pd.read_csv('house-votes-84.data', header=None)

In [20]:
election_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [22]:
y = election_data[0]
X = election_data.iloc[:, 1:]

In [25]:
# Replace 'y' with 1, 'n' with 0, and '?' with -1
X = X.replace({'y': 1, 'n': 0, '?': -1})


In [26]:
#Now handle missing values in three ways:
#Remove rows with missing values
X_dropped = X.dropna()
y_dropped = y[X_dropped.index]

In [27]:
#Treat missing as a separate category (already done with -1)
X_filled = X

In [28]:
def replace_most_freq():
  freq = (lambda x: x.fillna(x.mode()[0]))
  return freq

In [29]:
X_imputed = X.apply(replace_most_freq())

In [30]:
# Perform 5-fold cross-validation
def evaluate_model(classifier, X, y):
    precision = cross_val_score(classifier, X, y, cv=5, scoring='precision_weighted').mean()
    recall = cross_val_score(classifier, X, y, cv=5, scoring='recall_weighted').mean()
    f1 = cross_val_score(classifier, X, y, cv=5, scoring='f1_weighted').mean()
    return precision, recall, f1

In [31]:
# Initialize the models
decision_tree_clf = DecisionTreeClassifier()
naive_bayes_clf = GaussianNB()

In [32]:
# 1. Decision Tree and Naive Bayes on data with dropped rows
print("Evaluating with dropped rows (Decision Tree):")
precision, recall, f1 = evaluate_model(decision_tree_clf, X_dropped, y_dropped)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

print("Evaluating with dropped rows (Naive Bayes):")
precision, recall, f1 = evaluate_model(naive_bayes_clf, X_dropped, y_dropped)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

# 2. Decision Tree and Naive Bayes on data with missing values treated as -1
print("Evaluating with missing values treated (Decision Tree):")
precision, recall, f1 = evaluate_model(decision_tree_clf, X_filled, y)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

print("Evaluating with missing values treated (Naive Bayes):")
precision, recall, f1 = evaluate_model(naive_bayes_clf, X_filled, y)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

# 3. Decision Tree and Naive Bayes on data with imputed missing values
print("Evaluating with imputed missing values (Decision Tree):")
precision, recall, f1 = evaluate_model(decision_tree_clf, X_imputed, y)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

print("Evaluating with imputed missing values (Naive Bayes):")
precision, recall, f1 = evaluate_model(decision_tree_clf, X_imputed, y)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

Evaluating with dropped rows (Decision Tree):
Precision: 0.95, Recall: 0.94, F1: 0.94
Evaluating with dropped rows (Naive Bayes):
Precision: 0.92, Recall: 0.92, F1: 0.92
Evaluating with missing values treated (Decision Tree):
Precision: 0.95, Recall: 0.94, F1: 0.94
Evaluating with missing values treated (Naive Bayes):
Precision: 0.92, Recall: 0.92, F1: 0.92
Evaluating with imputed missing values (Decision Tree):
Precision: 0.95, Recall: 0.94, F1: 0.94
Evaluating with imputed missing values (Naive Bayes):
Precision: 0.95, Recall: 0.95, F1: 0.94


In [33]:
# Example data (scores for 5 folds)
dt_scores = {
    'precision': [0.95, 0.95, 0.94],
    'recall': [0.95, 0.94, 0.94],
    'f1': [0.95, 0.94, 0.94]
}

nb_scores = {
    'precision': [0.92, 0.92, 0.95],
    'recall': [0.92, 0.92, 0.95],
    'f1': [0.92, 0.92, 0.94]
}


In [None]:
# Assuming you have precision, recall, and f1 scores stored in dt_scores and nb_scores dictionaries
metrics = ['precision', 'recall', 'f1']

dt_stats = {metric: (np.mean(dt_scores[metric]), np.std(dt_scores[metric])) for metric in metrics}
nb_stats = {metric: (np.mean(nb_scores[metric]), np.std(nb_scores[metric])) for metric in metrics}

# Displaying the results
print("Decision Tree Stats (Mean, Std):", dt_stats)
print("Naive Bayes Stats (Mean, Std):", nb_stats)



Decision Tree Stats (Mean, Std): {'precision': (0.9466666666666667, 0.004714045207910321), 'recall': (0.9433333333333334, 0.004714045207910321), 'f1': (0.9433333333333334, 0.004714045207910321)}
Naive Bayes Stats (Mean, Std): {'precision': (0.93, 0.01414213562373091), 'recall': (0.93, 0.01414213562373091), 'f1': (0.9266666666666667, 0.00942809041582059)}
