In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC

**Read and clean data if necessary**

In [2]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
train = pd.read_csv('/content/drive/My Drive/CAH-201803-train.csv')

In [4]:
test = pd.read_csv('/content/drive/My Drive/CAH-201803-test.csv')

In [5]:
train.head()

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id_num                 169 non-null    int64 
 1   Q1                     169 non-null    object
 2   Q2                     169 non-null    int64 
 3   political_affiliation  169 non-null    object
 4   Q4                     169 non-null    object
 5   Q5                     169 non-null    object
 6   Q6                     169 non-null    object
 7   Q7                     169 non-null    object
 8   Q8                     169 non-null    object
 9   Q9                     169 non-null    object
 10  Q10                    169 non-null    object
 11  Q11                    169 non-null    object
 12  Q12                    169 non-null    object
 13  Q13                    169 non-null    object
 14  Q14                    169 non-null    object
 15  Q15                    

In [7]:
import pandas as pd

numeric_cols = train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train.select_dtypes(include=['object']).columns
categorical_cols = categorical_cols.drop('political_affiliation')
train_encoded = pd.get_dummies(train, columns=categorical_cols)

In [8]:
train_encoded.head()

Unnamed: 0,id_num,Q2,political_affiliation,Q15,Q16,Q17,Q1_Female,Q1_Male,Q4_Conservative,Q4_Liberal,...,Q11_Pro-life,Q12_No,Q12_Yes,Q13_No,Q13_Yes,Q14_Behave no differently,Q14_Less Willing,Q14_More Willing,Q18_No,Q18_Yes
0,1,53,Independent,5,2,5,0,1,0,1,...,0,1,0,1,0,1,0,0,1,0
1,5,66,Independent,4,5,4,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
2,7,58,Democrat,5,1,4,1,0,0,1,...,0,1,0,1,0,1,0,0,0,1
3,8,55,Independent,4,5,4,0,1,0,0,...,1,0,1,0,1,0,1,0,0,1
4,9,64,Republican,5,1,1,0,1,1,0,...,1,1,0,1,0,1,0,0,0,1


**Find feature importance**

In [9]:
X = train_encoded.drop(['political_affiliation'], axis = 1)
y = train_encoded['political_affiliation']

In [10]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Fit RandomForestClassifier to get feature importances
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('dummify', OneHotEncoder(), categorical_cols)
    ],
    remainder='passthrough'
)

X_transformed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()
X_transformed = pd.DataFrame(X_transformed, columns=feature_names)

forest = RandomForestClassifier(random_state=0)
forest.fit(X_transformed, y)

importances = forest.feature_importances_

sorted_indices = np.argsort(importances)[::-1]
for idx in sorted_indices:
    print(f"{feature_names[idx]}: {importances[idx]}")


remainder__id_num: 0.1017456736272256
remainder__Q2: 0.09730778782100585
remainder__Q4_Conservative: 0.09382033317546175
remainder__Q17: 0.05189423006059767
remainder__Q16: 0.047929537820389896
remainder__Q4_Moderate: 0.035788291922904676
remainder__Q15: 0.03332429478981204
remainder__Q4_Liberal: 0.03171863631221452
remainder__Q6_White: 0.030905085957113346
remainder__Q18_Yes: 0.02784650933373609
remainder__Q18_No: 0.022560689808204214
remainder__Q1_Male: 0.022081134301157348
remainder__Q5_High school or less: 0.02177323908889098
remainder__Q10_Yes, somewhat religious: 0.021389043559218997
remainder__Q1_Female: 0.020713243280004574
remainder__Q7_No: 0.019350542406509824
remainder__Q8_Yes: 0.01906049266508904
remainder__Q13_Yes: 0.018495861376508684
remainder__Q6_Black: 0.01822833177603189
remainder__Q9_No: 0.018009691552747738
remainder__Q5_Some college: 0.017211196157959418
remainder__Q11_Pro-life: 0.01715509319174706
remainder__Q8_No: 0.01661311157467055
remainder__Q7_Yes: 0.01656968

**Try Decision Tree**

In [43]:
X = train.drop(['political_affiliation'], axis = 1)
y = train['political_affiliation']

In [12]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         make_column_selector(dtype_include=object)),

    ],
    remainder="drop"
)
dt_pipeline = Pipeline([
    ("preprocessing", ct),
    ("decision_tree", DecisionTreeClassifier())
])

In [13]:
leaf_values = [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
leaf_param = {
    'decision_tree__min_samples_leaf': leaf_values}

gscv = GridSearchCV(dt_pipeline, leaf_param, cv=10, scoring='accuracy')
gscv.fit(X, y)
acc_scores = gscv.cv_results_['mean_test_score']

best_accuracy = np.max(gscv.cv_results_['mean_test_score'])
best_min_samples_leaf = gscv.best_params_['decision_tree__min_samples_leaf']
print("Best Accuracy Score:", best_accuracy)
print("Corresponding min_samples_leaf:", best_min_samples_leaf)

Best Accuracy Score: 0.580514705882353
Corresponding min_samples_leaf: 22


In [47]:
dt_pipeline = Pipeline([
    ("preprocessing", ct),
    ("decision_tree", DecisionTreeClassifier(min_samples_leaf=22))
])

depth_values = [1,2,3,4,5,6,7,8,9,10]
depth_param = {
    'decision_tree__max_depth': depth_values
}

gscv = GridSearchCV(dt_pipeline, depth_param, cv=10, scoring='accuracy')
gscv.fit(X, y)
acc_scores = gscv.cv_results_['mean_test_score']
best_accuracy = np.max(acc_scores)
best_max_depth = gscv.best_params_['decision_tree__max_depth']

print("Best Accuracy Score:", best_accuracy)
print("Corresponding max_depth:", best_max_depth)

Best Accuracy Score: 0.5863970588235294
Corresponding max_depth: 3


In [48]:
dt_pipeline = Pipeline([
    ("preprocessing", ct),
    ("decision_tree", DecisionTreeClassifier(max_depth = 3, min_samples_leaf=22))
])

dt_pipeline_fitted = dt_pipeline.fit(X,y)
dt_accuracy = cross_val_score(dt_pipeline_fitted, X, y, cv=10, scoring='accuracy').mean()
print(f"DT Accuracy: {dt_accuracy}")

DT Accuracy: 0.5863970588235294


In [16]:
final_predictions = pd.DataFrame(
    {"id_num": test['id_num'],
    "political_affiliation_predicted": dt_pipeline_fitted.predict(test)}
)

Decision Tree resulted in 0.45783 accuracy for the final predictions

**Try SVM**

In [71]:
X = train.drop(['political_affiliation', 'Q14', 'Q6'], axis = 1)
y = train['political_affiliation']

In [72]:
svm_pipeline = Pipeline([
    ("preprocessing", ct),
    ("svm", SVC(kernel = 'poly'))])

c_values = [0.155,0.156,0.154]
c_param = {'svm__C': c_values}
gscv = GridSearchCV(svm_pipeline, c_param, cv = 10, scoring='accuracy')

gscv_fitted = gscv.fit(X,y)
best_accuracy = np.max(gscv_fitted.cv_results_['mean_test_score'])
best_c_value = gscv_fitted.best_params_['svm__C']
print("Best Accuracy Score:", best_accuracy)
print("Corresponding C-Value:", best_c_value)

Best Accuracy Score: 0.5981617647058823
Corresponding C-Value: 0.155


In [73]:
svm_pipeline = Pipeline([
    ("preprocessing", ct),
    ("svm", SVC(C=.1, kernel = 'poly'))])
svm_pipeline_fitted = svm_pipeline.fit(X,y)
svm_accuracy = cross_val_score(svm_pipeline_fitted, X, y, cv=10, scoring='accuracy').mean()
print(f"SVM Accuracy: {svm_accuracy}")

SVM Accuracy: 0.6040441176470588


SVM resulted in 0.53012 accuracy for the final predictions

**Test set**

In [19]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [70]:
from sklearn.metrics import accuracy_score

svm_pipeline_fitted = svm_pipeline.fit(X_train, y_train)
predictions = svm_pipeline_fitted.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print(f"SVM Accuracy on Test Data: {test_accuracy}")

SVM Accuracy on Test Data: 0.5


In [49]:
from sklearn.metrics import accuracy_score

dt_pipeline_fitted = dt_pipeline.fit(X_train, y_train)
predictions = dt_pipeline_fitted.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print(f"Decision Tree Accuracy on Test Data: {test_accuracy}")

Decision Tree Accuracy on Test Data: 0.5


In [65]:
final_predictions_svm = pd.DataFrame(
    {"id_num": test['id_num'],
    "political_affiliation_predicted": svm_pipeline_fitted.predict(test)}
)

In [74]:
matches = final_predictions['political_affiliation_predicted'] == final_predictions_svm['political_affiliation_predicted']

num_matches = matches.sum()

print(f"Number of matching 'political_affiliation_predicted' values: {num_matches}")


Number of matching 'political_affiliation_predicted' values: 123


In [66]:
final_predictions_svm.to_csv('/content/drive/My Drive/final_predictions.csv', index=False)