final_predictions = pd.DataFrame(
    {"id_num": test_data['id_num'],
    "political_affiliation_predicted": final_model_fit.predict(test_data)}
)

In [26]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [3]:
df_pol = pd.read_csv('./Data/classification_train.csv')
df_pol.head()

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [17]:
pipeline_num = Pipeline(
    [("impute", SimpleImputer(strategy='mean')),
     ("scale", StandardScaler())]
)

pipeline_cat = Pipeline(
    [("impute", SimpleImputer(strategy='most_frequent')),
     ("enc", OneHotEncoder(handle_unknown='ignore', sparse_output = False))]
)

ct = ColumnTransformer(
    [("num", pipeline_num, make_column_selector(dtype_include=np.number)),
     ("cat", pipeline_cat, make_column_selector(dtype_include='object'))]
)

pipeline_log = Pipeline([
    ("preprocessor", ct),
    ("log", LogisticRegression(multi_class='multinomial', max_iter=1000))
])
params_log = {
    'log__C': [0.001, 0.01, 0.1, 1, 10],
    'log__solver': ['lbfgs', 'sag', 'newton-cg']
}

pipeline_knn = Pipeline([
    ("preprocessor", ct),
    ("knn", KNeighborsClassifier())
])
params_knn = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

pipeline_forest = Pipeline(
    [("preprocessing",ct),
    ("forest", RandomForestClassifier())
])
params_forest = {
    'forest__n_estimators': [100, 200, 300],
    'forest__max_depth': [None, 10, 20, 30],
    'forest__min_samples_split': [2, 5, 10],
    'forest__min_samples_leaf': [1, 2, 4],
    'forest__max_features': ['auto', 'sqrt']
}

pipeline_lda = Pipeline(
     [("preprocessing", ct),
      ("lda", LinearDiscriminantAnalysis())]
)
params_lda = {
    "lda__solver": ['svd', 'lsqr', 'eigen'],
    "lda__shrinkage": [None, 'auto', 0.01, 0.1, 0.5, 0.9, 1]
}

pipeline_qda = Pipeline(
     [("preprocessing", ct),
      ("qda", QuadraticDiscriminantAnalysis())]
)
params_qda = {
    "qda__reg_param": [0, 0.001, 0.01, 0.1, 0.5, 0.9, 1]
}

pipeline_svc = Pipeline([
    ("preprocessor", ct),
    ("svc", SVC(decision_function_shape='ovo'))
])
params_svc = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf', 'poly', 'sigmoid']
}

pipeline_svm = Pipeline([
    ("preprocessing", ct),
    ("svm", SVC(kernel='poly'))
])
params_svm = {
    "svm__C": [0.1, 1, 10, 100],
    "svm__degree": [2, 3, 4],
    "svm__gamma": ['scale', 'auto']
}

pipeline_xgb = Pipeline([
    ("preprocessor", ct),
    ("xgb", XGBClassifier(objective='multi:softprob'))  # or 'multi:softmax'
])
params_xgb = {
    'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'xgb__max_depth': [3, 4, 5, 6, 7],
    'xgb__n_estimators': [100, 200, 300],
    'xgb__subsample': [0.7, 0.8, 0.9]
}

pipelines = {
    "Logistic Regression": pipeline_log,
    "kNN": pipeline_knn,
    "Forest": pipeline_forest,
    "LDA": pipeline_lda,
    "QDA": pipeline_qda,
    "SVC": pipeline_svc,
    "SVM": pipeline_svm,
    "XGBoost": pipeline_xgb
}
params = {
    "Logistic Regression": params_log,
    "kNN": params_knn,
    "Forest": params_forest,
    "LDA": params_lda,
    "QDA": params_qda,
    "SVC": params_svc,
    "SVM": params_svm,
    "XGBoost": params_xgb
}

In [5]:
df_pol.columns

Index(['id_num', 'Q1', 'Q2', 'political_affiliation', 'Q4', 'Q5', 'Q6', 'Q7',
       'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17',
       'Q18'],
      dtype='object')

In [7]:
X = df_pol.drop(columns=['political_affiliation', 'id_num'])
y = df_pol['political_affiliation']

In [8]:
dropped_features = {"all": X}

for column in X.columns:
    dropped_features[column] = X.drop(columns=[column])

In [21]:
def best_features(pipeline, params, features, y):
    scores_dict = {}

    for set_num, X in features.items():
        grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
        grid_search.fit(X, y)
        scores_dict[set_num] = grid_search.best_score_
    
    return sorted(scores_dict.items(), key=lambda x: x[1])

In [22]:
best_features(pipeline_lda, params_lda, dropped_features, y)

35 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Library/Frameworks/

[('Q4', 0.5919786096256685),
 ('Q6', 0.6040998217468806),
 ('Q17', 0.609982174688057),
 ('Q9', 0.6215686274509804),
 ('Q18', 0.62174688057041),
 ('Q2', 0.6221033868092691),
 ('Q7', 0.6274509803921569),
 ('Q8', 0.6274509803921569),
 ('Q11', 0.6274509803921569),
 ('Q13', 0.6274509803921569),
 ('Q10', 0.6276292335115865),
 ('Q14', 0.6276292335115865),
 ('Q1', 0.6331550802139037),
 ('Q12', 0.6333333333333333),
 ('all', 0.633511586452763),
 ('Q15', 0.633511586452763),
 ('Q16', 0.6450980392156863),
 ('Q5', 0.651336898395722)]

In [23]:
best_features(pipeline_svc, params_svc, dropped_features, y)

[('Q4', 0.574331550802139),
 ('Q6', 0.6040998217468806),
 ('Q18', 0.6040998217468806),
 ('Q1', 0.6153297682709448),
 ('Q11', 0.6155080213903743),
 ('Q12', 0.6155080213903743),
 ('Q9', 0.6213903743315508),
 ('Q7', 0.6215686274509804),
 ('Q10', 0.6217468805704099),
 ('all', 0.6270944741532977),
 ('Q14', 0.6272727272727272),
 ('Q5', 0.6274509803921569),
 ('Q15', 0.6274509803921569),
 ('Q17', 0.6276292335115865),
 ('Q8', 0.6329768270944742),
 ('Q2', 0.6336898395721926),
 ('Q16', 0.639572192513369),
 ('Q13', 0.6449197860962567)]

In [24]:
best_features(pipeline_svm, params_svm, dropped_features, y)

[('Q4', 0.5508021390374331),
 ('Q1', 0.5688057040998217),
 ('Q17', 0.5688057040998219),
 ('Q15', 0.580035650623886),
 ('all', 0.5857397504456328),
 ('Q6', 0.5857397504456328),
 ('Q2', 0.5862745098039215),
 ('Q18', 0.5864527629233512),
 ('Q13', 0.5914438502673797),
 ('Q12', 0.591800356506239),
 ('Q10', 0.5919786096256685),
 ('Q11', 0.5975044563279857),
 ('Q14', 0.5978609625668448),
 ('Q9', 0.5980392156862745),
 ('Q7', 0.6098039215686274),
 ('Q5', 0.615686274509804),
 ('Q8', 0.615686274509804),
 ('Q16', 0.616042780748663)]

In [25]:
label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

best_features(pipeline_xgb, params_xgb, dropped_features, y_enc)

[('Q4', 0.544385026737968),
 ('Q6', 0.5686274509803921),
 ('Q10', 0.580392156862745),
 ('Q17', 0.580392156862745),
 ('Q9', 0.5980392156862745),
 ('Q14', 0.5980392156862745),
 ('Q5', 0.6040998217468806),
 ('Q12', 0.6040998217468806),
 ('Q18', 0.6040998217468806),
 ('all', 0.609982174688057),
 ('Q7', 0.609982174688057),
 ('Q15', 0.609982174688057),
 ('Q8', 0.6099821746880572),
 ('Q1', 0.6158645276292335),
 ('Q13', 0.616042780748663),
 ('Q16', 0.62174688057041),
 ('Q11', 0.633511586452763),
 ('Q2', 0.6566844919786096)]

In [29]:
X_subset = X[['Q4', 'Q6', 'Q17', 'Q9']]

for model_name, pipeline in pipelines.items():
    print(model_name)
    grid_search = GridSearchCV(pipeline, params[model_name], cv=5, scoring='accuracy')
    if model_name == "XGBoost":
        grid_search.fit(X_subset, y_enc)
    else:
        grid_search.fit(X_subset, y)
    
    mean_acc = grid_search.best_score_
    print(f"Mean accuracy for political party: {mean_acc}\n")

Logistic Regression
Mean accuracy for political party: 0.6046345811051694

kNN
Mean accuracy for political party: 0.6276292335115865

Forest
Mean accuracy for political party: 0.6518716577540108

LDA


35 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Library/Frameworks/

Mean accuracy for political party: 0.6342245989304813

QDA




Mean accuracy for political party: 0.616042780748663

SVC
Mean accuracy for political party: 0.6340463458110517

SVM
Mean accuracy for political party: 0.6340463458110517

XGBoost
Mean accuracy for political party: 0.6518716577540108



In [30]:
X_subset = X[['Q4', 'Q6', 'Q17', 'Q9', 'Q18', 'Q10']]

for model_name, pipeline in pipelines.items():
    print(model_name)
    grid_search = GridSearchCV(pipeline, params[model_name], cv=5, scoring='accuracy')
    if model_name == "XGBoost":
        grid_search.fit(X_subset, y_enc)
    else:
        grid_search.fit(X_subset, y)
    
    mean_acc = grid_search.best_score_
    print(f"Mean accuracy for political party: {mean_acc}\n")

Logistic Regression
Mean accuracy for political party: 0.6279857397504456

kNN
Mean accuracy for political party: 0.6044563279857398

Forest
Mean accuracy for political party: 0.6695187165775401

LDA


35 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Library/Frameworks/

Mean accuracy for political party: 0.6340463458110517

QDA




Mean accuracy for political party: 0.62174688057041

SVC
Mean accuracy for political party: 0.6458110516934047

SVM
Mean accuracy for political party: 0.6459893048128343

XGBoost
Mean accuracy for political party: 0.6636363636363637



In [32]:
X_subset = X[['Q4', 'Q6', 'Q17', 'Q9', 'Q18', 'Q10']]

X_train, X_test, y_train, y_test = train_test_split(X_subset, y_enc, test_size=0.2, random_state=42)

xgb = GridSearchCV(pipeline_xgb, params_xgb, cv=5, scoring='accuracy')
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print("Accuracy between predicted values and actual values:", acc)

Accuracy between predicted values and actual values: 0.5294117647058824


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.2, random_state=42)

rf = GridSearchCV(pipeline_forest, params_forest, cv=5, scoring='accuracy')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print("Accuracy between predicted values and actual values:", acc)

Accuracy between predicted values and actual values: 0.5294117647058824


In [34]:
lda = GridSearchCV(pipeline_lda, params_lda, cv=5, scoring='accuracy')
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print("Accuracy between predicted values and actual values:", acc)

Accuracy between predicted values and actual values: 0.5


35 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Library/Frameworks/

In [38]:
X_subset = X[['Q4', 'Q6', 'Q17', 'Q9', 'Q18']]

X_train, X_test, y_train, y_test = train_test_split(X_subset, y_enc, test_size=0.2, random_state=42)

xgb = GridSearchCV(pipeline_xgb, params_xgb, cv=5, scoring='accuracy')
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print("Accuracy between predicted values and actual values:", acc)

Accuracy between predicted values and actual values: 0.5588235294117647


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.2, random_state=42)

rf = GridSearchCV(pipeline_forest, params_forest, cv=5, scoring='accuracy')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print("Accuracy between predicted values and actual values:", acc)

Accuracy between predicted values and actual values: 0.5294117647058824


In [40]:
df_house_test = pd.read_csv('./Data/classification_test.csv')
df_house_test.head()

Unnamed: 0,id_num,Q1,Q2,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,2,Female,78,Conservative,College degree,White,Yes,Yes,No,"Yes, very religious",Pro-Choice,Yes,Yes,Behave no differently,4,5,1,Yes
1,3,Male,59,Moderate,High school or less,Black,Yes,Yes,Yes,"Yes, very religious",Pro-Choice,No,No,More Willing,5,4,5,No
2,4,Male,59,Moderate,High school or less,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,No,Behave no differently,4,5,1,Yes
3,6,Male,52,Moderate,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,No,Yes,Less Willing,5,4,4,No
4,11,Female,33,Moderate,High school or less,White,No,No,Yes,"Yes, somewhat religious",Pro-Choice,No,No,More Willing,5,5,4,Yes


In [46]:
test_data = df_house_test[['Q4', 'Q6', 'Q17', 'Q9', 'Q18']]

num_predictions = xgb.predict(test_data)
pred = label_encoder.inverse_transform(num_predictions)

result_df = pd.DataFrame(
    {"id_num": df_house_test['id_num'],
    "political_affiliation_predicted": pred}
)

result_df.head()

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Democrat
2,4,Democrat
3,6,Independent
4,11,Independent


In [47]:
result_df.to_csv('classification.csv', index=False)