In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [46]:
train_df = pd.read_csv("cah-201803-train.csv")
test_df = pd.read_csv("cah-201803-test.csv")

In [47]:
train_df.isnull().sum()

id_num                   0
Q1                       0
Q2                       0
political_affiliation    0
Q4                       0
Q5                       0
Q6                       0
Q7                       0
Q8                       0
Q9                       0
Q10                      0
Q11                      0
Q12                      0
Q13                      0
Q14                      0
Q15                      0
Q16                      0
Q17                      0
Q18                      0
dtype: int64

In [48]:
train_df.columns.tolist()

['id_num',
 'Q1',
 'Q2',
 'political_affiliation',
 'Q4',
 'Q5',
 'Q6',
 'Q7',
 'Q8',
 'Q9',
 'Q10',
 'Q11',
 'Q12',
 'Q13',
 'Q14',
 'Q15',
 'Q16',
 'Q17',
 'Q18']

In [49]:
train_df.dtypes

id_num                    int64
Q1                       object
Q2                        int64
political_affiliation    object
Q4                       object
Q5                       object
Q6                       object
Q7                       object
Q8                       object
Q9                       object
Q10                      object
Q11                      object
Q12                      object
Q13                      object
Q14                      object
Q15                       int64
Q16                       int64
Q17                       int64
Q18                      object
dtype: object

In [50]:
X_train = train_df.drop(["political_affiliation", "id_num"], axis=1)
y_train = train_df["political_affiliation"]
X_test = test_df.drop(["id_num"], axis=1)

In [51]:
label_encoders = {}
categorical_cols = X_train.select_dtypes(include=["object"]).columns

combined_data = {col: pd.concat([X_train[col], X_test[col]], axis=0) 
                for col in categorical_cols}

label_encoders = {col: LabelEncoder().fit(combined_data[col].astype(str)) 
                 for col in categorical_cols}

X_train[categorical_cols] = pd.DataFrame({col: label_encoders[col].transform(X_train[col].astype(str)) 
                                          for col in categorical_cols}, index=X_train.index)

X_test[categorical_cols] = pd.DataFrame({col: label_encoders[col].transform(X_test[col].astype(str)) 
                                        for col in categorical_cols}, index=X_test.index)

In [52]:
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())

In [53]:
y_encoder = LabelEncoder()
y_train_encoded = y_encoder.fit_transform(y_train)

In [54]:
pd.DataFrame({
    "class": y_encoder.classes_,
    "count": [np.sum(y_train == cls) for cls in y_encoder.classes_],
    "percentage": [np.sum(y_train == cls)/len(y_train)*100 for cls in y_encoder.classes_]
})

Unnamed: 0,class,count,percentage
0,Democrat,59,34.911243
1,Independent,56,33.136095
2,Republican,54,31.952663


In [55]:
X_train["feature_sum"] = X_train.sum(axis=1)
X_test["feature_sum"] = X_test.sum(axis=1)

X_train["feature_mean"] = X_train.mean(axis=1)
X_test["feature_mean"] = X_test.mean(axis=1)

In [56]:
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train_encoded, test_size=0.2, stratify=y_train_encoded
)

In [57]:
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

results = {}

In [58]:
lr_base = LogisticRegression(max_iter=1000)

param_grid_lr = {
    "C": [0.1, 1, 10, 50],
    "solver": ["liblinear", "lbfgs"]
}

lr_grid = GridSearchCV(estimator=lr_base, param_grid=param_grid_lr, cv=5, scoring="accuracy", verbose=0)
lr_grid.fit(X_tr_scaled, y_tr)

lr_model = lr_grid.best_estimator_
lr_pred = lr_model.predict(X_val_scaled)
lr_acc = accuracy_score(y_val, lr_pred)

results["Logistic Regression"] = {
    "model": lr_model,
    "accuracy": lr_acc,
    "predictions": lr_pred,
    "scaled": True
}

classification_report(y_val, lr_pred, target_names=y_encoder.classes_)

'              precision    recall  f1-score   support\n\n    Democrat       0.67      0.67      0.67        12\n Independent       0.64      0.64      0.64        11\n  Republican       0.64      0.64      0.64        11\n\n    accuracy                           0.65        34\n   macro avg       0.65      0.65      0.65        34\nweighted avg       0.65      0.65      0.65        34\n'

In [59]:
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_tr_scaled, y_tr)
lda_pred = lda_model.predict(X_val_scaled)
lda_acc = accuracy_score(y_val, lda_pred)

results["LDA"] = {
    "model": lda_model,
    "accuracy": lda_acc,
    "predictions": lda_pred,
    "scaled": True
}

classification_report(y_val, lda_pred, target_names=y_encoder.classes_)

'              precision    recall  f1-score   support\n\n    Democrat       0.60      0.50      0.55        12\n Independent       0.53      0.73      0.62        11\n  Republican       0.78      0.64      0.70        11\n\n    accuracy                           0.62        34\n   macro avg       0.64      0.62      0.62        34\nweighted avg       0.64      0.62      0.62        34\n'

In [60]:
dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=10)
dt_model.fit(X_tr, y_tr)
dt_pred = dt_model.predict(X_val)
dt_acc = accuracy_score(y_val, dt_pred)

results["Decision Tree"] = {
    "model": dt_model,
    "accuracy": dt_acc,
    "predictions": dt_pred,
    "scaled": False
}

classification_report(y_val, dt_pred, target_names=y_encoder.classes_)

'              precision    recall  f1-score   support\n\n    Democrat       0.50      0.58      0.54        12\n Independent       0.47      0.64      0.54        11\n  Republican       1.00      0.45      0.62        11\n\n    accuracy                           0.56        34\n   macro avg       0.66      0.56      0.57        34\nweighted avg       0.65      0.56      0.57        34\n'

In [61]:
rf_simple = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=15,
    min_samples_leaf=5,
    max_features="sqrt"
)
rf_simple.fit(X_tr, y_tr)
rf_simple_pred = rf_simple.predict(X_val)
rf_simple_acc = accuracy_score(y_val, rf_simple_pred)

results["Simple Random Forest"] = {
    "model": rf_simple,
    "accuracy": rf_simple_acc,
    "predictions": rf_simple_pred,
    "scaled": False
}

classification_report(y_val, rf_simple_pred, target_names=y_encoder.classes_)

'              precision    recall  f1-score   support\n\n    Democrat       0.67      0.67      0.67        12\n Independent       0.69      0.82      0.75        11\n  Republican       0.78      0.64      0.70        11\n\n    accuracy                           0.71        34\n   macro avg       0.71      0.71      0.71        34\nweighted avg       0.71      0.71      0.70        34\n'

In [62]:
gb_base = GradientBoostingClassifier()

param_grid_gb = {
    "n_estimators": [50, 100, 150, 200],
    "learning_rate": [0.05, 0.1, 0.2],
    "max_depth": [3, 5]
}

gb_grid = GridSearchCV(estimator=gb_base, param_grid=param_grid_gb, cv=5, scoring="accuracy", verbose=0)
gb_grid.fit(X_tr, y_tr)

gb_model = gb_grid.best_estimator_
gb_pred = gb_model.predict(X_val)
gb_acc = accuracy_score(y_val, gb_pred)

results["Gradient Boosting"] = {
    "model": gb_model,
    "accuracy": gb_acc,
    "predictions": gb_pred,
    "scaled": False
}

classification_report(y_val, gb_pred, target_names=y_encoder.classes_)

'              precision    recall  f1-score   support\n\n    Democrat       0.50      0.42      0.45        12\n Independent       0.40      0.55      0.46        11\n  Republican       0.67      0.55      0.60        11\n\n    accuracy                           0.50        34\n   macro avg       0.52      0.50      0.51        34\nweighted avg       0.52      0.50      0.50        34\n'

In [63]:
knn_model = KNeighborsClassifier(n_neighbors=7)
knn_model.fit(X_tr_scaled, y_tr)
knn_pred = knn_model.predict(X_val_scaled)
knn_acc = accuracy_score(y_val, knn_pred)

results["KNN"] = {
    "model": knn_model,
    "accuracy": knn_acc,
    "predictions": knn_pred,
    "scaled": True
}

classification_report(y_val, knn_pred, target_names=y_encoder.classes_)

'              precision    recall  f1-score   support\n\n    Democrat       0.36      0.33      0.35        12\n Independent       0.38      0.45      0.42        11\n  Republican       0.50      0.45      0.48        11\n\n    accuracy                           0.41        34\n   macro avg       0.42      0.41      0.41        34\nweighted avg       0.41      0.41      0.41        34\n'

In [64]:
nb_model = GaussianNB()
nb_model.fit(X_tr_scaled, y_tr)
nb_pred = nb_model.predict(X_val_scaled)
nb_acc = accuracy_score(y_val, nb_pred)

nb_model = GaussianNB()
nb_model.fit(X_tr_scaled, y_tr)
nb_pred = nb_model.predict(X_val_scaled)
nb_acc = accuracy_score(y_val, nb_pred)

results["Naive Bayes"] = {
    "model": nb_model,
    "accuracy": nb_acc,
    "predictions": nb_pred,
    "scaled": True
}

classification_report(y_val, nb_pred, target_names=y_encoder.classes_)

'              precision    recall  f1-score   support\n\n    Democrat       0.47      0.58      0.52        12\n Independent       0.71      0.45      0.56        11\n  Republican       0.58      0.64      0.61        11\n\n    accuracy                           0.56        34\n   macro avg       0.59      0.56      0.56        34\nweighted avg       0.58      0.56      0.56        34\n'

In [65]:
best_model_name = max(results.items(), key=lambda x: x[1]["accuracy"])[0]
best_model_info = results[best_model_name]
best_model = best_model_info["model"]

pd.DataFrame({
    "metric": ["Best Model", "Validation Accuracy"],
    "value": [best_model_name, f"{best_model_info["accuracy"]:.4f}"]
})

Unnamed: 0,metric,value
0,Best Model,Simple Random Forest
1,Validation Accuracy,0.7059


In [66]:
comparison_df = pd.DataFrame([{"Model": name, "Validation Accuracy": info["accuracy"]} for name, info in results.items()])
comparison_df = comparison_df.sort_values(by="Validation Accuracy", ascending=False).reset_index(drop=True)
comparison_df

Unnamed: 0,Model,Validation Accuracy
0,Simple Random Forest,0.705882
1,Logistic Regression,0.647059
2,LDA,0.617647
3,Decision Tree,0.558824
4,Naive Bayes,0.558824
5,Gradient Boosting,0.5
6,KNN,0.411765


In [67]:
if best_model_info["scaled"]:
    best_model.fit(X_train_scaled, y_train_encoded)
    test_predictions_encoded = best_model.predict(X_test_scaled)
    test_proba = best_model.predict_proba(X_test_scaled)
else:
    best_model.fit(X_train, y_train_encoded)
    test_predictions_encoded = best_model.predict(X_test)
    test_proba = best_model.predict_proba(X_test)

In [68]:
test_predictions = y_encoder.inverse_transform(test_predictions_encoded)

In [69]:
submission = pd.DataFrame({
    "id_num": test_df["id_num"],
    "political_affiliation_predictions": test_predictions
})

submission.to_csv("predictions.csv", index=False)

In [70]:
pred_counts = pd.Series(test_predictions).value_counts()
pd.DataFrame({
    "class": pred_counts.index,
    "count": pred_counts.values,
    "percentage": pred_counts.values / len(test_predictions) * 100
})

Unnamed: 0,class,count,percentage
0,Independent,68,40.963855
1,Democrat,55,33.13253
2,Republican,43,25.903614


In [71]:
cm = confusion_matrix(y_val, best_model_info["predictions"])
cm

array([[8, 3, 1],
       [1, 9, 1],
       [3, 1, 7]])

In [72]:
metrics = {
    "model_name": best_model_name,
    "validation_accuracy": best_model_info["accuracy"],
    "confusion_matrix": cm,
    "class_names": y_encoder.classes_,
    "all_model_results": comparison_df,
    "feature_importance": feature_importance if best_model_name == "Random Forest" else None
}

In [73]:
import os
if not os.path.exists("dashboard_plots"):
    os.makedirs("dashboard_plots")


In [74]:
best_model = best_model_name
best_accuracy = best_model_info["accuracy"] * 100
train_samples = len(train_df)
test_samples = len(test_df)
num_features = X_train.shape[1]

In [75]:
class_dist = train_df["political_affiliation"].value_counts()
class_distribution = (class_dist / len(train_df) * 100).round(1).to_dict()

In [76]:
model_accuracies = {row["Model"]: row["Validation Accuracy"]*100
                    for _, row in comparison_df.iterrows()}

model_acc_df = pd.DataFrame([
    {"model": model, "accuracy": acc, "is_best": model == best_model}
    for model, acc in sorted(model_accuracies.items(), key=lambda x: x[1], reverse=True)
])
model_acc_df

Unnamed: 0,model,accuracy,is_best
0,Simple Random Forest,70.588235,True
1,Logistic Regression,64.705882,False
2,LDA,61.764706,False
3,Decision Tree,55.882353,False
4,Naive Bayes,55.882353,False
5,Gradient Boosting,50.0,False
6,KNN,41.176471,False


In [77]:
if best_model_name in ["Random Forest", "Gradient Boosting"]:
    feature_imp_top5 = feature_importance.head(5)
    feature_importance = dict(zip(feature_imp_top5["feature"],
                                 feature_imp_top5["importance"]))
    pd.DataFrame(list(feature_importance.items()), columns=["feature", "importance"])
else:
    feature_importance = {f"Feature_{i}": 0.1 for i in range(1, 6)}
    pd.DataFrame(list(feature_importance.items()), columns=["feature", "importance"])

In [78]:
max_proba = np.max(test_proba, axis=1)

In [79]:
confidence_bins = {
    "0-20%": int(np.sum((max_proba >= 0.0) & (max_proba < 0.2))),
    "20-40%": int(np.sum((max_proba >= 0.2) & (max_proba < 0.4))),
    "40-60%": int(np.sum((max_proba >= 0.4) & (max_proba < 0.6))),
    "60-80%": int(np.sum((max_proba >= 0.6) & (max_proba < 0.8))),
    "80-100%": int(np.sum((max_proba >= 0.8) & (max_proba <= 1.0)))
}

pd.DataFrame([
    {"confidence_range": bin_name, "count": count, "percentage": count / len(max_proba) * 100}
    for bin_name, count in confidence_bins.items()
])

Unnamed: 0,confidence_range,count,percentage
0,0-20%,0,0.0
1,20-40%,11,6.626506
2,40-60%,122,73.493976
3,60-80%,32,19.277108
4,80-100%,1,0.60241


In [80]:
fig, ax = plt.subplots(figsize=(10, 6))

models = list(model_accuracies.keys())
accuracies = list(model_accuracies.values())

sorted_indices = np.argsort(accuracies)[::-1]
models_sorted = [models[i] for i in sorted_indices]
accuracies_sorted = [accuracies[i] for i in sorted_indices]

colors = ["#10B981" if model == best_model else "#4B4BA2" for model in models_sorted]

bars = ax.barh(models_sorted, accuracies_sorted, color=colors)
ax.set_xlabel("Validation Accuracy (%)", fontsize=12, fontweight="bold")
ax.set_title("Model Performance Comparison", fontsize=14, fontweight="bold")
ax.set_xlim(0, 100)

[ax.text(acc + 1, i, f"{acc:.1f}%", va="center", fontweight="bold") for i, acc in enumerate(accuracies_sorted)]

plt.tight_layout()
plt.savefig("dashboard_plots/model_comparison.png", dpi=300, bbox_inches="tight")
plt.close()

In [81]:
models = list(model_accuracies.keys())
accuracies = list(model_accuracies.values())

sorted_indices = np.argsort(accuracies)[::-1]
models_sorted_full = [models[i] for i in sorted_indices]
accuracies_sorted_full = [accuracies[i] for i in sorted_indices]

top_n = 3
models_sorted = models_sorted_full[:top_n]
accuracies_sorted = accuracies_sorted_full[:top_n]

fig, ax = plt.subplots(figsize=(10, 4))

colors = ["#10B981" if model == best_model else "#4B4BA2" for model in models_sorted]

bars = ax.barh(models_sorted, accuracies_sorted, color=colors)
ax.set_xlabel("Validation Accuracy (%)", fontsize=12, fontweight="bold")
ax.set_title(f"Top {top_n} Model Performance Comparison", fontsize=14, fontweight="bold")
ax.set_xlim(0, 100)

[ax.text(acc + 1, i, f"{acc:.1f}%", va="center", fontweight="bold") for i, acc in enumerate(accuracies_sorted)]

ax.invert_yaxis()

plt.tight_layout()
plt.savefig("dashboard_plots/model_comparison.png", dpi=300, bbox_inches="tight")
plt.close()

In [82]:
fig, ax = plt.subplots(figsize=(8, 8))

classes = list(class_distribution.keys())
percentages = list(class_distribution.values())

colors = ["#3B82F6", "#EF4444", "#A855F7"]
explode = [0.05] * len(classes)

wedges, texts, autotexts = ax.pie(percentages,
                                    labels=classes,
                                    autopct="%1.1f%%",
                                    startangle=90,
                                    colors=colors,
                                    explode=explode,
                                    textprops={"fontsize": 12, "fontweight": "bold"})

ax.set_title("Target Class Distribution\n(Training Data)",
             fontsize=14, fontweight="bold", pad=20)

plt.tight_layout()
plt.savefig("dashboard_plots/class_distribution.png", dpi=300, bbox_inches="tight")
plt.close()

In [83]:
if best_model_name in ["Random Forest", "Gradient Boosting"]:
    fig, ax = plt.subplots(figsize=(10, 6))

    features = list(feature_importance.keys())
    importances = list(feature_importance.values())

    bars = ax.barh(features, importances, color="#664E9E")
    ax.set_xlabel("Importance Score", fontsize=12, fontweight="bold")
    ax.set_title(f"Top 5 Feature Importances\n({best_model_name})",
                 fontsize=14, fontweight="bold")

    for i, (bar, imp) in enumerate(zip(bars, importances)):
        ax.text(imp + 0.002, i, f"{imp:.4f}", va="center", fontweight="bold")

    plt.tight_layout()
    plt.savefig("dashboard_plots/feature_importance.png", dpi=300, bbox_inches="tight")
    plt.close()
else:
    feature_importance = {f"Feature_{i}": 0.1 for i in range(1, 6)}

In [84]:
fig, ax = plt.subplots(figsize=(10, 6))

bins = list(confidence_bins.keys())
counts = list(confidence_bins.values())

ax.plot(bins, counts, marker="o", linewidth=2, markersize=10, color="#4B4BA2")
ax.fill_between(range(len(bins)), counts, alpha=0.3, color="#4B4BA2")
ax.set_xlabel("Confidence Range", fontsize=12, fontweight="bold")
ax.set_ylabel("Number of Predictions", fontsize=12, fontweight="bold")
ax.set_title("Prediction Confidence Distribution", fontsize=14, fontweight="bold")
ax.grid(True, alpha=0.3)

[ax.text(i, count + 2, str(count), ha="center", fontweight="bold") for i, count in enumerate(counts)]

plt.tight_layout()
plt.savefig("dashboard_plots/confidence_distribution.png", dpi=300, bbox_inches="tight")
plt.close()

In [85]:
fig, ax = plt.subplots(figsize=(8, 6))

sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=y_encoder.classes_, yticklabels=y_encoder.classes_,
            cbar_kws={"label": "Count"}, ax=ax)

ax.set_xlabel("Predicted Label", fontsize=12, fontweight="bold")
ax.set_ylabel("True Label", fontsize=12, fontweight="bold")
ax.set_title(f"Confusion Matrix\n({best_model_name})",
             fontsize=14, fontweight="bold")

plt.tight_layout()
plt.savefig("dashboard_plots/confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.close()

In [86]:
dashboard_summary = pd.DataFrame({
    "Metric": ["Best Model", "Validation Accuracy", "Training Samples",
               "Test Samples", "Number of Features", "Mean Confidence"],
    "Value": [best_model, f"{best_accuracy:.1f}%", train_samples,
              test_samples, num_features, f"{np.mean(max_proba):.3f}"]
})

dashboard_summary

Unnamed: 0,Metric,Value
0,Best Model,Simple Random Forest
1,Validation Accuracy,70.6%
2,Training Samples,169
3,Test Samples,166
4,Number of Features,19
5,Mean Confidence,0.536


In [87]:
comparison_df["Accuracy_Percentage"] = comparison_df["Validation Accuracy"] * 100
comparison_df

Unnamed: 0,Model,Validation Accuracy,Accuracy_Percentage
0,Simple Random Forest,0.705882,70.588235
1,Logistic Regression,0.647059,64.705882
2,LDA,0.617647,61.764706
3,Decision Tree,0.558824,55.882353
4,Naive Bayes,0.558824,55.882353
5,Gradient Boosting,0.5,50.0
6,KNN,0.411765,41.176471


In [88]:
summary_df = pd.DataFrame({
    "Metric": ["Best Model", "Validation Accuracy (%)", "Training Samples", 
               "Test Samples", "Features Used", "Mean Prediction Confidence"],
    "Value": [best_model, f"{best_accuracy:.1f}", train_samples, 
              test_samples, num_features, f"{np.mean(max_proba):.3f}"]
})

top5_models = comparison_df.head(5).copy()
top5_models["Accuracy_Percentage"] = (top5_models["Validation Accuracy"] * 100).round(1)

class_dist_df = pd.DataFrame([
    {"Class": cls, "Percentage": pct}
    for cls, pct in sorted(class_distribution.items(), key=lambda x: x[1], reverse=True)
])

feature_imp_df = pd.DataFrame([
    {"Feature": feat, "Importance": imp}
    for feat, imp in list(feature_importance.items())[:5]
])

summary_df, top5_models, class_dist_df, feature_imp_df

(                       Metric                 Value
 0                  Best Model  Simple Random Forest
 1     Validation Accuracy (%)                  70.6
 2            Training Samples                   169
 3                Test Samples                   166
 4               Features Used                    19
 5  Mean Prediction Confidence                 0.536,
                   Model  Validation Accuracy  Accuracy_Percentage
 0  Simple Random Forest             0.705882                 70.6
 1   Logistic Regression             0.647059                 64.7
 2                   LDA             0.617647                 61.8
 3         Decision Tree             0.558824                 55.9
 4           Naive Bayes             0.558824                 55.9,
          Class  Percentage
 0     Democrat        34.9
 1  Independent        33.1
 2   Republican        32.0,
      Feature  Importance
 0  Feature_1         0.1
 1  Feature_2         0.1
 2  Feature_3         0.1
 3  Feat