In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score

In [26]:
import os

# Changing directory to project root
os.chdir(r'C:\Users\Oscar\Documents\Workspace\ml-deployment-platform')

In [27]:
train = pd.read_csv(r'data\CAH-201803-train.csv')
test = pd.read_csv(r'data\CAH-201803-test.csv')

In [10]:
# Define X and y
X = train.drop(columns=['id_num', 'political_affiliation'])
y = train['political_affiliation']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# CT
ct = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown = "ignore"), make_column_selector(dtype_include=object)),
     ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
])

In [11]:
# Logistic Regression Pipeline
pipeline_lr = Pipeline(steps=[
    ('preprocessing', ct),
    ('model', LogisticRegression())
])

# Fit the pipeline
pipeline_lr.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = pipeline_lr.predict(X_val)

# Evaluate model
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 0.56


In [12]:
# LDA Pipeline
pipeline_lda = Pipeline(steps=[
    ('preprocessing', ct),
    ('model', LinearDiscriminantAnalysis())
])

# Fit the pipeline
pipeline_lda.fit(X_train, y_train)

# Perform cross-validation
scores = cross_val_score(pipeline_lda, X, y, cv=5, scoring="accuracy")

# Calculate and display cross-validated accuracy
cv_accuracy = scores.mean()
print("Cross-Validated Accuracy for LDA Model:", cv_accuracy)

Cross-Validated Accuracy for LDA Model: 0.5980392156862744


In [13]:
# KNN
pipeline_knn = Pipeline(steps=[
    ('preprocessing', ct),  # Preprocessing with ColumnTransformer
    ('model', KNeighborsClassifier(n_neighbors=5))
])

# Perform cross-validation
scores_knn = cross_val_score(pipeline_knn, X, y, cv=5, scoring="accuracy")

# Calculate cross-validated accuracy
cv_accuracy_knn = scores_knn.mean()
print("Cross-Validated Accuracy for KNN Model:", cv_accuracy_knn)

Cross-Validated Accuracy for KNN Model: 0.4973262032085562


In [15]:
# SVM
# Define hyperparameter grid
param_grid = {
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf', 'poly'],
    'model__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

# Define SVM pipeline
pipeline_svm = Pipeline(steps=[
    ('preprocessing', ct),
    ('model', SVC(random_state=42))
])

# Perform GridSearchCV
grid_search_svm = GridSearchCV(pipeline_svm, param_grid, cv=5, scoring="accuracy", verbose=1)
grid_search_svm.fit(X, y)

# Best parameters and accuracy
best_params = grid_search_svm.best_params_
best_score = grid_search_svm.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validated Accuracy:", best_score)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'model__C': 0.1, 'model__gamma': 'scale', 'model__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.6212121212121212


In [16]:
# Fit the best model on the full training dataset
final_model_fit = grid_search_svm.best_estimator_
final_model_fit.fit(X, y)

final_predictions = pd.DataFrame({
    "id_num": test['id_num'],
    "political_affiliation_predicted": final_model_fit.predict(test.drop(columns=['id_num']))
})
final_predictions.to_csv("final_predictions.csv", index=False)

In [17]:
pipeline_poly_svm = Pipeline(steps=[
    ('preprocessing', ct),  # Preprocessing step
    ('model', SVC(kernel='poly', random_state=42))
])

# Define hyperparameter grid for polynomial kernel
param_grid_poly = {
    'model__C': [0.001 ,0.01, 0.1, 1, 10],
    'model__degree': [2, 3, 4],
    'model__gamma': ['scale', 'auto'],
    'model__coef0': [0, 1, 10]
}

# Perform GridSearchCV
grid_search_poly = GridSearchCV(pipeline_poly_svm, param_grid_poly, cv=5, scoring="accuracy", verbose=1)
grid_search_poly.fit(X, y)

# Best parameters and accuracy
best_params_poly = grid_search_poly.best_params_
best_score_poly = grid_search_poly.best_score_

print("Best Parameters for Polynomial Kernel SVM:", best_params_poly)
print("Best Cross-Validated Accuracy for Polynomial Kernel SVM:", best_score_poly)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best Parameters for Polynomial Kernel SVM: {'model__C': 0.01, 'model__coef0': 10, 'model__degree': 3, 'model__gamma': 'auto'}
Best Cross-Validated Accuracy for Polynomial Kernel SVM: 0.6213903743315508


In [18]:
# Fit best polynomial SVM model on training set
final_model_fit_poly = grid_search_poly.best_estimator_
final_model_fit_poly.fit(X, y)

X_test = test.drop(columns=['id_num'])
final_predictions2 = pd.DataFrame({
    "id_num": test['id_num'],
    "political_affiliation_predicted": final_model_fit_poly.predict(X_test)
})
final_predictions2.to_csv("final_predictions2.csv", index=False)

In [19]:
# LR tune
pipeline_lr = Pipeline([
    ('preprocessing', ct),
    ('logistic', LogisticRegression(max_iter=10000, random_state=42))
])

# Hyperparameter grid
param_grid_logistic = {
    'logistic__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'logistic__penalty': ['l1', 'l2'],       # L1 = Lasso, L2 = Ridge
    'logistic__solver': ['liblinear', 'saga']  # Solvers that support L1 and L2
}

# Perform GridSearchCV
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_logistic, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

# Best Parameters and Cross-Validated Accuracy
best_params_lr= grid_search_lr.best_params_
best_score_lr = grid_search_lr.best_score_

print("Best Parameters for Logistic Regression:", best_params_lr)
print("Best Cross-Validated Accuracy for Logistic Regression:", best_score_lr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters for Logistic Regression: {'logistic__C': 1, 'logistic__penalty': 'l1', 'logistic__solver': 'saga'}
Best Cross-Validated Accuracy for Logistic Regression: 0.6444444444444445


# Saving as .pkl file

In [20]:
# Get the best model from grid search
best_model = grid_search_lr.best_estimator_

# Train it on the full dataset (X, y) not just (X_train, y_train)
best_model.fit(X, y)

0,1,2
,steps,"[('preprocessing', ...), ('logistic', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('dummify', ...), ('standardize', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,10000


In [28]:
import joblib

joblib.dump(best_model, 'models/political_affiliation/saved_model/logistic_regression_classifier.pkl')

['models/political_affiliation/saved_model/logistic_regression_classifier.pkl']

In [None]:
final_model_lr = grid_search_lr.best_estimator_
final_model_lr.fit(X_train, y_train)


X_test = test.drop(columns=['id_num'])
final_predictions3 = pd.DataFrame({
    "id_num": test['id_num'],
    "political_affiliation_predicted": final_model_lr.predict(X_test)
})
final_predictions3.to_csv("final_predictions3.csv", index=False)