In [106]:
import pandas as pd
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [107]:
def preprocess_data(dataset):
  """Split a dataset 80/20, label-encode its text columns and standardize it.

  The last column of ``dataset`` is used as the target and every other
  column as a feature.

  Args:
    dataset: pandas DataFrame whose final column holds the class label.

  Returns:
    A tuple ``(X_train, X_test, y_train, y_test)``. ``X_train`` and
    ``X_test`` are the outputs of ``StandardScaler``; ``y_train`` and
    ``y_test`` are the target splits exactly as produced by
    ``train_test_split``.

  NOTE(review): a category that occurs only in the test split would make
  ``LabelEncoder.transform`` fail -- confirm the datasets cannot hit this.
  """
  X = dataset.iloc[:, :-1]  # Features
  y = dataset.iloc[:, -1]   # Classification

  # Names of the object-dtype (text) feature columns that need encoding.
  categorical_columns = X.select_dtypes(include=['object']).columns

  # 80-20 train-test split of data (fixed seed so the split is repeatable).
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

  # Work on explicit copies: the encoded columns are assigned below, and
  # writing into the split frames directly is what triggers pandas'
  # SettingWithCopyWarning.
  X_train = X_train.copy()
  X_test = X_test.copy()

  # Encode each categorical column with its own encoder, fitted on the
  # training split only and then applied to the test split.
  for column in categorical_columns:
    label_encoder = LabelEncoder()
    X_train[column] = label_encoder.fit_transform(X_train[column])
    X_test[column] = label_encoder.transform(X_test[column])

  # Standardize every feature column (the label-encoded ones included),
  # using statistics learned from the training split only.
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train, X_test, y_train, y_test


Cross Validation + Metrics

In [108]:
def evaluate_classifier(clf, X_train, X_test, y_train, y_test, cv=10):
    """Cross-validate a classifier, print the mean scores, then score one fold model on the test set.

    Args:
        clf: scikit-learn estimator handed to ``cross_validate``.
        X_train: training feature matrix used for cross-validation.
        X_test: held-out feature matrix used for the final accuracy.
        y_train: training targets.
        y_test: held-out targets.
        cv: cross-validation setting forwarded to ``cross_validate``
            (defaults to 10 folds).

    Returns:
        None. All results are printed.

    The values printed with a "Training" prefix are means of the per-fold
    scores reported by ``cross_validate``.
    NOTE(review): the 'precision', 'recall', 'f1' and 'roc_auc' scorers
    presumably require a binary target -- confirm for new datasets.
    """
    # Honour the caller's ``cv`` instead of a hard-coded fold count.
    cv_results = cross_validate(
        clf,
        X_train,
        y_train,
        cv=cv,
        scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
        return_estimator=True,
    )

    # Mean of each per-fold score.
    accuracy = cv_results['test_accuracy'].mean()
    precision = cv_results['test_precision'].mean()
    recall = cv_results['test_recall'].mean()
    f1 = cv_results['test_f1'].mean()
    auc = cv_results['test_roc_auc'].mean()
    print(f"Training Accuracy: {accuracy}")
    print(f"Training Precision: {precision}")
    print(f"Training Recall: {recall}")
    print(f"Training F1 Score: {f1}")
    print(f"Training AUC: {auc}")

    # Pick the fold estimator with the highest fold accuracy. The search
    # range comes from the number of scores actually returned, so it stays
    # correct for any ``cv`` value; ties resolve to the earliest fold.
    fold_accuracy = cv_results['test_accuracy']
    best_model_index = max(range(len(fold_accuracy)), key=lambda i: fold_accuracy[i])
    best_model = cv_results['estimator'][best_model_index]

    # Predict on the test set using that fold's fitted estimator.
    y_pred_test = best_model.predict(X_test)

    # Accuracy on the held-out test set.
    accuracy_test = accuracy_score(y_test, y_pred_test)
    print(f"Test Accuracy: {accuracy_test}")

Logistic Regression

In [109]:
def logistic_regression(X_train, y_train, reg_technique, reg_solver):
  """Build a LogisticRegression with the given penalty and solver, fit it and return it.

  Args:
    X_train: training feature matrix.
    y_train: training targets.
    reg_technique: value passed as ``penalty``.
    reg_solver: value passed as ``solver``.

  Returns:
    The fitted LogisticRegression instance.
  """
  model = LogisticRegression(
      penalty=reg_technique,
      solver=reg_solver,
  )
  model.fit(X_train, y_train)
  return model

K Nearest Neighbor

In [110]:
def k_nearest_neighbor(X_train, y_train, k_value):
  """Build a KNeighborsClassifier with ``k_value`` neighbors, fit it and return it.

  Args:
    X_train: training feature matrix.
    y_train: training targets.
    k_value: value passed as ``n_neighbors``.

  Returns:
    The fitted KNeighborsClassifier instance.
  """
  model = KNeighborsClassifier(n_neighbors=k_value)
  model.fit(X_train, y_train)
  return model

SVM

In [111]:
def support_vector_machine(X_train, y_train, kernel_func):
  """Build an SVC with the requested kernel, fit it and return it.

  Args:
    X_train: training feature matrix.
    y_train: training targets.
    kernel_func: value passed as ``kernel``.

  Returns:
    The fitted SVC instance.
  """
  model = SVC(kernel=kernel_func)
  model.fit(X_train, y_train)
  return model

Decision Tree

In [112]:
def decision_tree(X_train, y_train, depth, random_state=None):
  """Build a DecisionTreeClassifier limited to ``depth`` levels, fit it and return it.

  Args:
    X_train: training feature matrix.
    y_train: training targets.
    depth: value passed as ``max_depth``.
    random_state: optional seed forwarded to the estimator so a run can be
      repeated. The default ``None`` keeps the previous unseeded behaviour.

  Returns:
    The fitted DecisionTreeClassifier instance.
  """
  clf = DecisionTreeClassifier(max_depth=depth, random_state=random_state)
  clf.fit(X_train, y_train)
  return clf

Random Forest

In [113]:
def random_forest(X_train, y_train, trees, random_state=None):
  """Build a RandomForestClassifier with ``trees`` estimators, fit it and return it.

  Args:
    X_train: training feature matrix.
    y_train: training targets.
    trees: value passed as ``n_estimators``.
    random_state: optional seed forwarded to the estimator so a run can be
      repeated. The default ``None`` keeps the previous unseeded behaviour.

  Returns:
    The fitted RandomForestClassifier instance.
  """
  clf = RandomForestClassifier(n_estimators=trees, random_state=random_state)
  clf.fit(X_train, y_train)
  return clf

Boosting

In [114]:
def boosting(X_train, y_train, random_state=None):
  """Build an AdaBoostClassifier over depth-1 decision trees, fit it and return it.

  Args:
    X_train: training feature matrix.
    y_train: training targets.
    random_state: optional seed forwarded to AdaBoostClassifier so a run
      can be repeated. The default ``None`` keeps the previous unseeded
      behaviour.

  Returns:
    The fitted AdaBoostClassifier instance.
  """
  # AdaBoost with a depth-1 Decision Tree as base estimator
  base_estimator = DecisionTreeClassifier(max_depth=1)
  clf = AdaBoostClassifier(estimator=base_estimator, random_state=random_state)
  clf.fit(X_train, y_train)
  return clf

Training + Testing Model Metrics on Datasets

In [None]:
# Load both tab-delimited datasets and build the train/test splits for each.
# NOTE(review): read_csv is called without a `header` argument, so the first
# line of each file is presumably treated as column names -- confirm both
# files really start with a header row.
dataset1 = pd.read_csv('project3_dataset1.txt', delimiter='\t')
dataset2 = pd.read_csv('project3_dataset2.txt', delimiter='\t')
X_train1, X_test1, y_train1, y_test1 = preprocess_data(dataset1)
X_train2, X_test2, y_train2, y_test2 = preprocess_data(dataset2)

# Each section below fits one model family on each dataset and prints its
# metrics through evaluate_classifier.
# NOTE(review): the hyperparameters differ between the two datasets (penalty,
# k, kernel) and are presumably the outcome of an earlier tuning step --
# confirm and record where they came from.
# NOTE(review): evaluate_classifier passes the estimator to cross_validate,
# which presumably refits it per fold, so the fit done inside each helper may
# be redundant work -- confirm before relying on the pre-fitted objects.

# Logistic regression: L2 penalty on dataset 1, L1 penalty on dataset 2.
print("Logistic Regression")
print("Dataset 1")
clf_lr1 = logistic_regression(X_train1, y_train1, 'l2', 'liblinear')
evaluate_classifier(clf_lr1, X_train1, X_test1, y_train1, y_test1)
print("\nDataset 2")
clf_lr2 = logistic_regression(X_train2, y_train2, 'l1', 'liblinear')
evaluate_classifier(clf_lr2, X_train2, X_test2, y_train2, y_test2)

# k-nearest neighbours: k=5 on dataset 1, k=26 on dataset 2.
print("\nK Nearest Neighbor")
print("Dataset 1")
clf_knn1 = k_nearest_neighbor(X_train1, y_train1, 5)
evaluate_classifier(clf_knn1, X_train1, X_test1, y_train1, y_test1)
print("\nDataset 2")
clf_knn2 = k_nearest_neighbor(X_train2, y_train2, 26)
evaluate_classifier(clf_knn2, X_train2, X_test2, y_train2, y_test2)

# SVM: linear kernel on dataset 1, RBF kernel on dataset 2.
print("\nSupport Vector Machine")
print("Dataset 1")
clf_svm1 = support_vector_machine(X_train1, y_train1, 'linear')
evaluate_classifier(clf_svm1, X_train1, X_test1, y_train1, y_test1)
print("\nDataset 2")
clf_svm2 = support_vector_machine(X_train2, y_train2, 'rbf')
evaluate_classifier(clf_svm2, X_train2, X_test2, y_train2, y_test2)

# Decision tree: maximum depth 3 on both datasets.
print("\nDecision Tree")
print("Dataset 1")
clf_dt1 = decision_tree(X_train1, y_train1, 3)
evaluate_classifier(clf_dt1, X_train1, X_test1, y_train1, y_test1)
print("\nDataset 2")
clf_dt2 = decision_tree(X_train2, y_train2, 3)
evaluate_classifier(clf_dt2, X_train2, X_test2, y_train2, y_test2)

# Random forest: 40 trees on both datasets.
print("\nRandom Forest")
print("Dataset 1")
clf_rf1 = random_forest(X_train1, y_train1, 40)
evaluate_classifier(clf_rf1, X_train1, X_test1, y_train1, y_test1)
print("\nDataset 2")
clf_rf2 = random_forest(X_train2, y_train2, 40)
evaluate_classifier(clf_rf2, X_train2, X_test2, y_train2, y_test2)

# AdaBoost over depth-1 trees on both datasets.
print("\nBoosting")
print("Dataset 1")
clf_boosting1 = boosting(X_train1, y_train1)
evaluate_classifier(clf_boosting1, X_train1, X_test1, y_train1, y_test1)
print("\nDataset 2")
clf_boosting2 = boosting(X_train2, y_train2)
evaluate_classifier(clf_boosting2, X_train2, X_test2, y_train2, y_test2)

Logistic Regression
Dataset 1
Training Accuracy: 0.9758454106280194
Training Precision: 0.9791812865497075
Training Recall: 0.9588235294117646
Training F1 Score: 0.9673966935151146
Training AUC: 0.9965155027528253
Test Accuracy: 0.9824561403508771

Dataset 2
Training Accuracy: 0.7361861861861863
Training Precision: 0.6574206349206351
Training Recall: 0.4685897435897436
Training F1 Score: 0.5358642819169136
Training AUC: 0.7652521367521368
Test Accuracy: 0.7204301075268817

K Nearest Neighbor
Dataset 1
Training Accuracy: 0.9756521739130435
Training Precision: 0.99375
Training Recall: 0.9411764705882353
Training F1 Score: 0.9660984848484848
Training AUC: 0.9917379020573748
Test Accuracy: 0.9385964912280702

Dataset 2
Training Accuracy: 0.7171171171171171
Training Precision: 0.602857142857143
Training Recall: 0.2891025641025641
Training F1 Score: 0.3791205397629546
Training AUC: 0.7441207264957266
Test Accuracy: 0.6881720430107527

Support Vector Machine
Dataset 1
Training Accuracy: 0.973