In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def preprocess_data(dataset):
  """Split a dataset into scaled train/test features and their labels.

  The last column of `dataset` is used as the label and every other column
  as a feature. Rows are split 80% train / 20% test with a fixed
  `random_state` of 42. Object-dtype feature columns are label-encoded, then
  all feature columns are standardized with a scaler fitted on the training
  split only.

  Args:
    dataset: pandas DataFrame whose last column holds the labels.

  Returns:
    A tuple `(X_train, X_test, y_train, y_test)`. The two feature sets are
    whatever `StandardScaler` returns; the label sets are the corresponding
    slices of the label column.
  """
  X = dataset.iloc[:, :-1]  # Features
  y = dataset.iloc[:, -1]   # Labels

  # Names of the categorical (object-dtype) feature columns
  categorical_columns = X.select_dtypes(include=['object']).columns

  # Split data into train and test sets with an 80-20 (train-test) split
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

  # Work on explicit copies so the column assignments below do not write
  # through to (or warn about) the frame the splits were taken from.
  X_train = X_train.copy()
  X_test = X_test.copy()

  # Encode categorical features using one LabelEncoder per column. The encoder
  # learns the category vocabulary from the whole column so that a category
  # occurring only in the test rows does not make transform() raise; whenever
  # the training rows already contain every category the codes are unchanged.
  for column in categorical_columns:
      label_encoder = LabelEncoder().fit(X[column])
      X_train[column] = label_encoder.transform(X_train[column])
      X_test[column] = label_encoder.transform(X_test[column])

  # Normalize data using StandardScaler fitted on the training split.
  # NOTE: this scales every feature column, including the label-encoded ones.
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train, X_test, y_train, y_test


Logistic Regression

In [3]:
def logistic_regression(X_train, X_test, y_train, y_test):
  """Fit an L2-penalized logistic regression on the training split.

  `X_test` and `y_test` are accepted so every model builder shares the same
  signature; they are not used here.

  Returns:
    The fitted `LogisticRegression` instance.
  """
  settings = {'penalty': 'l2', 'random_state': 42}
  model = LogisticRegression(**settings)
  model.fit(X_train, y_train)
  return model

K Nearest Neighbor

In [4]:
def k_nearest_neighbor(X_train, X_test, y_train, y_test):
  """Fit a k-nearest-neighbors classifier with default settings.

  Only the training split is used; `X_test` and `y_test` are accepted for
  signature parity with the other model builders.

  Returns:
    The fitted `KNeighborsClassifier` instance.
  """
  model = KNeighborsClassifier()
  model.fit(X_train, y_train)
  return model

Decision Tree

In [5]:
def decision_tree(X_train, X_test, y_train, y_test):
  """Fit a decision tree classifier (seeded with 42) on the training split.

  `X_test` and `y_test` are part of the shared builder signature and are
  not used.

  Returns:
    The fitted `DecisionTreeClassifier` instance.
  """
  tree = DecisionTreeClassifier(random_state=42)
  tree.fit(X_train, y_train)
  return tree

SVM

In [6]:
def support_vector_machine(X_train, X_test, y_train, y_test):
  """Fit a linear-kernel SVC (C=1.0, seed 42) on the training split.

  `X_test` and `y_test` are accepted for signature parity with the other
  model builders and are not used.

  Returns:
    The fitted `SVC` instance.
  """
  svm_options = dict(kernel='linear', C=1.0, random_state=42)
  model = SVC(**svm_options)
  model.fit(X_train, y_train)
  return model

Random Forest

In [7]:
def random_forest(X_train, X_test, y_train, y_test, n_estimators=100, max_depth=None):
  """Fit a random forest classifier (seed 42) on the training split.

  Args:
    X_train, y_train: training features and labels used for fitting.
    X_test, y_test: unused; kept for the shared builder signature.
    n_estimators: forwarded to `RandomForestClassifier`.
    max_depth: forwarded to `RandomForestClassifier`.

  Returns:
    The fitted `RandomForestClassifier` instance.
  """
  forest = RandomForestClassifier(
      n_estimators=n_estimators,
      max_depth=max_depth,
      random_state=42,
  )
  forest.fit(X_train, y_train)
  return forest

Boosting

In [8]:
def boosting(X_train, X_test, y_train, y_test, n_estimators=50, learning_rate=1.0):
  """Fit an AdaBoost ensemble of depth-1 decision trees on the training split.

  Args:
    X_train, y_train: training features and labels used for fitting.
    X_test, y_test: unused; kept for the shared builder signature.
    n_estimators: forwarded to `AdaBoostClassifier`.
    learning_rate: forwarded to `AdaBoostClassifier`.

  Returns:
    The fitted `AdaBoostClassifier` instance.
  """
  # The weak learner is a decision stump (a tree limited to one level).
  stump = DecisionTreeClassifier(max_depth=1, random_state=42)
  ensemble = AdaBoostClassifier(
      estimator=stump,
      n_estimators=n_estimators,
      learning_rate=learning_rate,
      random_state=42,
  )
  ensemble.fit(X_train, y_train)
  return ensemble

Cross Validation + Testing Results

In [15]:
def evaluate_classifier(clf, X_train, X_test, y_train, y_test, cv=10):
    """Cross-validate a classifier on the training split and score it on the test split.

    Runs `cross_validate` with accuracy, precision, recall, F1 and ROC AUC
    scoring, prints the mean of each metric over the folds, then takes the
    fold estimator with the highest validation accuracy and prints its
    accuracy on `(X_test, y_test)`.

    Note that the lines labelled "Training ..." report the mean scores on the
    held-out validation folds (`cv_results['test_*']`), not scores on the
    data each fold was fitted on.

    Args:
        clf: estimator passed to `cross_validate`.
        X_train, y_train: data used for cross-validation.
        X_test, y_test: hold-out data scored with the best fold estimator.
        cv: cross-validation setting forwarded to `cross_validate`
            (defaults to 10 folds).

    Returns:
        None. All results are printed.
    """
    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    # Honor the caller's `cv` instead of a hard-coded fold count.
    cv_results = cross_validate(clf, X_train, y_train, cv=cv, scoring=scoring, return_estimator=True)

    accuracy = cv_results['test_accuracy'].mean()
    precision = cv_results['test_precision'].mean()
    recall = cv_results['test_recall'].mean()
    f1 = cv_results['test_f1'].mean()
    auc = cv_results['test_roc_auc'].mean()

    print(f"Training Accuracy: {accuracy}")
    print(f"Training Precision: {precision}")
    print(f"Training Recall: {recall}")
    print(f"Training F1 Score: {f1}")
    print(f"Training AUC: {auc}")

    # Find the model with the best accuracy on the validation set. argmax
    # works for any number of folds and, like the previous max(...) scan,
    # picks the first fold when several share the top score.
    best_model_index = int(np.argmax(cv_results['test_accuracy']))
    best_model = cv_results['estimator'][best_model_index]

    # Predict on the test set using the best model
    y_pred_test = best_model.predict(X_test)

    # Print metrics on the test set
    accuracy_test = accuracy_score(y_test, y_pred_test)

    print(f"Test Accuracy: {accuracy_test}")


# Load both tab-separated datasets and build a preprocessed train/test split
# for each one. Each split tuple is (X_train, X_test, y_train, y_test).
dataset1 = pd.read_csv('project3_dataset1.txt', sep='\t')
dataset2 = pd.read_csv('project3_dataset2.txt', sep='\t')
split1 = preprocess_data(dataset1)
split2 = preprocess_data(dataset2)
X_train1, X_test1, y_train1, y_test1 = split1
X_train2, X_test2, y_train2, y_test2 = split2

# For every model family: fit on each dataset's training split, then print the
# cross-validation summary and the hold-out accuracy.

print("Logistic Regression", "Dataset 1", sep="\n")
clf_lr1 = logistic_regression(*split1)
evaluate_classifier(clf_lr1, *split1)
print("\nDataset 2")
clf_lr2 = logistic_regression(*split2)
evaluate_classifier(clf_lr2, *split2)

print("\nK Nearest Neighbor", "Dataset 1", sep="\n")
clf_knn1 = k_nearest_neighbor(*split1)
evaluate_classifier(clf_knn1, *split1)
print("\nDataset 2")
clf_knn2 = k_nearest_neighbor(*split2)
evaluate_classifier(clf_knn2, *split2)

print("\nDecision Tree", "Dataset 1", sep="\n")
clf_dt1 = decision_tree(*split1)
evaluate_classifier(clf_dt1, *split1)
print("\nDataset 2")
clf_dt2 = decision_tree(*split2)
evaluate_classifier(clf_dt2, *split2)

print("\nSupport Vector Machine", "Dataset 1", sep="\n")
clf_svm1 = support_vector_machine(*split1)
evaluate_classifier(clf_svm1, *split1)
print("\nDataset 2")
clf_svm2 = support_vector_machine(*split2)
evaluate_classifier(clf_svm2, *split2)

print("\nRandom Forest", "Dataset 1", sep="\n")
clf_rf1 = random_forest(*split1)
evaluate_classifier(clf_rf1, *split1)
print("\nDataset 2")
clf_rf2 = random_forest(*split2)
evaluate_classifier(clf_rf2, *split2)

print("\nBoosting", "Dataset 1", sep="\n")
clf_boosting1 = boosting(*split1)
evaluate_classifier(clf_boosting1, *split1)
print("\nDataset 2")
clf_boosting2 = boosting(*split2)
evaluate_classifier(clf_boosting2, *split2)

Logistic Regression
Dataset 1
Training Accuracy (10-fold Cross-Validation): 0.9758454106280194

Test Accuracy: 0.9824561403508771
Test Precision: 1.0
Test Recall: 0.9487179487179487
Test F1 Score: 0.9736842105263158
Test AUC: 0.9743589743589743
Dataset 2
Training Accuracy (10-fold Cross-Validation): 0.7363363363363364

Test Accuracy: 0.7096774193548387
Test Precision: 0.7037037037037037
Test Recall: 0.5
Test F1 Score: 0.5846153846153846
Test AUC: 0.6772727272727272

K Nearest Neighbor
Dataset 1
Training Accuracy (10-fold Cross-Validation): 0.9756521739130435

Test Accuracy: 0.9385964912280702
Test Precision: 0.9444444444444444
Test Recall: 0.8717948717948718
Test F1 Score: 0.9066666666666667
Test AUC: 0.9225641025641026
Dataset 2
Training Accuracy (10-fold Cross-Validation): 0.673948948948949

Test Accuracy: 0.5913978494623656
Test Precision: 0.5
Test Recall: 0.3157894736842105
Test F1 Score: 0.3870967741935484
Test AUC: 0.5488038277511963

Decision Tree
Dataset 1
Training Accuracy (10