In [None]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
def run(fold, df, penalty="l2", c=1, output=True, random_state=42, max_iter=100):
    """Fit a logistic regression on all folds but one and score the held-out fold.

    Parameters
    ----------
    fold : value compared against ``df.fold``
        Rows whose ``fold`` equals this value form the validation split; every
        other row goes to the training split.
    df : pandas.DataFrame
        Must contain a ``target`` column (labels) and a ``fold`` column. All
        remaining columns are used as features.
    penalty : str, default "l2"
        Forwarded to ``LogisticRegression(penalty=...)``.
    c : float, default 1
        Forwarded to ``LogisticRegression(C=...)``.
    output : bool, default True
        When true, print the fold's validation and training accuracy.
    random_state : int or None, default 42
        Seed forwarded to ``LogisticRegression``. The ``saga`` solver shuffles
        the data, so without a seed the scores change from run to run. Pass
        ``None`` to get unseeded behaviour.
    max_iter : int, default 100
        Iteration cap forwarded to ``LogisticRegression`` (100 is
        scikit-learn's own default). This notebook silences all warnings
        globally, so a non-converged fit would not be reported; raise this
        value if convergence is in doubt.

    Returns
    -------
    tuple
        ``(val_accuracy, train_accuracy)`` as computed by ``accuracy_score``.

    Raises
    ------
    ValueError
        If the requested fold leaves the training or validation split empty.
    """
    # Hold out the requested fold; everything else is training data.
    df_train = df[df.fold != fold].reset_index(drop=True)
    df_valid = df[df.fold == fold].reset_index(drop=True)

    # Fail early with a readable message instead of an error from deep inside
    # scikit-learn when the fold id does not exist (or is the only fold).
    if len(df_train) == 0 or len(df_valid) == 0:
        raise ValueError(
            f"fold={fold!r} gives an empty split "
            f"(train rows={len(df_train)}, valid rows={len(df_valid)})"
        )

    # Features are every column except the label and the fold marker.
    non_feature_cols = ["target", "fold"]
    x_train = df_train.drop(non_feature_cols, axis=1).values
    y_train = df_train.target.values

    x_valid = df_valid.drop(non_feature_cols, axis=1).values
    y_valid = df_valid.target.values

    # 'saga' is used for both penalties so l1 and l2 runs share one solver.
    model = LogisticRegression(
        penalty=penalty,
        C=c,
        solver='saga',
        random_state=random_state,
        max_iter=max_iter,
    )
    model.fit(x_train, y_train)

    # Score both splits so callers can compare train vs. validation accuracy.
    train_accuracy = accuracy_score(y_train, model.predict(x_train))
    val_accuracy = accuracy_score(y_valid, model.predict(x_valid))
    if output:
        print(f"Fold={fold}, Val Acc={val_accuracy}, Train acc={train_accuracy}")
    return val_accuracy, train_accuracy

In [None]:
# Load data
train_data = pd.read_csv("train.csv")

# Every later cell drops/reads these two columns, so fail here with a clear
# message rather than with a KeyError in the middle of cross-validation.
assert {"target", "fold"} <= set(train_data.columns), (
    "train.csv must contain 'target' and 'fold' columns"
)

In [None]:
# Baseline model: run() with its default hyperparameters on folds 0-4
fold_scores = [run(fold_id, train_data) for fold_id in range(5)]

# run() returns (validation accuracy, training accuracy) per fold
valid_acc = [val_score for val_score, _ in fold_scores]
train_acc = [train_score for _, train_score in fold_scores]

print(f"Average val acc={np.mean(valid_acc)}")
print(f"Average train acc={np.mean(train_acc)}")

In [None]:
# Hyperparameter tuning: score every (penalty, C) pair with 5-fold CV
reg = ['l1', 'l2']
c_choices = [1000, 100, 10, 1, 0.1] # C = 1/lambda
avg_val_acc = {}
avg_train_acc = {}

# Same visiting order as a nested loop: penalties outermost, C values innermost
for combo in [(penalty, strength) for penalty in reg for strength in c_choices]:
    penalty, strength = combo

    # One (val, train) accuracy pair per fold; per-fold printing is switched off
    fold_results = [
        run(fold_id, train_data, penalty, strength, False) for fold_id in range(5)
    ]
    val_scores, train_scores = zip(*fold_results)

    avg_val_acc[combo] = np.mean(val_scores)
    avg_train_acc[combo] = np.mean(train_scores)

    print(f"({penalty}, {strength}):")
    print(f"\tTrain: {avg_train_acc[combo]:.5f}")
    print(f"\tVal: {avg_val_acc[combo]:.5f}")

In [None]:
# List the mean validation accuracy of every (penalty, C) combination
for params, mean_val_acc in avg_val_acc.items():
    print(params, mean_val_acc)

In [None]:
# Load the test set; the cells below read its "target" column as labels
test_data = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# The model is fit on a bare array, so column names are lost and features are
# matched purely by position. Select the test columns by the training feature
# names (everything except "target"/"fold") so an extra or reordered column
# in test.csv cannot silently misalign the features; a missing feature raises
# a KeyError here instead.
feature_cols = train_data.drop(["target", "fold"], axis=1).columns
x_test = test_data[feature_cols].values
y_test = test_data.target.values

In [None]:
# Refit on the full training set (all folds) with fixed hyperparameters.
# NOTE(review): penalty/C are hard-coded here - confirm they match the
# combination chosen from the tuning results above.
# 'saga' shuffles the data internally, so seed it to make the reported test
# accuracy reproducible across kernel restarts.
final_model = LogisticRegression(penalty='l2', C=1, solver='saga', random_state=42)
final_model.fit(
    train_data.drop(columns=["target", "fold"]).values,
    train_data.target.values,
)

In [None]:
# Accuracy of the refit model on the test set (bare last expression is the cell output)
test_preds = final_model.predict(x_test)
accuracy_score(y_test, test_preds)