In [1]:
from helpers import load_csv_data, create_csv_submission
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from common import batch_iter, sigmoid
from mie import build_k_indices, cross_validation, logistic_loss, train_val_split, subsample_class
from performance import accuracy, f1_score

Load data

In [2]:
# Training arrays saved under processed/: features, labels and sample ids.
x_train_norm = np.load("processed/x_train.npy")
y_train = np.load("processed/y_train.npy")
train_ids = np.load("processed/train_ids.npy")

# Test arrays saved under processed/: features and the ids used for the submission.
x_test_norm = np.load("processed/x_test.npy")
test_ids = np.load("processed/test_ids.npy")

Logistic regression

In [3]:
# Train/validation split (val_ratio=0.2).
# The split outputs get their own names so that `y_train` (the full label
# array loaded earlier) is NOT overwritten. Rebinding `y_train` here would make
# this cell non-idempotent: a second run would pass the full feature matrix
# together with the already-split labels to `train_val_split`.
X_train_split, y_train_split, X_val, y_val = train_val_split(
    x_train_norm, y_train, val_ratio=0.2
)

# Subsample the training split only; the validation split is left untouched.
# NOTE(review): target_ratio=1.0 presumably means a 1:1 class balance —
# confirm against `mie.subsample_class`.
X_bal, y_bal = subsample_class(X_train_split, y_train_split, target_ratio=1.0)


Cross validation

In [4]:
# Fold layout over the subsampled training data, plus the shared starting
# weights: a zero column vector with one entry per feature column of X_bal.
k_fold = 5
k_indices = build_k_indices(y_bal, k_fold, seed=42)
initial_w = np.zeros((X_bal.shape[1], 1))

# Per-fold results: fitted weights, the loss returned by `cross_validation`,
# and the loss of those weights on the held-out validation split.
ws = []
loss_tr = []
loss_val = []

for k in range(k_fold):
    w, fold_train_loss = cross_validation(
        y_bal, X_bal, k_indices, k, initial_w,
        max_iters=1000, gamma=0.001, lambda_=0.2,
    )
    ws.append(w)
    loss_tr.append(fold_train_loss)
    # Every fold's weights are scored on the same (X_val, y_val) hold-out set.
    loss_val.append(logistic_loss(y_val, X_val, w))

# Final weights: element-wise mean of the per-fold weight arrays.
w_best = np.mean(ws, axis=0)

Test

In [5]:
# Score the test set with the fold-averaged weights.
y_pred_prob = sigmoid(x_test_norm @ w_best)

# Threshold at 0.5 and encode the two classes as 1 / -1.
# `initial_w` is created as a (n_features, 1) column, so if the fitted weights
# keep that shape the predictions come out as (n_samples, 1). Flatten to 1-D so
# that each prediction is a scalar when the submission writer iterates over
# them (converting a 1-element array to int is deprecated in recent NumPy).
# `ravel` is a no-op when the array is already 1-D.
y_pred = np.where(y_pred_prob >= 0.5, 1, -1).ravel()

create_csv_submission(test_ids, y_pred, 'Reg_Logistic_1')