In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from helpers import *
from implementations import *

## Load the training data

In [3]:
# Relative path: resolved against the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv'
# load_csv_data is not defined in this notebook (presumably star-imported from helpers).
# Its three return values are unpacked as y_train, X_train, ids — judging by the names:
# labels, feature matrix and row identifiers (TODO confirm against the helper).
y_train, X_train, ids = load_csv_data(DATA_TRAIN_PATH)

## Load the testing data

In [4]:
# Relative path: resolved against the notebook's working directory.
DATA_TEST_PATH = 'data/test.csv' 
# Same loader as for the training file; the first returned value is discarded
# and only the remaining two are kept as X_test and ids_test.
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [5]:
# Display the shapes of the raw train and test feature arrays side by side.
X_train.shape, X_test.shape

((250000, 30), (568238, 30))

## Preprocessing and feature engineering

In [8]:
# Run the project's preprocess() once over the raw train features, train labels
# and test features; it hands back transformed versions of each plus
# cont_features, which the later build_poly() calls consume.
# NOTE(review): the meaning of imputable_th, encodable_th and switch_encoding is
# defined inside preprocess() and is not visible from this notebook — confirm
# the chosen values against that implementation.
tX_train, ty_train, tX_test, ty_test, cont_features = preprocess(
    X_train,
    y_train,
    X_test,
    imputable_th=0.3,
    encodable_th=0.7,
    switch_encoding=True,
)

In [9]:
# Display the shapes after preprocessing so train and test column counts can be compared.
tX_train.shape, tX_test.shape

((250000, 24), (568238, 24))

## Model selection and hyperparameter tuning

In [10]:
# Search space handed to the cross-validation helper:
#   lambda_       - 5 log-spaced values from 1e-4 to 1
#   degree        - polynomial degrees 1, 2 and 3
#   cont_features - a single candidate: the cont_features value from preprocessing
# NOTE(review): 'max_iters' is a bare scalar while every other entry is a
# collection of candidates — confirm logistic_regression_cv expects that.
param_grid = dict(
    lambda_=np.logspace(start=-4, stop=0, num=5),
    degree=[1, 2, 3],
    max_iters=100,
    cont_features=[cont_features],
)
# Two values come back: the metrics and the parameters of the configuration the
# helper settled on (presumably the best-scoring one — TODO confirm).
metrics, params = logistic_regression_cv(
    ty_train,
    tX_train,
    param_grid=param_grid,
)

In [11]:
# Display the metrics returned by logistic_regression_cv.
metrics

{'loss': 0.48832368060485054,
 'accuracy': 76.5648,
 'f1_score': 0.6261136027521852}

## Run selected model

In [13]:
# Expand the preprocessed training features with the degree chosen during model
# selection, then fit the regularised logistic regression with the selected
# lambda_ on the full training set.
# NOTE(review): the grid search above ran with max_iters=100 while this final
# fit uses max_iters=1000 — confirm the difference is intentional.
tX_train_poly = build_poly(
    tX_train,
    degree=params['degree'],
    cont_features=cont_features,
)
weights, loss = reg_logistic_regression(
    ty_train,
    tX_train_poly,
    max_iters=1000,
    lambda_=params['lambda_'],
)

In [14]:
# Score the fitted weights on the very data they were trained on. These are
# in-sample numbers, so they are not a substitute for the cross-validated
# metrics reported earlier.
ty_train_pred = predict_logistic(weights, tX_train_poly)
train_accuracy, train_f1 = (
    compute_accuracy(ty_train, ty_train_pred),
    compute_f1(ty_train, ty_train_pred),
)
# Last expression of the cell: rendered as the cell output.
(train_accuracy, train_f1)

(78.54560000000001, 0.6573436401967674)

## Prepare test data for prediction

In [15]:
# Apply to the test features the same polynomial expansion (same degree, same
# cont_features) that was applied to the training features, using the same
# keyword-argument call form.
tX_test_poly = build_poly(
    tX_test,
    degree=params['degree'],
    cont_features=cont_features,
)

In [16]:
# Display the shape of the expanded test matrix.
tX_test_poly.shape

(568238, 44)

## Generate predictions and save output in csv format for submission:

In [17]:
import os
from datetime import datetime

# Model name embedded in the submission file name.
method = 'reg_logistic_regression'
# Second-resolution timestamp so each run writes to a distinct file name.
# NOTE(review): the literal 'H' between the date and the hour looks like a
# separator — confirm it is intended. Also note that `time` would shadow the
# stdlib module of the same name if it were imported in this notebook.
time = datetime.now().strftime('%Y%m%dH%H%M%S')
# NOTE(review): the path carries no '.csv' extension — confirm whether
# create_csv_submission appends one or the file is meant to be extension-less.
OUTPUT_PATH = f'submissions/submission_{method}_{time}'
# Create the target directory up front so the write below does not fail on a
# fresh checkout where 'submissions/' does not exist yet; a no-op otherwise.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Predict on the expanded test features with the weights fitted above.
y_pred = predict_logistic(weights, tX_test_poly)
# Map label 0 to -1 before writing (presumably the label encoding the
# submission format expects — TODO confirm).
y_pred = replace_values(y_pred, from_val=0, to_val=-1)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)