In [3]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from helpers import *
from implementations import *

## Load the training data

In [4]:
DATA_TRAIN_PATH = 'data/train.csv'
y_train, X_train, ids = load_csv_data(DATA_TRAIN_PATH)

## Load the testing data

In [5]:
DATA_TEST_PATH = 'data/test.csv' 
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [6]:
X_train.shape, X_test.shape

((250000, 30), (568238, 30))

## Preprocessing and feature engineering

In [26]:
tX_train_linear, ty_train_linear, tX_test_linear, ty_test_linear, cont_features_linear = preprocess(X_train, y_train, X_test, imputable_th=0.3, encodable_th=0.7, switch_encoding=False)

In [21]:
tX_train.shape, tX_test.shape

((250000, 24), (568238, 24))

## Model selection and hyperparameter tuning

In [22]:
least_squares(y_train, X_train)

(array([ 8.03494319e-05, -7.20202272e-03, -6.05417273e-03, -5.47559067e-04,
        -1.93874699e-02,  4.73451619e-04, -2.60379055e-02,  3.25106300e-01,
        -3.80780232e-05, -2.72736393e+00, -2.21220140e-01,  9.50794092e-02,
         6.40351615e-02,  2.73562361e+00, -3.31801214e-04, -9.54325125e-04,
         2.74038035e+00, -5.34164961e-04,  9.73498639e-04,  3.69225052e-03,
         3.54487397e-04, -5.43344601e-04, -3.30448035e-01, -1.40800497e-03,
         8.31432879e-04,  1.02117272e-03, -1.68047417e-03, -5.83664813e-03,
        -1.11087999e-02,  2.72782386e+00]),
 0.3396868094770736)

In [31]:
param_grid = {

    'max_iters': 100,
    'cont_features': [cont_features_linear]
}
metrics_ls, params_ls = least_squares_cv(ty_train_linear, tX_train_linear, param_grid)

LinAlgError: Singular matrix

In [24]:
metrics_ls

{'loss': 0.3397894343482778, 'accuracy': 74.43039999999999, 'f1_score': 1.0}

In [22]:
metrics_ls_gd, params_ls_gd = grid_search_cv(y_train, X_train, model_fn=least_squares_GD, loss_fn=compute_mse, predict_fn=predict_linear,
                          param_grid=linear_param_grid, transform_fn=build_poly, k_fold=5, seed=1)

In [25]:
metrics_ls_gd

{'loss': nan, 'accuracy': 0.0, 'f1_score': nan}

In [20]:
metrics_ls_SGD, params_ls_SGD = grid_search_cv(y_train, X_train, model_fn=least_squares_SGD, loss_fn=compute_mse, predict_fn=predict_linear,
                          param_grid=linear_param_grid, transform_fn=build_poly, k_fold=5, seed=1)

In [26]:
metrics_ls_SGD

{'loss': nan, 'accuracy': 0.0, 'f1_score': nan}

In [18]:
ridge_param_grid = {
    'lambda_': np.logspace(-4, 0, 5),
    'degree': list(range(1, 4)),
    'max_iters': 100,
    'cont_features': [cont_features]
}
metrics_ridge, params_ridge = ridge_regression_cv(ty_train, tX_train, param_grid=param_grid)


In [19]:
metrics_ridge

{'loss': 0.35243739600033613, 'accuracy': 73.34799999999998, 'f1_score': 1.0}

In [21]:
log_param_grid = {
    'lambda_': np.logspace(-4, 0, 5),
    'degree': list(range(1, 4)),
    'max_iters': 100,
    'cont_features': [cont_features]
}
metrics_rlg, params_rlg = logistic_regression_cv(ty_train, tX_train, param_grid=log_param_grid)


In [22]:
metrics_rlg

{'loss': 0.48832368060485065,
 'accuracy': 76.5648,
 'f1_score': 0.6261136027521852}

In [35]:
metrics_lg, params_lg = logistic_regression_cv(ty_train, tX_train, {'lambda_':0, 'degree': list(range(1, 4)),
    'max_iters': 100,
    'cont_features': [cont_features]})

In [36]:
metrics_lg

{'loss': 0.496098732815953,
 'accuracy': 76.1192,
 'f1_score': 0.6243271235036281}

## Run selected model

In [15]:
tX_train_poly_rlg = build_poly(tX_train, degree=params_rlg['degree'], cont_features=cont_features)
weights_rlg, loss_rlg = reg_logistic_regression(ty_train, tX_train_poly_rlg, max_iters=1000, lambda_=params_rlg['lambda_'])

In [16]:
ty_train_pred = predict_logistic(weights_rlg, tX_train_poly_rlg)
train_accuracy = compute_accuracy(ty_train, ty_train_pred)
train_f1 = compute_f1(ty_train, ty_train_pred)
train_accuracy, train_f1

(78.54560000000001, 0.6573436401967674)

## Prepare test data for prediction

In [17]:
tX_test_poly = build_poly(tX_test, params['degree'], cont_features)

In [None]:
tX_test_poly.shape

(568238, 44)

## Generate predictions and save ouput in csv format for submission:

In [None]:
from datetime import datetime
method = 'reg_logistic_regression'
time = datetime.now().strftime('%Y%m%dH%H%M%S')
OUTPUT_PATH = f'submissions/submission_{method}_{time}'
y_pred = predict_logistic(weights, tX_test_poly)
y_pred = replace_values(y_pred, from_val=0, to_val=-1)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)