# 1. Logistic Regression

In this notebook we train a logistic regression classifier. The purpose is two-fold: to establish a baseline classifier for later comparison, and to evaluate some of the feature engineering.

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from process_data import process

In [3]:
from sklearn.linear_model import ElasticNetCV

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

## 1.1 Original data set with minor data cleaning

In [5]:
# Load the original training set; the path is relative to the notebook's working directory.
data = pd.read_csv('data/train.csv')

In [6]:
# Apply the project's cleaning step from process_data.
# NOTE(review): the return value is not assigned, so this presumably cleans
# `data` in place — confirm against process_data.process.
process(data)

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(data, random_state=42, test_size=0.2)

In [8]:
# Features are every column except the last; the last column is used as the label.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` performs the
# same positional slicing explicitly.
X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [9]:
# Same feature/label split for the held-out rows (label = last column).
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` performs the
# same positional slicing explicitly.
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

#### Logistic Regression with Elastic Net

In [10]:
# Random search over the elastic-net mixing parameter: ten draws of l1_ratio
# from the global NumPy RNG (uniform on [0, 1)), each fitted on the training
# split and scored by ROC AUC on the held-out split.
# NOTE(review): the RNG is not seeded here, so the sampled ratios change from
# run to run.
# NOTE(review): ElasticNetCV is a linear regression estimator, so `predict`
# returns continuous scores (used only for ranking by the AUC), not logistic
# probabilities — confirm this is the intended model.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; this cell needs an older release.
lg_classifiers = []  # each entry: [fitted model, test ROC AUC]
shared_enet_kwargs = dict(n_alphas=200, max_iter=2000, normalize=True)
for _ in range(10):
    c = np.random.rand()
    lg = ElasticNetCV(l1_ratio=c, **shared_enet_kwargs)
    lg.fit(X_train, Y_train)
    Y_pred = lg.predict(X_test)
    score = roc_auc_score(Y_test, Y_pred)
    lg_classifiers.append([lg, score])
    print('l1_ratio = {:.9f} --> auc_roc = {:.6f}'.format(c, score))

l1_ratio = 0.6866115388897356 --> auc_roc = 0.776399
l1_ratio = 0.8016197177628893 --> auc_roc = 0.776800
l1_ratio = 0.11353408975563128 --> auc_roc = 0.774496
l1_ratio = 0.11858459946208322 --> auc_roc = 0.774518
l1_ratio = 0.6722386791640659 --> auc_roc = 0.776319
l1_ratio = 0.685400847619186 --> auc_roc = 0.776396
l1_ratio = 0.9157647804499023 --> auc_roc = 0.777335
l1_ratio = 0.4138875106123888 --> auc_roc = 0.775716
l1_ratio = 0.6167556741695351 --> auc_roc = 0.776299
l1_ratio = 0.5361370994140355 --> auc_roc = 0.775769


Since the $l^1$-ratios with the best performance are close to 1, we try to find a better ratio by sampling only in a narrow range just below 1 (between 0.8 and 1).

In [11]:
# Refined random search: ten draws of l1_ratio from (0.8, 1], i.e. a small
# perturbation below pure L1. Each model is fitted on the training split and
# scored by ROC AUC on the held-out split.
# NOTE(review): the global NumPy RNG is not seeded, so the sampled ratios
# change from run to run.
lg_classifiers = []  # each entry: [fitted model, test ROC AUC]
for _ in range(10):
    c = 1 - 0.2*np.random.rand()
    lg = ElasticNetCV(l1_ratio=c,
                      n_alphas=200,
                      max_iter=2000,
                      normalize=True)
    lg.fit(X_train, Y_train)
    Y_pred = lg.predict(X_test)
    score = roc_auc_score(Y_test, Y_pred)
    lg_classifiers.append([lg, score])
    # The previous spec '{:9.f}' is not a valid format specifier (a '.' must be
    # followed by the precision) and raised ValueError after the first fit;
    # '{:.9f}' prints nine decimals, matching the other search cells.
    print('l1_ratio = {:.9f} --> auc_roc = {:.6f}'.format(c, score))

l1_ratio = 0.9075163307400439 --> auc_roc = 0.777294
l1_ratio = 0.9727656375037396 --> auc_roc = 0.777528
l1_ratio = 0.8330539479068436 --> auc_roc = 0.776846
l1_ratio = 0.8832114385072016 --> auc_roc = 0.777159
l1_ratio = 0.8868300892944662 --> auc_roc = 0.777180
l1_ratio = 0.841417195642955 --> auc_roc = 0.776911
l1_ratio = 0.8904570981468333 --> auc_roc = 0.777201
l1_ratio = 0.8171150458225659 --> auc_roc = 0.776901
l1_ratio = 0.9968749834019057 --> auc_roc = 0.777693
l1_ratio = 0.9392125041743142 --> auc_roc = 0.777350


In [12]:
# Save the [model, score] pairs for later use.
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving an open handle behind for the garbage collector.
with open("models/lg_classifier_param1.dat", 'wb') as model_file:
    pickle.dump(lg_classifiers, model_file)

## 1.2 Data with 'saldo' Engineering

In [13]:
# Load the training set extended with the engineered 'saldo' features.
data_saldo = pd.read_csv('data/train_extended_saldo.csv')
# Per the original author's note, this file is stored already "processed",
# which is why `process` is not applied to it here.

In [14]:
# New feature counting zero entries
# For every row, count how many of the columns in `original_features` (all
# columns except the last) are equal to 0, and insert that count as 'SumZeros'
# at the position of the last column, so the previous last column stays last.
# NOTE(review): this modifies `data`, not the `data_saldo` frame that the
# following cells split and train on — confirm which frame was intended.
# NOTE(review): `insert` mutates `data` in place; re-running this cell is
# expected to fail because the 'SumZeros' column already exists.
original_features = data.columns[:-1]
data.insert(len(original_features),'SumZeros',(data[original_features] == 0).sum(axis=1))

In [15]:
# New feature describing the number of assets
# Columns are selected by substring match: any column whose name contains
# 'ind' is summed row-wise.
# NOTE(review): presumably these are 0/1 indicator columns, so the sum is a
# count — verify against the data dictionary.
asset_features = [name for name in data.columns if 'ind' in name]
temp = data[asset_features].sum(axis=1)
# Insert just before the last column so the previous last column stays last.
# NOTE(review): like 'SumZeros', this is added to `data`, not to `data_saldo`,
# and the in-place insert is expected to fail if the cell is run twice.
data.insert(data.shape[1]-1, 'NumAssets', temp)

In [16]:
# Same 80/20 split and seed as for the original data, now on the extended 'saldo' frame.
train, test = train_test_split(data_saldo, random_state=42, test_size=0.2)

In [17]:
# Features are every column except the last; the last column is used as the label.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` performs the
# same positional slicing explicitly.
X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [18]:
# Same feature/label split for the held-out rows (label = last column).
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` performs the
# same positional slicing explicitly.
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

#### Logistic Regression with Elastic Net

In [19]:
# Random search on the extended 'saldo' data: ten draws of l1_ratio from
# (0.8, 1], each fitted on the training split and scored by ROC AUC on the
# held-out split.
# NOTE(review): the global NumPy RNG is not seeded, so the sampled ratios
# change from run to run and differ from those used in the other sections.
lg_classifiers = []  # each entry: [fitted model, test ROC AUC]
shared_enet_kwargs = dict(n_alphas=200, max_iter=2000, normalize=True)
for _ in range(10):
    c = 1 - 0.2*np.random.rand()
    lg = ElasticNetCV(l1_ratio=c, **shared_enet_kwargs)
    lg.fit(X_train, Y_train)
    Y_pred = lg.predict(X_test)
    score = roc_auc_score(Y_test, Y_pred)
    lg_classifiers.append([lg, score])
    print('l1_ratio = {:.9f} --> auc_roc = {:.6f}'.format(c, score))

l1 = 0.8219962396887162 --> auc_roc = 0.780461
l1 = 0.9828240252074011 --> auc_roc = 0.782009
l1 = 0.9754912109211129 --> auc_roc = 0.782059
l1 = 0.8915598639758243 --> auc_roc = 0.781197
l1 = 0.8055577354579954 --> auc_roc = 0.780244




l1 = 0.998665744457267 --> auc_roc = 0.782019
l1 = 0.8842628837116838 --> auc_roc = 0.781138
l1 = 0.8502035566026913 --> auc_roc = 0.780774
l1 = 0.9600876141148942 --> auc_roc = 0.781922
l1 = 0.8491149395157815 --> auc_roc = 0.780763


In [20]:
# Save the [model, score] pairs for later use.
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving an open handle behind for the garbage collector.
with open("models/lg_extended_saldo_classifier_param1.dat", "wb") as model_file:
    pickle.dump(lg_classifiers, model_file)

As we can see, the test AUC increased from about 0.777 to about 0.781 for comparable $l^1$-ratios. Therefore, the inclusion of the new variables seems to help.

### 1.2.1 Dropping the Old 'saldo' Features

In [21]:
# Reload `data_saldo` from the variant of the data set without the old 'saldo' columns.
# This rebinds the name used for the extended frame in the previous section.
data_saldo = pd.read_csv('data/train_saldo.csv')
# Per the original author's note, this file is stored already "processed" and
# with the old saldo features dropped.

In [24]:
# Same 80/20 split and seed as in the earlier sections, now on the reduced 'saldo' frame.
train, test = train_test_split(data_saldo, random_state=42, test_size=0.2)

In [25]:
# Features are every column except the last; the last column is used as the label.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` performs the
# same positional slicing explicitly.
X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [26]:
# Same feature/label split for the held-out rows (label = last column).
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` performs the
# same positional slicing explicitly.
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

#### Logistic Regression with Elastic Net

In [27]:
# Random search on the data without the old 'saldo' columns: ten draws of
# l1_ratio from (0.8, 1], each fitted on the training split and scored by
# ROC AUC on the held-out split.
# NOTE(review): the global NumPy RNG is not seeded, so the sampled ratios
# change from run to run and differ from those used in the other sections.
lg_classifiers = []  # each entry: [fitted model, test ROC AUC]
for _ in range(10):
    c = 1 - 0.2*np.random.rand()
    lg = ElasticNetCV(l1_ratio=c,
                      n_alphas=200,
                      max_iter=2000,
                      normalize=True)
    lg.fit(X_train, Y_train)
    Y_pred = lg.predict(X_test)
    score = roc_auc_score(Y_test, Y_pred)
    lg_classifiers.append([lg, score])
    # The previous spec '{:9f}' meant "width 9, default 6 decimals", which
    # truncated the ratio and padded it with a space; '{:.9f}' prints nine
    # decimals, matching the other search cells.
    print('l1_ratio = {:.9f} --> auc_roc = {:.6f}'.format(c, score))

l1_ratio = 0.9586033793550597 --> auc_roc = 0.791220
l1_ratio = 0.8100157769934385 --> auc_roc = 0.790441
l1_ratio = 0.8993004744788721 --> auc_roc = 0.790897
l1_ratio = 0.8222336070271263 --> auc_roc = 0.790479
l1_ratio = 0.952958551306715 --> auc_roc = 0.791193
l1_ratio = 0.9943966190662108 --> auc_roc = 0.791415
l1_ratio = 0.9867113482390913 --> auc_roc = 0.791372
l1_ratio = 0.9295246885593057 --> auc_roc = 0.791032
l1_ratio = 0.9253136229778186 --> auc_roc = 0.791028
l1_ratio = 0.904533010435251 --> auc_roc = 0.790922


It appears that dropping the old 'saldo' features results in an even bigger increase in performance: the test AUC rises to about 0.791, compared with about 0.781 when the old features are kept.

In [28]:
# Save the [model, score] pairs for later use.
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving an open handle behind for the garbage collector.
with open("models/lg_saldo_classifier_param1.dat", "wb") as model_file:
    pickle.dump(lg_classifiers, model_file)