In this notebook we analyze the performance of SVM and logistic regression classifiers.
We work with two data sets, `train.csv` and `train_saldo.csv`, in order to compare the results obtained on each.

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [42]:
from process_data import process

In [43]:
from sklearn.linear_model import ElasticNetCV

In [44]:
from sklearn.svm import LinearSVC 

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

## Original data set with minor data cleaning

In [46]:
# Load the original training set from the local data directory.
data = pd.read_csv('data/train.csv')

In [47]:
# Run the project's cleaning helper imported from process_data. The return
# value is discarded, so it presumably mutates `data` in place — TODO confirm.
process(data)

In [48]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Split the training frame positionally: every column but the last is a
# feature, the last column is the target. `.iloc` replaces `.ix`, which was
# deprecated in pandas 0.20 and removed in pandas 1.0.
X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [50]:
# Same positional split for the held-out frame: all columns but the last are
# features, the last column is the target. `.iloc` replaces `.ix`, which was
# deprecated in pandas 0.20 and removed in pandas 1.0.
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

In [51]:
# Maps each sampled l1 value to the ROC AUC obtained on the test split.
scores = {}

In [54]:
# Fit ElasticNetCV for 10 randomly drawn l1_ratio values and record the test
# ROC AUC of each fit in `scores`, keyed by the sampled value.
# NOTE(review): no seed is set in the visible cells, so the sampled values
# presumably differ between runs — confirm whether that is intended.
for _ in range(10):
    l1 = np.random.rand()  # uniform draw in [0, 1)
    # Pass l1_ratio by keyword: it is ElasticNetCV's first parameter, but
    # recent scikit-learn releases accept constructor arguments by keyword only.
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; on newer versions scale the features beforehand instead.
    lg = ElasticNetCV(l1_ratio=l1, normalize=True)
    lg.fit(X_train, Y_train)
    # ElasticNetCV is a regressor, so these are continuous predictions that
    # roc_auc_score consumes as ranking scores.
    Y_prob = lg.predict(X_test)
    scores[l1] = roc_auc_score(Y_test, Y_prob)
    print('l1 = {} --> auc_roc = {}'.format(l1, scores[l1]))

l1 = 0.9414741921699933 --> auc_roc = 0.7772643246976229
l1 = 0.07711158588223765 --> auc_roc = 0.7755159231901931
l1 = 0.6003963224868567 --> auc_roc = 0.7761104237188952
l1 = 0.18774531004231587 --> auc_roc = 0.7748835574640769
l1 = 0.6018669548166545 --> auc_roc = 0.7761256600874522
l1 = 0.6377288916538728 --> auc_roc = 0.7763381792133272
l1 = 0.3064339834868276 --> auc_roc = 0.7759460402314617
l1 = 0.4717439229804775 --> auc_roc = 0.7755829632118445
l1 = 0.7553462624524657 --> auc_roc = 0.7764316289404776
l1 = 0.9696597210357096 --> auc_roc = 0.777441066572886


## Data with saldo engineering

In [11]:
# Load the second data set (train_saldo.csv) used in this section.
data_saldo = pd.read_csv('data/train_saldo.csv')

In [None]:
# New feature counting zero entries
original_features = data.columns[:-1]
data.insert(len(original_features),'SumZeros',(data[original_features] == 0).sum(axis=1))b

In [None]:
# New feature describing the number of assets
asset_features = [name for name in data.columns if 'ind' in name]
temp = data[asset_features].sum(axis=1)
data.insert(data.shape[1]-1, 'NumAssets', temp)