# Logistic Classification and SVM

In this notebook we analyze the performance of SVM and logistic regression classifiers.
We work with two data sets, `train.csv` and `train_saldo.csv`, so that results on the two can be compared.

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from process_data import process

In [4]:
from sklearn.linear_model import ElasticNetCV

In [19]:
from sklearn.svm import SVC
# The data does not appear to be linearly separable, so SVC with non-linear kernels should perform better.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

## Original data set with minor data cleaning

In [7]:
# Load the original training set from a path relative to the notebook's working directory.
data = pd.read_csv('data/train.csv')

In [8]:
# Apply the cleaning step imported from process_data.
# The return value is discarded, so this presumably mutates `data` in place —
# TODO confirm against process_data.process.
process(data)

In [9]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [10]:
# Split the training frame by position: every column except the last is a
# feature, and the last column is the target.
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [11]:
# Split the held-out frame by position: every column except the last is a
# feature, and the last column is the target.
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

### Logistic Regression with Elastic Net

In [12]:
# Random search over the elastic-net mixing parameter: fit 10 models, each with
# a randomly drawn l1_ratio, and score each one by ROC AUC on the held-out split.
# NOTE(review): ElasticNetCV is a regularized linear *regressor*, so Y_pred holds
# continuous scores; roc_auc_score only uses them to rank the test samples.
# NOTE(review): the `normalize` argument was removed in scikit-learn 1.2 — confirm
# the installed version, or scale the features explicitly before fitting.
lg_classifiers = [] # list [[lg, score]]
for _ in range(10):
    # No seed is set in this cell, so the sampled values differ between runs.
    l1 = np.random.rand()
    # Pass the mixing parameter by keyword: estimator constructor arguments are
    # keyword-only in current scikit-learn, and the keyword form also works on
    # older releases.
    lg = ElasticNetCV(l1_ratio=l1, n_alphas=200, max_iter=1000, tol=1e-4, normalize=True)
    lg.fit(X_train,Y_train)
    Y_pred = lg.predict(X_test)
    score = roc_auc_score(Y_test,Y_pred)
    lg_classifiers.append([lg, score])
    print('l1 = {} --> auc_roc = {}'.format(l1, score))

l1 = 0.503062458398916 --> auc_roc = 0.7757108358457353
l1 = 0.7088061293062852 --> auc_roc = 0.7763818568031909
l1 = 0.8420374910457782 --> auc_roc = 0.7769160325986055
l1 = 0.8420770050705878 --> auc_roc = 0.7769164840465628
l1 = 0.46407039051341126 --> auc_roc = 0.7756097115033115
l1 = 0.49800071513799826 --> auc_roc = 0.7756809274185675
l1 = 0.014719766813581892 --> auc_roc = 0.7750337767718514
l1 = 0.23098261285284338 --> auc_roc = 0.7755213405656801
l1 = 0.026785667339553454 --> auc_roc = 0.77808116334527
l1 = 0.19001397334145875 --> auc_roc = 0.7748064727253766


In [14]:
# Save the [model, score] pairs for later use.
# The context manager closes the file handle even if pickling fails; the
# original left the handle returned by open() unclosed.
with open("models/lg_classifier_param1.dat", 'wb') as model_file:
    pickle.dump(lg_classifiers, model_file)

### Support Vector Machine

In [None]:
# Random search over the SVM regularization strength: fit 10 linear-kernel SVCs,
# each with C drawn uniformly from the integers 1..100, and score each one by
# ROC AUC on the held-out split.
# NOTE(review): svm.predict returns hard class labels, so this AUC is computed
# from labels rather than from decision scores — consider svm.decision_function
# if a ranking-based AUC is intended.
svm_classifiers = [] # list [[svm, score]]
for _ in range(10):
    # No seed is set in this cell, so the sampled values differ between runs.
    C = np.random.randint(1,101)
    # Pass C by keyword: estimator constructor arguments are keyword-only in
    # current scikit-learn.
    svm = SVC(C=C, tol=1e-3, kernel='linear', class_weight='balanced')
    svm.fit(X_train,Y_train)
    Y_pred = svm.predict(X_test)
    score = roc_auc_score(Y_test,Y_pred)
    # Store the SVM that was just fitted. The original appended `lg`, the last
    # elastic-net model, so the saved list never contained the SVMs.
    svm_classifiers.append([svm, score])
    print('C = {} --> auc_roc = {}'.format(C, score))

In [None]:
# Save the [model, score] pairs for later use.
# Fixes the misspelled name `svm_classfiers`, which raised NameError, and closes
# the file handle through a context manager.
with open("models/svm_classifier_param1.dat", 'wb') as model_file:
    pickle.dump(svm_classifiers, model_file)

## Data with saldo engineering

In [None]:
# Load the saldo-engineered variant of the training set from a path relative to
# the notebook's working directory.
data_saldo = pd.read_csv('data/train_saldo.csv')

In [None]:
# New feature counting zero entries: for each row, the number of feature columns
# (every column except the last) whose value equals 0. The new column is inserted
# directly before the last column.
# NOTE(review): this cell modifies `data`, not the `data_saldo` frame loaded in
# this section — confirm which frame is intended.
# NOTE(review): re-running this cell raises ValueError, because DataFrame.insert
# refuses to add a column that already exists.
original_features = data.columns[:-1]
data.insert(len(original_features),'SumZeros',(data[original_features] == 0).sum(axis=1))

In [None]:
# Add a 'NumAssets' column holding, for each row, the sum over every column whose
# name contains the substring 'ind'; it is inserted just before the last column.
# Assumes those columns are 0/1 indicators so the sum is a count — TODO confirm.
asset_features = list(filter(lambda name: 'ind' in name, data.columns))
temp = data.loc[:, asset_features].sum(axis=1)
data.insert(loc=data.shape[1] - 1, column='NumAssets', value=temp)