In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit

%matplotlib inline

# Import Data

In [7]:
# Load the labelled training set and discard the 'id' column right away.
data = pd.read_csv('./data/train.csv').drop('id', axis=1)

# Features are every column after the first one; the preview below shows
# 'label' in that first position. Both arrays are cast to float64.
X_train = np.asarray(data.iloc[:, 1:], dtype=np.double)
Y_train = np.asarray(data['label'], dtype=np.double)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,label,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
0,1,7,0,3,0,2,3,0,6,0,...,3,4,2,2,0,13,0,11,1,3
1,1,0,11,0,0,10,1,0,0,4,...,0,2,0,0,2,8,1,13,0,4
2,0,9,0,3,0,1,3,0,4,0,...,48,11,2,0,0,4,0,2,0,0
3,0,0,9,3,2,25,0,4,0,0,...,1,14,1,0,0,0,3,0,17,1
4,0,0,0,0,0,2,5,0,0,0,...,3,12,0,3,0,4,0,24,4,0


In [8]:
# Class balance: per-column count of non-null rows for each value of 'label'.
data.groupby('label').count()

Unnamed: 0_level_0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,327,327,327,327,327,327,327,327,327,327,...,327,327,327,327,327,327,327,327,327,327
1,573,573,573,573,573,573,573,573,573,573,...,573,573,573,573,573,573,573,573,573,573


# Fit Methods

In [10]:
# Candidate classifiers to compare; each one is scored by cross-validation
# in the next cell. Hyper-parameters are fixed by hand.
estimators = [
    # random_state pins SGD's per-epoch shuffling so the reported score is
    # reproducible; 33 matches the seed used for the CV split.
    SGDClassifier(max_iter=5000, tol=0.1, penalty='l1', alpha=0.1, random_state=33),
    SVC(kernel='rbf', C=0.1, gamma=2),
    LinearSVC(penalty='l2', dual=False),
    # The solver is named explicitly: the L1 penalty is not supported by the
    # 'lbfgs' default of newer scikit-learn releases, so relying on the
    # default makes this cell fail there. 'liblinear' is what the stored
    # output of the final-classifier cell shows was used originally.
    LogisticRegression(penalty='l1', C=0.2, solver='liblinear'),
]

In [11]:
cv_strategy = ShuffleSplit(n_splits=10, test_size=0.2, random_state=33)
for estimator in estimators:
    %time scoring = cross_val_score(estimator, X_train, Y_train, scoring='roc_auc', cv=cv_strategy)
    print ('%s %.3lf' % (str(estimator).replace('(',' ').split()[0], scoring.mean()), '%.4lf' % scoring.std())

Wall time: 731 ms
SGDClassifier 0.869 0.0200
Wall time: 1.96 s
SVC 0.877 0.0171
Wall time: 5.49 s
LinearSVC 0.913 0.0148
Wall time: 867 ms
LogisticRegression 0.923 0.0188


# Get final classifier

In [12]:
# Final model ("benchmark_1"): L1-regularised logistic regression, the
# best-scoring candidate in the cross-validation comparison above.
# The solver is named explicitly because the L1 penalty is rejected by the
# 'lbfgs' default of newer scikit-learn releases; 'liblinear' is the solver
# shown in this cell's stored output.
# NOTE(review): the stored output of this cell shows C=0.21, so it predates
# the C=0.2 used here -- re-run the cell to refresh it.
estimator = LogisticRegression(penalty='l1', C=0.2, solver='liblinear')  # benchmark_1

# Fit on the full training set; the fitted estimator is the cell output.
estimator.fit(X_train, Y_train)

LogisticRegression(C=0.21, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Predict on test data

In [13]:
# Load the unlabelled test set. The feature matrix is every column after the
# first one (presumably 'id', which is written to the submission file later
# -- confirm against the CSV header).
test = pd.read_csv('./data/test.csv')
X_test = np.asarray(test.iloc[:, 1:])

In [14]:
# Sanity check: display the test feature matrix (NumPy abbreviates large arrays).
X_test

array([[ 4,  2,  2, ...,  6,  2,  3],
       [ 3,  3,  0, ..., 27,  2,  0],
       [ 0,  0,  0, ...,  5,  1,  1],
       ...,
       [ 1,  1,  0, ..., 36,  0,  2],
       [ 1,  3,  2, ..., 35,  1,  0],
       [ 0,  2,  3, ..., 15,  0,  0]], dtype=int64)

In [15]:
# Predict a hard class label for every test row with the fitted final model.
test_predict = estimator.predict(X_test)

# Attach the predictions to the test frame and export only the id/label pair
# as the submission file (no index column).
test = test.assign(label=test_predict)
test.loc[:, ['id', 'label']].to_csv('benchmark_1.csv', sep=',', index=False)