In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import catboost as cat
import xgboost as xgb
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
# path = os.path.join('/', 'data', 'pik_competition', 'sev')
path = '/home/pik/workspace/python_projects/machine_learning/data'
train_path = os.path.join(path, 'xtrain.csv')
test_path = os.path.join(path, 'xtest.csv')
y_train_path = os.path.join(path, 'ytrain.csv')


X_train_df = pd.read_csv(train_path, header=0)
# X_test_df = pd.read_csv(test_path, header=0)
y_train_df = pd.read_csv(y_train_path, header=0)

In [5]:
X_train_df.shape

(900000, 58)

In [6]:
X_train_df['label'] = y_train_df
X_df = X_train_df.dropna(how='any').copy()
y_df = X_df['label']
X_df = X_df.drop('label', axis=1)

In [7]:
X_df.shape

(2462, 58)

In [8]:
X_df.head().T

Unnamed: 0,72,238,384,764,1566
1,209.606526,47.849207,-165.215629,-16.329755,-29.210591
2,12.0,12.0,13.0,12.0,13.0
3,12.0,11.0,10.0,9.0,12.0
4,32.55257,54.547042,2.627286,-16.914013,3.017234
5,0.0,0.0,0.0,0.0,1.0
6,2.0,2.0,2.0,2.0,3.0
7,23.0,23.0,24.0,23.0,23.0
8,-75.465409,-71.092011,-22.863169,-36.600166,-47.469785
9,7.0,8.0,7.0,9.0,8.0
10,1.0,0.0,0.0,0.0,0.0


In [13]:
y_df.head()

72      0
238     0
384     0
764     0
1566    1
Name: label, dtype: int64

In [9]:
unique = X_df.nunique()
dumming_indexes = unique[unique < 20].index
undumming_indexes = unique[unique > 20].index

In [10]:
dumming_indexes

Index(['2', '3', '5', '6', '7', '9', '10', '15', '17', '18', '21', '22', '23',
       '24', '26', '27', '29', '30', '31', '32', '36', '44', '45', '47', '48',
       '50', '51'],
      dtype='object')

In [11]:
undumming_indexes

Index(['1', '4', '8', '11', '12', '13', '14', '16', '19', '20', '25', '28',
       '33', '34', '35', '37', '38', '39', '40', '41', '42', '43', '46', '49',
       '52', '53', '54', '55', '56', '57', '58'],
      dtype='object')

In [12]:
X_df = pd.get_dummies(X_df, columns=dumming_indexes)

In [13]:
X_df.shape

(2462, 264)

In [14]:
X_scal = X_df[undumming_indexes]

scaler = StandardScaler()
X_scal = scaler.fit_transform(X_scal)

X_scal = pd.DataFrame(X_scal, columns=undumming_indexes)

In [15]:
X_df.drop(undumming_indexes, axis=1, inplace=True)

In [16]:
X_df.shape, X_scal.shape

((2462, 233), (2462, 31))

In [17]:
X_df.head()

Unnamed: 0,2_9.0,2_10.0,2_11.0,2_12.0,2_13.0,3_4.0,3_6.0,3_7.0,3_8.0,3_9.0,...,51_13.0,51_14.0,51_15.0,51_16.0,51_17.0,51_18.0,51_19.0,51_20.0,51_21.0,51_22.0
72,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
384,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
764,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1566,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
X_scal.head()

Unnamed: 0,1,4,8,11,12,13,14,16,19,20,...,43,46,49,52,53,54,55,56,57,58
0,1.985651,0.33411,-1.472312,-0.962617,-1.165195,-2.637123,0.943628,0.895383,0.994858,0.067007,...,-0.456178,0.751114,-0.087629,0.546373,-0.986906,0.548042,0.962922,-0.773534,1.244602,-1.121793
1,0.374451,1.390043,-1.251238,0.747414,0.253489,0.282915,-0.512995,0.892249,1.166585,-0.763177,...,-0.790376,-0.019965,-0.802614,0.492022,0.567347,0.814145,-0.854648,0.302431,0.774502,-0.125266
2,-1.747802,-1.102574,1.186711,2.029914,0.863959,0.711773,0.872784,-0.666158,-0.11883,1.49404,...,-1.201521,-0.829635,0.078523,0.126897,-0.546757,-1.322917,0.805711,-0.583597,0.651971,0.434307
3,-0.26481,-2.040733,0.492311,-0.711684,1.257903,1.181897,1.3142,0.792554,0.441762,-0.784012,...,-0.775978,-1.043573,-1.309265,-0.533059,0.474406,0.619948,-0.60495,0.283375,0.311852,1.005327
4,-0.393111,-1.083853,-0.057144,-0.131479,0.439045,-0.865709,-1.875318,-0.662809,-0.173509,-0.309858,...,-0.772584,-0.420693,-0.965562,-0.783661,0.408204,0.286593,1.246056,0.215297,0.415724,1.905606


In [19]:
X_df.index = [i for i in range(X_df.shape[0])]

X_scal.index = [i for i in range(X_df.shape[0])]

X = pd.concat([X_df, X_scal], axis=1).copy()

In [20]:
X.shape

(2462, 264)

In [21]:
X.head()

Unnamed: 0,2_9.0,2_10.0,2_11.0,2_12.0,2_13.0,3_4.0,3_6.0,3_7.0,3_8.0,3_9.0,...,43,46,49,52,53,54,55,56,57,58
0,0,0,0,1,0,0,0,0,0,0,...,-0.456178,0.751114,-0.087629,0.546373,-0.986906,0.548042,0.962922,-0.773534,1.244602,-1.121793
1,0,0,0,1,0,0,0,0,0,0,...,-0.790376,-0.019965,-0.802614,0.492022,0.567347,0.814145,-0.854648,0.302431,0.774502,-0.125266
2,0,0,0,0,1,0,0,0,0,0,...,-1.201521,-0.829635,0.078523,0.126897,-0.546757,-1.322917,0.805711,-0.583597,0.651971,0.434307
3,0,0,0,1,0,0,0,0,0,1,...,-0.775978,-1.043573,-1.309265,-0.533059,0.474406,0.619948,-0.60495,0.283375,0.311852,1.005327
4,0,0,0,0,1,0,0,0,0,0,...,-0.772584,-0.420693,-0.965562,-0.783661,0.408204,0.286593,1.246056,0.215297,0.415724,1.905606


In [22]:
X.describe()

Unnamed: 0,2_9.0,2_10.0,2_11.0,2_12.0,2_13.0,3_4.0,3_6.0,3_7.0,3_8.0,3_9.0,...,43,46,49,52,53,54,55,56,57,58
count,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,...,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0,2462.0
mean,0.003656,0.025183,0.113729,0.355402,0.502031,0.000406,0.002843,0.006905,0.038993,0.078798,...,-6.031821e-16,2.5974350000000002e-17,2.626295e-16,3.174643e-17,1.9480760000000003e-17,2.886039e-17,4.6176620000000004e-17,-1.464665e-16,-5.919987e-16,1.601751e-16
std,0.060363,0.156712,0.317546,0.478732,0.500097,0.020154,0.053257,0.082825,0.193617,0.269478,...,1.000203,1.000203,1.000203,1.000203,1.000203,1.000203,1.000203,1.000203,1.000203,1.000203
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.590536,-3.480956,-3.131988,-3.17022,-3.242362,-3.49989,-3.594905,-3.174426,-3.815671,-3.776249
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.6873704,-0.6713191,-0.6694958,-0.6753193,-0.6673347,-0.6954766,-0.67936,-0.6849623,-0.6842978,-0.6853757
50%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.01131646,-0.01808468,0.003263782,-0.01367348,-0.001290789,0.03327346,-0.004646301,0.009101701,0.0312481,0.01103283
75%,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.6828177,0.6595718,0.6530675,0.6819998,0.6885951,0.6817303,0.679563,0.6475737,0.6619558,0.6762005
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,3.655688,3.647957,3.270108,3.4764,3.101285,3.491198,3.021571,4.232354,3.242054,3.242568


In [23]:
X = X.values
y = y_df.values.ravel()

In [25]:
kf = KFold(n_splits=5, shuffle=True, random_state=241)

In [27]:
train_scores = list()
test_scores = list()

params = [0.1, 1, 10, 50, 100, 150, 200, 250, 1000, 10000, 1000000]

for param in params:
    f_train_scores = list()
    f_test_scores = list()
    for train_index, test_index in kf.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        
        model = SVC(C=param, kernel='linear', random_state=1)
        model.fit(X_train, y_train)
        f_train_scores.append(roc_auc_score(model.predict(X_train), y_train))
        f_test_scores.append(roc_auc_score(model.predict(X_test), y_test))
        
    train_scores.append(np.array(f_train_scores).mean())
    test_scores.append(np.array(f_train_scores).mean())

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
plt.plot(params, train_scores, label='train')
plt.plot(params, test_scores, label='test')
plt.grid()
plt.legend()