# Stock version
This notebook is based on the Kaggle solution https://www.kaggle.com/napetrov/tps04-svm-with-scikit-learn-intelex for the Tabular Playground Series - Apr 2021 competition.

In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

The next set of cells reads the data and performs the feature engineering operations.

In [2]:
# Read both competition files, using the passenger id column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('./SVM/train.csv', './SVM/test.csv')
)

# Detach the label column from the training features; `train` no longer contains it afterwards.
target = train.pop('Survived')

In [3]:
# Remove the columns that are not used as model features from both frames.
train = train.drop(columns=['Name', 'Ticket', 'Cabin'])
test = test.drop(columns=['Name', 'Ticket', 'Cabin'])

In [4]:
# Imputation values. The medians are computed on the training frame only and then
# applied to both frames, so no statistic is derived from the test set.
# 'S' is the constant the original notebook used for missing 'Embarked' entries.
fill_values = {
    'Age': train['Age'].median(),
    'Fare': train['Fare'].median(),
    'Embarked': 'S',
}

# `DataFrame.fillna` with a dict fills only the listed columns and returns a new frame,
# so the raw `train` / `test` frames stay untouched.
# The previous form, `frame['col'].fillna(value, inplace=True)`, mutates a temporary
# Series: it emits a FutureWarning on pandas 2.x and is a silent no-op under
# Copy-on-Write, which would leave the NaNs in place.
test_prepared = test.fillna(fill_values)
train_prepared = train.fillna(fill_values)


In [5]:
# Replace each categorical column by integer codes. The encoder is fitted on the
# training column and the same fitted mapping is then applied to the test column.
for col in ('Pclass', 'Sex', 'Embarked'):
    le = LabelEncoder()
    train_prepared[col] = le.fit_transform(train_prepared[col])
    test_prepared[col] = le.transform(test_prepared[col])

In [6]:
# Show the first five rows of the encoded training frame as a sanity check.
train_prepared.head(n=5)


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,39.0,2,0,27.14,2
1,2,1,39.0,0,0,13.35,2
2,2,1,0.33,1,2,71.29,2
3,2,1,19.0,0,0,13.04,2
4,2,1,25.0,0,0,7.76,2


In [7]:
# Standardise the features. The scaler is fitted on the training frame only and the
# same fitted parameters are then applied to both the training and the test frame.
scaler = StandardScaler()
scaler.fit(train_prepared)

# With default settings `transform` returns a bare array, so the DataFrames are rebuilt
# with the original column names. The original row index is passed as well; without it
# both frames would get a fresh 0..n-1 index and the scaled test rows could no longer
# be matched to their passenger ids.
train_prepared_scaled = pd.DataFrame(
    scaler.transform(train_prepared),
    columns=train_prepared.columns,
    index=train_prepared.index,
)
test_prepared_scaled = pd.DataFrame(
    scaler.transform(test_prepared),
    columns=train_prepared.columns,
    index=test_prepared.index,
)


In [8]:
# Hold out 10% of the scaled training rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_prepared_scaled,
    target,
    test_size=0.1,
    random_state=0,
)

Here we start training the SVM with an RBF kernel - this will take a while to complete.

In [9]:
%%time
svc_kernel_rbf = SVC(kernel='rbf', random_state=0, C=0.01)
svc_kernel_rbf.fit(X_train, y_train)
y_pred = svc_kernel_rbf.predict(X_valid)
accuracy_score(y_pred, y_valid)

CPU times: user 12min 19s, sys: 311 ms, total: 12min 19s
Wall time: 12min 20s


0.7614

In [10]:
%%time
final_pred = svc_kernel_rbf.predict(test_prepared_scaled)

CPU times: user 5min 34s, sys: 44 ms, total: 5min 34s
Wall time: 5min 34s
