# SVM + cross-val

In [None]:
%%capture
!python -m pip install --upgrade pip
!pip install --upgrade scikit-learn-intelex

In [None]:
# Patch sklearn so that it runs faster.
# This call is placed ahead of the sklearn imports that appear further down in the notebook.

from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
import pandas as pd
import numpy as np
import sys
import os
import site
from matplotlib import pyplot as plt

# Append "<parent of the first site-packages entry>/site-packages" to the import path.
first_site_dir = site.getsitepackages()[0]
parent_dir = os.path.dirname(first_site_dir)
sys.path.append(os.path.join(parent_dir, "site-packages"))

In [None]:
# Load the dataset and take a look at its first rows

DATA_URL = "https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/data.adult.csv"

df = pd.read_csv(DATA_URL)
df.head(n=10)

In [None]:
# Per-column number of cells equal to the '?' placeholder
print(df.eq('?').sum())

In [None]:
# Keep only the rows whose occupation is not the '?' placeholder,
# then recount the placeholders that remain in each column.
df = df[df['occupation'] != '?']
print(df.eq('?').sum())

In [None]:
# Extract the target variable and the names of the categorical and numeric columns

target_codes = {'>50K': 1, '<=50K': 0}
y = df['>50K,<=50K'].map(target_codes)

cat_col = list(df.select_dtypes(include=['object']).columns)
num_col = list(df.select_dtypes(exclude=['object']).columns)

# Numeric features only
x = df.loc[:, num_col]
print(x)

In [None]:
# Look at the histogram of each numeric feature.
# One loop replaces six copy-pasted plotting calls; every figure is titled and
# labelled with its column name so the plots can be told apart.

hist_columns = ['age', 'fnlwgt', 'capital-gain', 'education-num', 'capital-loss', 'hours-per-week']

for column in hist_columns:
    plt.hist(df[column], bins=15, edgecolor='black')
    plt.title(column)
    plt.xlabel(column)
    plt.ylabel('count')
    plt.show()


In [None]:
# Scale the data with StandardScaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform returns a bare array; pass index=x.index so the scaled frame keeps
# the row labels of the filtered data and stays label-aligned with y.
# NOTE: the scaler is fit on all rows of x, so any cross-validation run on X sees
# statistics computed from its held-out folds as well.
X = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)

print(X)

In [None]:
# See how logistic regression and an SVM perform under cross-validation

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Mean ROC AUC over 3 folds for a default logistic regression
logreg_fold_auc = cross_val_score(
    LogisticRegression(),
    X,
    y,
    cv=3,
    scoring='roc_auc',
    verbose=2,
)
logreg_fold_auc.mean()

In [None]:
# Mean ROC AUC over 3 folds for an RBF-kernel SVM
svc_fold_auc = cross_val_score(
    SVC(kernel='rbf', C=3.5),
    X,
    y,
    cv=3,
    scoring='roc_auc',
    verbose=2,
)
svc_fold_auc.mean()

In [None]:
# Tune C for logistic regression and the kernel for the SVM

from sklearn.model_selection import GridSearchCV

params = {'C': [1, 2, 3]}

gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='roc_auc',
    cv=3,
)
gs.fit(X, y)

# Best mean cross-validated score, then the parameters that produced it
for result in (gs.best_score_, gs.best_params_):
    print(result)

In [None]:
# Search over the SVM kernel and the regularisation strength C
params = dict(
    kernel=['rbf', 'poly', 'sigmoid', 'linear'],
    C=np.arange(0.01, 2.01, 0.25),
)

gs = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    scoring='roc_auc',
    cv=3,
)
gs.fit(X, y)

# Best mean cross-validated score, then the parameters that produced it
print(gs.best_score_, gs.best_params_, sep='\n')

In [None]:
# Add the categorical features to the numeric ones and check model quality with cross-validation

X_old = df.drop('>50K,<=50K', axis=1)

# One-hot encode every object-typed column, dropping the first level of each
categorical_columns = X_old.select_dtypes(include=['object']).columns.tolist()
X_new = pd.get_dummies(X_old, columns=categorical_columns, dtype=float, drop_first=True)

# fit_transform returns a bare array; pass the index back so the scaled frame keeps
# the row labels of the filtered data and stays label-aligned with y.
X_new = pd.DataFrame(scaler.fit_transform(X_new), columns=X_new.columns, index=X_new.index)
X_new.head()

In [None]:
# Mean ROC AUC over 3 folds for logistic regression on the full feature matrix
logreg_full_auc = cross_val_score(
    LogisticRegression(C=1),
    X_new,
    y,
    cv=3,
    scoring='roc_auc',
    verbose=2,
)
logreg_full_auc.mean()

In [None]:
# Mean ROC AUC over 3 folds for a linear-kernel SVM on the full feature matrix
svc_full_auc = cross_val_score(
    SVC(kernel='linear', C=0.01),
    X_new,
    y,
    cv=3,
    scoring='roc_auc',
    verbose=2,
)
svc_full_auc.mean()

In [None]:
# Split the data into train and test, fit the best-performing model and compute its ROC AUC

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25, random_state=42)
model = SVC(kernel='linear', C=0.01)
model.fit(X_train, y_train)
# SVC only provides predict_proba when constructed with probability=True, so the
# original call raised AttributeError. ROC AUC needs only a ranking score, so the
# signed distance to the separating hyperplane is used instead.
pred = model.decision_function(X_test)
roc_auc_score(y_test, pred)

In [None]:
from sklearn.metrics import roc_curve, auc

# Build the ROC curve from the test-set scores and compute the area under it
fpr, tpr, threshold = roc_curve(y_test, pred)
roc_auc = auc(fpr, tpr)

# The model's curve in blue, the diagonal of a random classifier in dashed red;
# the legend reports the AUC, which was previously computed but never shown.
plt.plot(fpr, tpr, 'b', label=f'ROC curve (AUC = {roc_auc:.3f})')
plt.plot([0,1], [0,1], 'r--', label='chance')
plt.xlim([0,1])
plt.ylim([0,1])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve on the test split')
plt.legend(loc='lower right')
plt.show()