In [4]:
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# для разделения df на train и test выборку
from sklearn.model_selection import train_test_split
# для подготовки данных
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# перекрестная проверка модели
from sklearn.model_selection import cross_val_score

# # модели
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm #импортируем нашу модель

# from sklearn.model_selection import validation_curve # для исследования модели
from sklearn.model_selection import StratifiedKFold # для стратифицированной выборки
# from sklearn.model_selection import RandomizedSearchCV #импортирует случайный поиск гиперпараметров
from sklearn.model_selection import GridSearchCV # полный перебор гиперпараметров
from sklearn import metrics # импортируем метрики

In [2]:
df = pd.read_csv('datasets/Titanic.csv')

In [3]:
X = df.drop("Survived", axis = 1).drop("Cabin", axis = 1).drop("Name", axis = 1).drop("Ticket", axis=1)
y = df["Survived"].copy()
X = X.set_index("PassengerId")
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]
# обработка данных перед обучением

# вставка отсутсвующих значений и маштабирование
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])

preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])
X_prepared = preprocess_pipeline.fit_transform(X[num_attribs + cat_attribs])
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.2, random_state=42)



In [5]:
svm_clf = svm.SVC() #зададим модель

In [6]:
svm_clf.get_params() # дефолт модель

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [7]:
svm_clf01 = svm.SVC(kernel='linear', C=1, gamma=1) #зададим начальные параметры для моделиz

In [8]:
svm_clf01.get_params() # модель с ГП из практики

{'C': 1,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 1,
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [9]:
svm_clf.fit(X_train , y_train) #выполним обучение модели при начальных параметрах

In [10]:
model_params = svm_clf.get_params() #зададим параметры по которым и будем осуществлять поиск
tuned_params = {}
for k, v in model_params.items():
    tuned_params[k] = [v]
tuned_params['gamma'] = range(1, 100)

In [12]:
tuned_params['kernel'].append('linear')

In [13]:
tuned_params

{'C': [1.0],
 'break_ties': [False],
 'cache_size': [200],
 'class_weight': [None],
 'coef0': [0.0],
 'decision_function_shape': ['ovr'],
 'degree': [3],
 'gamma': range(1, 100),
 'kernel': ['rbf', 'linear'],
 'max_iter': [-1],
 'probability': [False],
 'random_state': [None],
 'shrinking': [True],
 'tol': [0.001],
 'verbose': [False]}

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = GridSearchCV(svm_clf, tuned_params, cv=cv, n_jobs=-1)
clf.fit(X_train, y_train)
best_params = clf.best_params_

In [16]:
best_params

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 1,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [17]:
best_model = svm.SVC(**best_params)    # задаем найденные наилучшие параметры
best_scores = cross_val_score(best_model, X_test, y_test, cv = cv, scoring="roc_auc")
best_scores.mean()

0.8243083900226758

In [18]:
best_model.fit(X_train, y_train)                    # обучаем модель
predicted = best_model.predict(X_test)              # делаем предсказание
print('Used params:', best_params) #выведем наилучшие параметры
print('Evaluation:\n', metrics.classification_report(y_test, predicted))
#лучшие значения метрик (те их значения, которые получаются при наилучших параметрах модели)

Used params: {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Evaluation:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [24]:
params = {'C': [6,7,8,9,10,11,12],
          'kernel': ['linear','rbf']}

model_svm = GridSearchCV(svm_clf, param_grid=params, n_jobs=-1)
model_svm.fit(X_train,y_train)

print("Best Params:\n",model_svm.best_params_)

prediction=model_svm.predict(X_test)

print("Report:\n",metrics.classification_report(prediction,y_test))

Best Params:
 {'C': 6, 'kernel': 'rbf'}
Report:
               precision    recall  f1-score   support

           0       0.91      0.81      0.86       119
           1       0.69      0.85      0.76        60

    accuracy                           0.82       179
   macro avg       0.80      0.83      0.81       179
weighted avg       0.84      0.82      0.82       179



In [28]:
from sklearn.metrics import confusion_matrix
confusion_matrix(prediction, y_test)


array([[96, 23],
       [ 9, 51]])