In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# для разделения df на train и test выборку
from sklearn.model_selection import train_test_split
# для подготовки данных
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# перекрестная проверка модели
from sklearn.model_selection import cross_val_score
# # модели
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# from sklearn.model_selection import validation_curve # для исследования модели
from sklearn.model_selection import StratifiedKFold # для стратифицированной выборки
# from sklearn.model_selection import RandomizedSearchCV #импортирует случайный поиск гиперпараметров
from sklearn.model_selection import GridSearchCV # полный перебор гиперпараметров
# хуй
from sklearn import metrics # импортируем метрики

In [2]:
df = pd.read_csv('datasets/Titanic.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
X = df.drop("Survived", axis = 1).drop("Cabin", axis = 1).drop("Name", axis = 1).drop("Ticket", axis=1)
y = df["Survived"].copy()
X = X.set_index("PassengerId")

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 55.7+ KB


In [7]:
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]
# обработка данных перед обучением

# вставка отсутсвующих значений и маштабирование
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])

preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])
X_prepared = preprocess_pipeline.fit_transform(X[num_attribs + cat_attribs])



In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.2, random_state=42)

In [9]:
#Сразу зададим начальные параметры модели,
#после чего можем искать их оптимальные значения методами Grid Search или Random Search
KNN_clf = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    algorithm='auto',
    leaf_size=30,
    metric='euclidean',
    metric_params=None,
    n_jobs=-1
)
KNN_clf.fit(X_train, y_train) # обучаем модель при начальных значениях парметров

In [10]:
KNN_clf.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': -1,
 'n_neighbors': 10,
 'p': 2,
 'weights': 'distance'}

In [19]:
model_params = KNN_clf.get_params() #зададим параметры по которым и будем осуществлять поиск
tuned_params = {}
for k, v in model_params.items():
    tuned_params[k] = [v]
tuned_params['n_neighbors'] = range(1, 50)
clf = GridSearchCV(KNeighborsClassifier(), tuned_params, cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
best_params = clf.best_params_

In [20]:
KNN_clf.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': -1,
 'n_neighbors': 10,
 'p': 2,
 'weights': 'distance'}

In [21]:
clf.best_params_

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': -1,
 'n_neighbors': 22,
 'p': 2,
 'weights': 'distance'}

In [22]:
tuned_params

{'algorithm': ['auto'],
 'leaf_size': [30],
 'metric': ['euclidean'],
 'metric_params': [None],
 'n_jobs': [-1],
 'n_neighbors': range(1, 50),
 'p': [2],
 'weights': ['distance']}

In [26]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_model = KNeighborsClassifier(**best_params)    # задаем найденные наилучшие параметры
best_scores = cross_val_score(best_model, X_test, y_test, cv = cv, scoring="roc_auc")
best_scores.mean()

0.8434013605442177

In [27]:
best_model.fit(X_train, y_train)                    # обучаем модель
predicted = best_model.predict(X_test)              # делаем предсказание
print('Used params:', best_params) #выведем наилучшие параметры
print('Evaluation:\n', metrics.classification_report(y_test, predicted))
#лучшие значения метрик (те их значения, которые получаются при наилучших параметрах модели)

Used params: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': -1, 'n_neighbors': 22, 'p': 2, 'weights': 'distance'}
Evaluation:
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       105
           1       0.75      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

