In [157]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV, validation_curve, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, plot_confusion_matrix, accuracy_score, \
    plot_roc_curve, plot_precision_recall_curve, classification_report, precision_score, recall_score
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [86]:
data_mod = pd.read_csv('mod_data/data_mod.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [254]:
# Функция для записи метрик в таблицу
def get_scores(report_df, model, X_test, y_test, name):

    report = pd.DataFrame(columns={'ROC-AUC'}, data=[0])
    report['ROC-AUC'] = roc_auc_score(y_test,
                                      model.predict_proba(X_test)[:, 1])
    report['F1'] = f1_score(y_test, model.predict(X_test))
    report['precision_0'] = precision_score(
        y_test, model.predict(X_test), pos_label=0)
    report['precision_1'] = precision_score(
        y_test, model.predict(X_test), pos_label=1)
    report['recall_0'] = recall_score(
        y_test, model.predict(X_test), pos_label=0)
    report['recall_1'] = recall_score(
        y_test, model.predict(X_test), pos_label=1)
    report['accuracy'] = accuracy_score(
        y_test, model.predict(X_test))
    report.index = [name]
    report_df = report_df.append(report)
    return report_df
# датафрейм для записи метрик
df_report = pd.DataFrame(columns=['ROC-AUC','F1','precision_0','precision_1','recall_0','recall_1','accuracy'])

# Feature selection

In [246]:
data = data_mod.drop(['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Alone', 'PassengerId'], axis=1)

# Features encoding

In [247]:
features_encoding = ['Pclass','Sex','AgeC','Embarked','Deck','FareC']
data = pd.get_dummies(data=data, columns=features_encoding)

In [248]:
features_encoding  = ['Family']
for i in features_encoding:
    data[i] = LabelEncoder().fit_transform(data[i])

In [249]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 37 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Survived       100000 non-null  float64
 1   Family         200000 non-null  int64  
 2   Pclass_1       200000 non-null  uint8  
 3   Pclass_2       200000 non-null  uint8  
 4   Pclass_3       200000 non-null  uint8  
 5   Sex_female     200000 non-null  uint8  
 6   Sex_male       200000 non-null  uint8  
 7   AgeC_AgeC-0    200000 non-null  uint8  
 8   AgeC_AgeC-1    200000 non-null  uint8  
 9   AgeC_AgeC-2    200000 non-null  uint8  
 10  AgeC_AgeC-3    200000 non-null  uint8  
 11  AgeC_AgeC-4    200000 non-null  uint8  
 12  AgeC_AgeC-5    200000 non-null  uint8  
 13  AgeC_AgeC-6    200000 non-null  uint8  
 14  AgeC_AgeC-7    200000 non-null  uint8  
 15  AgeC_AgeC-8    200000 non-null  uint8  
 16  AgeC_AgeC-9    200000 non-null  uint8  
 17  Embarked_C     200000 non-nul

### Split data

In [250]:
# разделяем общий дадасет data на train и test
# в тестовый датасет попадают только те объекты для которых значение 'Survived' не определено
predict_data = data[data['Survived'].isnull()]
predict_data = predict_data.drop(['Survived'], axis = 1)

# в тренировачный набор попадают объекты у которых нет пропущеных значений ни у одной переменной
train_data = data.dropna()
target = train_data['Survived']
train_data = train_data.drop(['Survived'], axis = 1)
# контроль разммера
train_data.shape

(100000, 36)

# Random forest model

In [251]:
from sklearn.ensemble import RandomForestClassifier

In [252]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target, 
                                                    test_size=0.2, 
                                                    random_state=23)

In [253]:
rf_model = RandomForestClassifier(random_state=23)
rf_model_params = [{
    "n_estimators": [ 420, 430, 440],
    "min_samples_split": [1,2,3],
    "min_samples_leaf": [8,10,12]
}]
rf_model_grid = GridSearchCV(rf_model, rf_model_params, cv=3, refit=True, verbose=1, n_jobs=-1)
rf_model_grid.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  4.2min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=23), n_jobs=-1,
             param_grid=[{'min_samples_leaf': [8, 10, 12],
                          'min_samples_split': [1, 2, 3],
                          'n_estimators': [420, 430, 440]}],
             verbose=1)

In [240]:
# лучший score и лучшие параметры на кросс-валидации
rf_model_best = rf_model_grid.best_estimator_
print("Accuracy (random forest auto): {} params {}"
      .format(rf_model_grid.best_score_, rf_model_grid.best_params_))

Accuracy (random forest auto): 0.7763125078989465 params {'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 420}


In [255]:
# записываем результаты в датафрейм
df_report = get_scores(df_report, rf_model_grid, X_test,
                       y_test, 'Random_Forest')

In [257]:
df_report

Unnamed: 0,ROC-AUC,F1,precision_0,precision_1,recall_0,recall_1,accuracy
Random_Forest,0.85143,0.740577,0.798866,0.758861,0.827142,0.723154,0.7825


In [221]:
#pred = rf_model_grid.predict_proba(predict_data)

In [108]:
#submission['Survived'] = ((pred) > 0.5).astype(int)
#submission.to_csv('RF_predict.csv', index = False)