In [117]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [56]:
# Load the raw training table from the local CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [58]:
# Target: the "Survived" column is the value the model will learn to predict.
y = df.loc[:, "Survived"]
# Features: every remaining column, used as inputs for the prediction.
X = df.drop(columns="Survived")

In [65]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible, and stratify=y splits in a stratified fashion using the
# target as the class labels.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [70]:
# Feature columns kept for the model; all other columns of X are ignored.
use_cols = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

In [71]:
# Restrict both splits to the selected feature columns. .copy() yields
# independent frames, so later edits do not touch X_train / X_test.
X_train_small, X_test_small = (
    split.loc[:, use_cols].copy() for split in (X_train, X_test)
)

In [84]:
# Imputation values are derived from the training split only.
age_median = X_train_small.loc[:, "Age"].median()  # median Age, used to fill missing ages
embarked_mode = X_train_small.loc[:, "Embarked"].mode()[0]  # most frequent Embarked value, used to fill gaps

In [83]:
# Work on fresh copies so the imputation below leaves the *_small frames untouched.
X_train_filled, X_test_filled = X_train_small.copy(), X_test_small.copy()

In [88]:
# Replace missing Age values in both splits with the median computed on train.
for split_frame in (X_train_filled, X_test_filled):
    split_frame["Age"] = split_frame["Age"].fillna(age_median)

In [89]:
# Replace missing Embarked values in both splits with the most frequent
# training value.
for split_frame in (X_train_filled, X_test_filled):
    split_frame["Embarked"] = split_frame["Embarked"].fillna(embarked_mode)

In [96]:
# One-hot encode the listed columns of the training split.
# drop_first=False keeps an indicator column for every category level.
X_train_ohe = pd.get_dummies(
    data=X_train_filled,
    columns=["Pclass", "Sex", "Embarked"],  # columns to encode
    drop_first=False,
)

In [98]:
# One-hot encode the same columns of the test split, with the same settings
# as the training split.
X_test_ohe = pd.get_dummies(
    data=X_test_filled,
    columns=["Pclass", "Sex", "Embarked"],  # columns to encode
    drop_first=False,
)

In [100]:
# Align the test columns with the training columns: columns absent from the
# test split are added and filled with 0, columns not present in train are
# dropped, and the column order follows X_train_ohe.
X_test_ohe = X_test_ohe.reindex(X_train_ohe.columns, axis="columns", fill_value=0)

In [108]:
# 1) Create the logistic regression model object.
# max_iter=1000 raises the iteration limit so the solver has room to
# converge (i.e. find good weights).
clf = LogisticRegression(max_iter=1000)

In [109]:
# 2) Train the model: it fits its internal weights/coefficients to
# X_train_ohe and y_train. Kept as the last expression so the fitted
# estimator is displayed as the cell output.
clf.fit(X=X_train_ohe, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [121]:
# 3) Predict class labels for the test split.
y_pred = clf.predict(X=X_test_ohe)

In [112]:
# 4) Predict class probabilities on the test split.
# predict_proba returns 2 columns, P(class=0) and P(class=1);
# column index 1 is kept as the score for class 1.
class_probabilities = clf.predict_proba(X_test_ohe)
y_proba = class_probabilities[:, 1]

In [122]:
# Report the label-based metrics for y_pred against y_test, one per line.
for caption, metric in (
    ("accuracy:", accuracy_score),
    ("precision:", precision_score),
    ("recall:", recall_score),
    ("f1:", f1_score),
):
    print(caption, metric(y_test, y_pred))

# The confusion matrix is printed on the lines following its caption.
print("confusion:\n", confusion_matrix(y_test, y_pred))

accuracy: 0.8044692737430168
precision: 0.7931034482758621
recall: 0.6666666666666666
f1: 0.7244094488188977
confusion:
 [[98 12]
 [23 46]]


In [116]:
# Show how precision and recall change with the decision threshold applied
# to the predicted probabilities.
for t in (0.3, 0.5, 0.7):
    # Label a row as class 1 when its probability reaches the threshold.
    y_pred_t = (y_proba >= t).astype(int)
    precision_at_t = precision_score(y_test, y_pred_t)
    recall_at_t = recall_score(y_test, y_pred_t)
    print("t=", t, "precision=", precision_at_t, "recall=", recall_at_t)

t= 0.3 precision= 0.6707317073170732 recall= 0.7971014492753623
t= 0.5 precision= 0.7931034482758621 recall= 0.6666666666666666
t= 0.7 precision= 0.8974358974358975 recall= 0.5072463768115942


In [119]:
# ROC-AUC is computed from the class-1 probabilities rather than hard labels.
auc = roc_auc_score(y_true=y_test, y_score=y_proba)

In [120]:
# Print the ROC-AUC value computed for the test split.
print("ROC-AUC:", auc)

ROC-AUC: 0.8426877470355731
