<a href="https://colab.research.google.com/github/nikitazhuikov/ML-projects/blob/main/TitanikSklearnML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [216]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [217]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

In [218]:
titanic_df = pd.read_csv('Titanic-Dataset.csv', usecols=['Pclass', 'Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
titanic_df['Sex'] = pd.factorize(titanic_df['Sex'])[0]
titanic_df['Embarked'] = pd.factorize(titanic_df['Embarked'])[0]
titanic_df = titanic_df.dropna()
numerical_cols = titanic_df[['Age', 'Fare']].columns

In [219]:
# нормализация
titanic_df[numerical_cols] = StandardScaler().fit_transform(titanic_df[numerical_cols])

In [220]:
titanic_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,-0.530377,1,0,-0.518978,0
1,1,1,1,0.571831,1,0,0.691897,1
2,1,3,1,-0.254825,0,0,-0.506214,0
3,1,1,1,0.365167,1,0,0.348049,0
4,0,3,0,0.365167,0,0,-0.503850,0
...,...,...,...,...,...,...,...,...
885,0,3,1,0.640719,0,5,-0.105320,2
886,0,2,0,-0.185937,0,0,-0.410245,0
887,1,1,1,-0.737041,0,0,-0.088774,0
889,1,1,0,-0.254825,0,0,-0.088774,1


In [221]:
x = titanic_df.drop('Survived', axis = 1)

In [222]:
mlflow.set_tracking_uri("http://127.0.0.1:5001")
mlflow.set_experiment("log_reg")


<Experiment: artifact_location='mlflow-artifacts:/189883142389238581', creation_time=1743612627305, experiment_id='189883142389238581', last_update_time=1743612627305, lifecycle_stage='active', name='log_reg', tags={}>

In [223]:
y = titanic_df['Survived']

In [224]:
x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size=0.3)
# x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5)

In [225]:
with mlflow.start_run(run_name="Log_reg_100"):
    mdl = LogisticRegression(max_iter=100)
    mdl.fit(x_train, y_train)
    y_pred = mdl.predict(x_test)
    TP = ((y_test == 1) & (y_pred == 1)).sum()
    TN = ((y_test == 0) & (y_pred == 0)).sum()
    FP = ((y_test == 0) & (y_pred == 1)).sum()
    FN = ((y_test == 1) & (y_pred == 0)).sum()
    metrics = {
        'accuracy': (TP + TN)/(TP + TN + FP + FN),
        'precision': TP  / (TP + FP),
        'recall': TP / (TP + FP),
        'sensitivity': TP / (TP + FN)
    }
    mlflow.log_metrics(metrics)

🏃 View run Log_reg_100 at: http://127.0.0.1:5001/#/experiments/189883142389238581/runs/c9e8333ee98a4a0d8b75be211e89bd28
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/189883142389238581


In [226]:
accuracy = (TP + TN)/(TP + TN + FP + FN)
precision = TP  / (TP + FP)
recall = TP / (TP + FP)
sensitivity = TP / (TP + FN)

In [227]:
print('accuracy:', accuracy)
print('precision:', precision)
print('recall:', recall)
print('sensitivity:', sensitivity)

accuracy: 0.827906976744186
precision: 0.7209302325581395
recall: 0.7209302325581395
sensitivity: 0.8266666666666667


In [241]:
# считаем ROC-AUC
y_pred_proba = mdl.predict_proba(x_test)[:, 1]

In [245]:
comparison_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Probability': y_pred_proba
})
comparison_df = comparison_df.sort_values('Probability', ascending=False)
comparison_df

Unnamed: 0,Actual,Predicted,Probability
689,1,1,0.986553
716,1,1,0.966841
307,1,1,0.962675
136,1,1,0.961685
887,1,1,0.960250
...,...,...,...
406,0,0,0.054444
631,0,0,0.054264
59,0,0,0.047969
94,0,0,0.039291


In [249]:
P = comparison_df['Actual'].value_counts()[1]
N = comparison_df['Actual'].value_counts()[0]