<a href="https://colab.research.google.com/github/nikitazhuikov/ML-projects/blob/main/TitanikSklearnML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [3]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

ModuleNotFoundError: No module named 'mlflow'

In [10]:
from tpot import TPOTClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

ModuleNotFoundError: No module named 'tpot'

In [4]:
titanic_df = pd.read_csv('Titanic-Dataset.csv', usecols=['Pclass', 'Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
titanic_df['Sex'] = pd.factorize(titanic_df['Sex'])[0]
titanic_df['Embarked'] = pd.factorize(titanic_df['Embarked'])[0]
titanic_df = titanic_df.dropna()
numerical_cols = titanic_df[['Age', 'Fare']].columns

In [5]:
# нормализация
titanic_df[numerical_cols] = StandardScaler().fit_transform(titanic_df[numerical_cols])

In [6]:
titanic_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,0.306657,0,0,-0.542281,0
1,1,3,1,1.194236,1,0,-0.555844,1
2,0,2,0,2.259331,0,0,-0.511885,0
3,0,3,0,-0.225890,0,0,-0.528651,1
4,1,3,1,-0.580922,1,1,-0.469357,1
...,...,...,...,...,...,...,...,...
409,1,3,1,-1.930042,1,1,-0.445026,1
411,1,1,1,0.484173,1,0,0.801785,0
412,1,3,1,-0.154884,0,0,-0.543168,1
414,1,1,1,0.626186,0,0,1.110932,2


In [7]:
x = titanic_df.drop('Survived', axis = 1)

In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:5001")
mlflow.set_experiment("log_reg")


<Experiment: artifact_location='mlflow-artifacts:/189883142389238581', creation_time=1743612627305, experiment_id='189883142389238581', last_update_time=1743612627305, lifecycle_stage='active', name='log_reg', tags={}>

In [8]:
y = titanic_df['Survived']

In [9]:
x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size=0.3)
# x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5)

In [None]:
with mlflow.start_run(run_name="Log_reg_100"):
    mdl = LogisticRegression(max_iter=100)
    mdl.fit(x_train, y_train)
    y_pred = mdl.predict(x_test)
    TP = ((y_test == 1) & (y_pred == 1)).sum()
    TN = ((y_test == 0) & (y_pred == 0)).sum()
    FP = ((y_test == 0) & (y_pred == 1)).sum()
    FN = ((y_test == 1) & (y_pred == 0)).sum()
    metrics = {
        'accuracy': (TP + TN)/(TP + TN + FP + FN),
        'precision': TP  / (TP + FP),
        'recall': TP / (TP + FP),
        'sensitivity': TP / (TP + FN)
    }
    mlflow.log_metrics(metrics)

🏃 View run Log_reg_100 at: http://127.0.0.1:5001/#/experiments/189883142389238581/runs/c9e8333ee98a4a0d8b75be211e89bd28
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/189883142389238581


In [None]:
accuracy = (TP + TN)/(TP + TN + FP + FN)
precision = TP  / (TP + FP)
recall = TP / (TP + FP)
sensitivity = TP / (TP + FN)

In [None]:
print('accuracy:', accuracy)
print('precision:', precision)
print('recall:', recall)
print('sensitivity:', sensitivity)

accuracy: 0.827906976744186
precision: 0.7209302325581395
recall: 0.7209302325581395
sensitivity: 0.8266666666666667


In [None]:
# строим ROC-AUC
y_pred_proba = mdl.predict_proba(x_test)[:, 1]

In [None]:
comparison_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Probability': y_pred_proba
})
comparison_df = comparison_df.sort_values('Probability', ascending=False)
comparison_df

Unnamed: 0,Actual,Predicted,Probability
689,1,1,0.986553
716,1,1,0.966841
307,1,1,0.962675
136,1,1,0.961685
887,1,1,0.960250
...,...,...,...
406,0,0,0.054444
631,0,0,0.054264
59,0,0,0.047969
94,0,0,0.039291


In [None]:
P = comparison_df['Actual'].value_counts()[1]
N = comparison_df['Actual'].value_counts()[0]

In [None]:
y = 0
x = 0
X = [x]
Y = [y]
for a in comparison_df['Actual']:
    if a == 1:
        y += 1/P
    else:
        x += 1/N
    X.append(x)
    Y.append(y)
# Строим график
plt.figure(figsize=(8, 8))
plt.plot(X, Y, 'b-', label='ROC кривая')
plt.plot([0, 1], [0, 1], 'r--', label='Случайный классификатор')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-кривая для модели Титаник')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
tpot = TPOTClassifier(
    generations=5,  # количество поколений для оптимизации
    population_size=20,  # размер популяции в каждом поколении
    cv=5,  # количество фолдов для кросс-валидации
    random_state=42,
    verbosity=2,
    n_jobs=-1  # использовать все доступные ядра процессора
)

# Запускаем MLflow эксперимент для отслеживания результатов
with mlflow.start_run(run_name="TPOT_AutoML"):
    # Обучаем TPOT
    tpot.fit(x_train, y_train)

    # Получаем предсказания
    y_pred_auto = tpot.predict(x_test)
    y_pred_proba_auto = tpot.predict_proba(x_test)[:, 1]

    # Вычисляем метрики
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred_auto),
        'precision': precision_score(y_test, y_pred_auto),
        'recall': recall_score(y_test, y_pred_auto),
        'roc_auc': roc_auc_score(y_test, y_pred_proba_auto)
    }

    # Логируем метрики
    mlflow.log_metrics(metrics)

    # Сохраняем лучший пайплайн
    mlflow.sklearn.log_model(tpot.fitted_pipeline_, "tpot_best_model")

    # Выводим результаты
    print("\nЛучший пайплайн:")
    print(tpot.fitted_pipeline_)
    print("\nМетрики на тестовом наборе:")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value:.3f}")

# Построение ROC-кривой для сравнения моделей
plt.figure(figsize=(10, 8))

# ROC-кривая для базовой модели
fpr_base, tpr_base, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr_base, tpr_base, 'b-', label=f'Базовая модель (AUC = {roc_auc_score(y_test, y_pred_proba):.3f})')

# ROC-кривая для AutoML модели
fpr_auto, tpr_auto, _ = roc_curve(y_test, y_pred_proba_auto)
plt.plot(fpr_auto, tpr_auto, 'r-', label=f'AutoML модель (AUC = {metrics["roc_auc"]:.3f})')

plt.plot([0, 1], [0, 1], 'k--', label='Случайный классификатор')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Сравнение ROC-кривых базовой и AutoML моделей')
plt.legend()
plt.grid(True)
plt.show()

# Сохраняем лучший пайплайн в файл Python
tpot.export('tpot_titanic_pipeline.py')