In [1]:
import pandas as pd

# Download the Titanic passenger table and use PassengerId as the row index.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
df = pd.read_csv(TITANIC_CSV_URL, index_col="PassengerId")
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Percentage of missing values per column, relative to the total number of rows.
n_rows = len(df)
df.isna().sum().div(n_rows).mul(100)

Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [4]:
# Keep the target aside, then remove it (and the free-text / sparse columns) from the features.
# Note: this cell rebinds `df`, so it can only be run once per kernel session.
y = df["Survived"]
non_feature_cols = ["Name", "Ticket", "Cabin", "Survived"]
df = df.drop(columns=non_feature_cols)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
df_train, df_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Columns treated as categorical vs. numeric in the preprocessing below.
cat_cols = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
num_cols = ["Fare", "Age"]

# Compare the distinct values seen in the train and test splits, column by column.
for col in cat_cols:
    print(
        f"{col}:",
        df_train[col].unique(),
        df_test[col].unique(),
        "-------------------------------",
        sep="\n",
    )

Pclass:
[1 2 3]
[3 2 1]
-------------------------------
Sex:
['male' 'female']
['male' 'female']
-------------------------------
SibSp:
[0 1 4 3 2 8 5]
[1 0 2 3 4]
-------------------------------
Parch:
[0 2 1 6 4 3 5]
[1 0 2 3 4 5]
-------------------------------
Embarked:
['S' 'C' 'Q' nan]
['C' 'S' 'Q']
-------------------------------


In [10]:
from sklearn.impute import SimpleImputer

# Learn fill values on the training split only: column means for numeric features,
# most frequent value for categorical ones.
si_num = SimpleImputer(strategy="mean")
si_num.fit(df_train[num_cols])

si_cat = SimpleImputer(strategy="most_frequent")
si_cat.fit(df_train[cat_cols])

In [13]:
# Inspect what the mean-strategy imputer learned from df_train[num_cols].
si_num.statistics_

array([32.58627612, 29.49884615])

In [14]:
# Inspect what the most-frequent-strategy imputer learned from df_train[cat_cols].
si_cat.statistics_

array([3, 'male', 0, 0, 'S'], dtype=object)

In [15]:
# Fill missing values in both splits using the imputers fitted on the training data.
for split in (df_train, df_test):
    split[cat_cols] = si_cat.transform(split[cat_cols])
    split[num_cols] = si_num.transform(split[num_cols])

In [22]:
# Binary-encode Sex in both splits: 1 where the value is "male", 0 otherwise.
for split in (df_train, df_test):
    split["Sex"] = split["Sex"].eq("male").astype(int)

df_train["Sex"]

PassengerId
332    1
734    1
383    1
705    1
814    0
      ..
107    0
271    1
861    1
436    0
103    1
Name: Sex, Length: 712, dtype: int64

In [27]:
# Cast the listed columns to integers in both splits, then check the resulting dtypes.
# NOTE(review): casting Age to int drops any fractional part (including the imputed mean) — confirm this is intended.
int_cols = ["Pclass", "SibSp", "Parch", "Age"]
df_train[int_cols] = df_train[int_cols].astype(int)
df_test[int_cols] = df_test[int_cols].astype(int)
df_train.dtypes

Pclass        int64
Sex           int64
Age           int64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [28]:
def one_hot_embarked(df, categories=("C", "Q", "S"), baseline="S"):
    """Replace the ``Embarked`` column with one-hot indicator columns.

    The indicator columns are built from a fixed list of categories rather
    than from the values present in ``df``. Every frame therefore gets the
    same output columns, even when a port is absent from it (for example a
    small test split), and the baseline drop cannot fail on a missing column.

    Args:
        df: DataFrame containing an ``Embarked`` column.
        categories: Port codes to encode, in output column order.
        baseline: Category whose indicator column is dropped (reference
            level). Ignored if it is not among ``categories``.

    Returns:
        A new DataFrame without ``Embarked`` and with one ``Embarked_<code>``
        integer column per non-baseline category. Values outside
        ``categories`` (including missing values) yield all-zero indicators.
    """
    prefix = "Embarked"
    # A categorical dtype with explicit categories pins the set of dummy columns.
    ports = df["Embarked"].astype(pd.CategoricalDtype(categories=list(categories)))
    oh = pd.get_dummies(ports, prefix=prefix, dtype=int)
    oh = oh.drop(columns=f"{prefix}_{baseline}", errors="ignore")
    # get_dummies keeps the original index, so the frames align row by row.
    return pd.concat([df.drop(columns="Embarked"), oh], axis=1)

# Encode Embarked the same way in both splits, then show the training frame.
df_train, df_test = (one_hot_embarked(part) for part in (df_train, df_test))

df_train

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
332,1,1,45,0,0,28.5000,0,0
734,2,1,23,0,0,13.0000,0,0
383,3,1,32,0,0,7.9250,0,0
705,3,1,26,1,0,7.8542,0,0
814,3,0,6,4,2,31.2750,0,0
...,...,...,...,...,...,...,...,...
107,3,0,21,0,0,7.6500,0,0
271,1,1,29,0,0,31.0000,0,0
861,3,1,41,2,0,14.1083,0,0
436,1,0,14,1,2,120.0000,0,0


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Fit a logistic-regression baseline on the training split and predict the hold-out split.
model = LogisticRegression(max_iter=1000)
model.fit(df_train, y_train)
y_pred = model.predict(df_test)

# Score the hold-out predictions with each metric, keyed by metric name.
scorers = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1_score": f1_score,
}
metrics = {name: scorer(y_test, y_pred) for name, scorer in scorers.items()}

metrics

{'accuracy': 0.8100558659217877,
 'precision': 0.7857142857142857,
 'recall': 0.7432432432432432,
 'f1_score': 0.7638888888888888}

In [34]:
import mlflow

# Point the client at the tracking server used for this notebook.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

MLFLOW_EXP = "titanic-exp"

# set_experiment activates the experiment and creates it when it does not exist yet,
# so no separate create call is needed. Real failures (for example an unreachable
# server) now raise instead of being silently swallowed by a bare except.
mlflow.set_experiment(MLFLOW_EXP)

<Experiment: artifact_location='mlflow-artifacts:/339287653448094885', creation_time=1763728412854, experiment_id='339287653448094885', last_update_time=1763728412854, lifecycle_stage='active', name='titanic-exp', tags={}>

In [37]:
from mlflow.models import infer_signature

# Log the logistic-regression model, its hold-out metrics and its hyperparameters as one run.
run_description = "Titanic LogReg"
with mlflow.start_run(run_name="titanic-lr-run", description=run_description):
    mlflow.sklearn.log_model(
        model, "model",
        # Infer the signature from matching inputs and outputs: the training
        # features and the model's predictions on those same rows.
        signature=infer_signature(df_train, model.predict(df_train)),
        # Seeded so that re-running the notebook logs the same example row.
        input_example=df_train.sample(1, random_state=42),
    )

    mlflow.log_metrics(metrics)
    mlflow.log_params(model.get_params())

  inputs = _infer_schema(model_input)


In [41]:
# Load whatever the registry URI "models:/titanic/Production" resolves to and preview its predictions.
# NOTE(review): registering the model and promoting it to Production is not visible in the
# cells shown here — presumably done outside the notebook; confirm before relying on Run All.
production_uri = "models:/titanic/Production"
loaded_model = mlflow.sklearn.load_model(production_uri)
loaded_model, loaded_model.predict(df_test)[:10]

(LogisticRegression(max_iter=1000), array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1]))

In [39]:
from sklearn.ensemble import RandomForestClassifier


# Fit a shallow random forest on the training split and predict the hold-out split.
# The forest is randomized, so the seed is fixed (same value as the train/test split)
# to make the reported metrics reproducible across runs.
model_rf = RandomForestClassifier(max_depth=3, random_state=42).fit(df_train, y_train)
y_pred_rf = model_rf.predict(df_test)


# Same hold-out metrics as for the logistic-regression baseline.
metrics_rf = {
    "accuracy":  accuracy_score(y_test, y_pred_rf),
    "precision": precision_score(y_test, y_pred_rf),
    "recall":    recall_score(y_test, y_pred_rf),
    "f1_score":  f1_score(y_test, y_pred_rf),
}

metrics_rf

{'accuracy': 0.8044692737430168,
 'precision': 0.8421052631578947,
 'recall': 0.6486486486486487,
 'f1_score': 0.732824427480916}

In [40]:
# Log the random-forest model, its hold-out metrics and its hyperparameters as one run.
run_description = "Titanic RandomForest"
with mlflow.start_run(run_name="titanic-rf-run", description=run_description):
    mlflow.sklearn.log_model(
        model_rf, "model",
        # Infer the signature from matching inputs and outputs: the training
        # features and the model's predictions on those same rows.
        signature=infer_signature(df_train, model_rf.predict(df_train)),
        # Seeded so that re-running the notebook logs the same example row.
        input_example=df_train.sample(1, random_state=42),
    )

    mlflow.log_metrics(metrics_rf)
    mlflow.log_params(model_rf.get_params())

  inputs = _infer_schema(model_input)


In [42]:
# Load whatever the registry URI "models:/titanic/Production" resolves to and preview its predictions.
# NOTE(review): registering the model and promoting it to Production is not visible in the
# cells shown here — presumably done outside the notebook; confirm before relying on Run All.
production_uri = "models:/titanic/Production"
loaded_model = mlflow.sklearn.load_model(production_uri)
loaded_model, loaded_model.predict(df_test)[:10]

(RandomForestClassifier(max_depth=3), array([0, 0, 0, 1, 1, 1, 1, 0, 0, 1]))

In [51]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# List every model registered in the MLflow model registry.
registered_models = client.search_registered_models()
print("Зарегистрированные модели:")
# The loop variable is deliberately not called `model`: that name holds the fitted
# LogisticRegression, and rebinding it here would break re-running earlier cells.
for registered_model in registered_models:
    print(f" - {registered_model.name}")

Зарегистрированные модели:
 - titanic
 - titanic-RF


---------------------------------------------------------