In [1]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv")
df = df.set_index("PassengerId")


df["Sex"] = (df["Sex"] == "male").astype(int)
y = df["Survived"]

valid_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
df = df[valid_columns].copy()

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


class TitanicPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cat_cols = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
        self.num_cols = ["Fare", "Age"]
        
    def fit(self, X, y=None):
        self.si_cat = SimpleImputer(strategy="most_frequent")
        self.si_num = SimpleImputer(strategy="mean")
        self.si_cat.fit(X[self.cat_cols])
        self.si_num.fit(X[self.num_cols])
        return self
        
    def transform(self, X):
        X = X.copy()
        X[self.cat_cols] = self.si_cat.transform(X[self.cat_cols])
        X[self.num_cols] = self.si_num.transform(X[self.num_cols])

        X["Embarked_C"] = (X["Embarked"] == "C").astype(int)
        X["Embarked_Q"] = (X["Embarked"] == "Q").astype(int)
        X = X.drop("Embarked", axis=1)
        return X


def create_pipeline(model, X, y):
    pipeline = Pipeline([
        ('preprocessor', TitanicPreprocessor()),
        ('classifier', model)
    ]).fit(X, y)
    return pipeline

In [3]:
from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier(n_estimators=200, criterion="log_loss", max_depth=4)
pipeline = create_pipeline(model, df, y)
pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor', TitanicPreprocessor()),
  ('classifier',
   RandomForestClassifier(criterion='log_loss', max_depth=4, n_estimators=200))],
 'transform_input': None,
 'verbose': False,
 'preprocessor': TitanicPreprocessor(),
 'classifier': RandomForestClassifier(criterion='log_loss', max_depth=4, n_estimators=200),
 'classifier__bootstrap': True,
 'classifier__ccp_alpha': 0.0,
 'classifier__class_weight': None,
 'classifier__criterion': 'log_loss',
 'classifier__max_depth': 4,
 'classifier__max_features': 'sqrt',
 'classifier__max_leaf_nodes': None,
 'classifier__max_samples': None,
 'classifier__min_impurity_decrease': 0.0,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 2,
 'classifier__min_weight_fraction_leaf': 0.0,
 'classifier__monotonic_cst': None,
 'classifier__n_estimators': 200,
 'classifier__n_jobs': None,
 'classifier__oob_score': False,
 'classifier__random_state': None,
 'classifier__verbose': 0,
 'classifier__warm_start':

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


y_pred = pipeline.predict(df)
y_prob = pipeline.predict_proba(df)[:, 1]
metrics = {
    "accuracy":  accuracy_score(y, y_pred),
    "precision": precision_score(y, y_pred),
    "recall":    recall_score(y, y_pred),
    "f1_score":  f1_score(y, y_pred),
    "auc_roc":   roc_auc_score(y, y_prob)
}

metrics

{'accuracy': 0.8383838383838383,
 'precision': 0.8721804511278195,
 'recall': 0.6783625730994152,
 'f1_score': 0.7631578947368421,
 'auc_roc': 0.888715793734488}

In [7]:
from dotenv import load_dotenv

load_dotenv("titanic/.env")

import os
os.getenv("MLFLOW_TRACKING_URI")

'http://127.0.0.1:5000'

In [8]:
import mlflow

MLFLOW_EXP = "titanic-model-RF"
try:
    mlflow.create_experiment(name=MLFLOW_EXP)
except:
    pass
mlflow.set_experiment(MLFLOW_EXP)

<Experiment: artifact_location='mlflow-artifacts:/770293554194884980', creation_time=1763731433395, experiment_id='770293554194884980', last_update_time=1763731433395, lifecycle_stage='active', name='titanic-model-RF', tags={}>

In [10]:
from mlflow.models import infer_signature


run_description = """Titanic Random Forest Pipeline"""
with mlflow.start_run(run_name="titanic-pipe-run", description=run_description):  # run_name можно тоже поменять
    signature = infer_signature(df, y_pred)
    mlflow.sklearn.log_model(
        pipeline, "model",
        signature=signature,
        input_example=df.sample(),
    )

    mlflow.log_params(pipeline.get_params())
    mlflow.log_metrics(metrics)

  inputs = _infer_schema(model_input)


In [11]:
loaded_model = mlflow.sklearn.load_model("models:/titanic-RF/Production")
loaded_model.predict(df)

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,