In [None]:
import os
import sys
# import argparse
# import pickle
from pathlib import Path
import mlflow
import pandas as pd
from airflow.decorators import dag
from datetime import datetime

# Add the parent directory to the Python path so we can import the modules
# sys.path.append(str(Path(__file__).parent.parent.parent))
# current_dir = Path(os.getcwd())
# parent_dir = current_dir.parent.parent.parent
sys.path.append("/src/main/python")

from titanic_preprocessing import TitanicPreprocessing
from titanic_training import TitanicTraining
from titanic_evaluation import TitanicEvaluation

In [45]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [46]:
train_path = "src/data/train.csv"
test_path = "src/data/test.csv"

In [50]:
@dag(
    dag_id="titanic",
    default_args={
        "owner": "airflow",
        "start_date": datetime(2025, 3, 20),
        "retries": 1,
    },
    schedule_interval='@once',
    catchup=False,
    tags=["titanic"]
)
def titanic_dag():

    # @task
    # def preprocess_task():
    #     preprocess = TitanicPreprocessing(train_data_path=train_path, test_data_path=test_path)
    #     train_df, test_df = preprocess.preprocess_data()
    #     return train_df, test_df
    preprocess = TitanicPreprocessing(train_data_path=train_path, test_data_path=test_path)
    datasets = preprocess.preprocess_data()
    model_dict = TitanicTraining().train_model(datasets[0])
    evaluate_dict = TitanicEvaluation('src/data/gender_submission.csv').evaluate_model(model_dict, datasets[1])
    
    @task
    def log_mflow(model_dict, evaluate_dict):
        mlflow.set_experiment("test")
        for model_name, eva in evaluate_dict.items():
            with mlflow.start_run():
                mdl = model_dict[model_name]
                mlflow.log_model("model_name", mdl)
                mlflow.log_metric("accuracy", eva)
                print(f"{model_name}: {eva}")

    logging = log_mflow(model_dict, evaluate_dict)
    (
        preprocess
        >> datasets
        >> model_dict
        >> evaluate_dict
        >> logging
    )

In [51]:
titanic_dag()

TypeError: unsupported operand type(s) for >>: 'TitanicPreprocessing' and 'tuple'

In [22]:
preprocess = TitanicPreprocessing(train_data_path=train_path, test_data_path=test_path)
train_df, test_df = preprocess.preprocess_data()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["Age"].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing '

In [34]:
model_dict = TitanicTraining().train_model(train_df)
evaluate_dict = TitanicEvaluation('src/data/gender_submission.csv').evaluate_model(model_dict, test_df, )

In [None]:
evaluate_dict

{'knn': 0.8827751196172249,
 'decision_tree': 0.8133971291866029,
 'random_forest': 0.8277511961722488,
 'gaussian': 0.8421052631578947,
 'svc': 0.9354066985645934,
 'extra_tree': 0.7990430622009569,
 'extra_trees': 0.8301435406698564,
 'bagging': 0.8301435406698564,
 'ada_boost': 0.9354066985645934,
 'gradient_boosting': 0.8827751196172249}

In [37]:
mlflow.set_experiment("test")
for model_name, eva in evaluate_dict.items():
    with mlflow.start_run():
        # mlflow.log_model("model_name", model_name)
        mlflow.log_metric("accuracy", eva)
        print(f"{model_name}: {eva}")

2025/03/23 09:04:28 INFO mlflow.tracking.fluent: Experiment with name 'test' does not exist. Creating a new experiment.


knn: 0.8827751196172249
🏃 View run glamorous-sloth-667 at: http://127.0.0.1:5000/#/experiments/225442148965954792/runs/b01610e76cfe4c66855c8d6826723c92
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/225442148965954792
decision_tree: 0.8133971291866029
🏃 View run trusting-worm-943 at: http://127.0.0.1:5000/#/experiments/225442148965954792/runs/1a48670fdd7c4cabab4a99a93ebae271
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/225442148965954792
random_forest: 0.8277511961722488
🏃 View run polite-gull-424 at: http://127.0.0.1:5000/#/experiments/225442148965954792/runs/16a1095f9c84455489fb4a62c71c2b53
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/225442148965954792
gaussian: 0.8421052631578947
🏃 View run capricious-sow-903 at: http://127.0.0.1:5000/#/experiments/225442148965954792/runs/553f36276119436588bff07c8f2f71ff
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/225442148965954792
svc: 0.9354066985645934
🏃 View run charming-midge-518 at: http://1