In [9]:
from train import load_yaml, preprocess, split_data, train_model, model_predict, evaluate_model

import numpy as np
import pandas as pd
import mlflow
import mlflow.data
from mlflow.models.signature import infer_signature

In [2]:
# Read the training CSV, taking its first column as the row index,
# then display the resulting frame.
data = pd.read_csv(filepath_or_buffer="./data/train.csv", index_col=0)
data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:
# Load the run configuration from the project's YAML file and display it.
# The resulting dict carries the MLflow settings, split options, model
# hyperparameters, feature list and target column used by the cells below.
parameters = load_yaml("./parameters.yml")
parameters

{'mlflow': {'TRACKING_URI': 'http://192.168.1.3:5000',
  'EXPERIMENT_NAME': 'kaggle_titanic',
  'PROJECT_URI': 'http://192.168.1.3:1080/mlteam/ml-project',
  'VERSION': 'dc89a85445ebbbfc3398e1514c218cc67d4d5f6b'},
 'test_size': 0.2,
 'random_state': 42,
 'model_name': 'xgb',
 'model': {'n_estimators': 100, 'max_depth': 6, 'eta': 0.3, 'random_state': 1},
 'param_grid': {'kernel': ['linear', 'rbf'],
  'C': [0.1, 1, 10, 100],
  'gamma': [0.001, 0.01, 0.1, 1]},
 'features': ['Pclass', 'Sex', 'SibSp', 'Parch'],
 'target': 'Survived'}

In [4]:
# Run the project's preprocessing step on the raw frame, driven by `parameters`.
# NOTE(review): preprocess lives in train.py and is not visible here; presumably
# it keeps parameters["features"] plus parameters["target"] and encodes
# categorical columns — confirm against the module.
df_prep = preprocess(data, parameters)

In [11]:
# Split the preprocessed frame into train/test features and targets, then
# preview the first rows of the training features.
# NOTE(review): presumably split_data reads test_size and random_state from
# `parameters` — confirm in train.py.
X_train, X_test, y_train, y_test = split_data(df_prep, parameters)
X_train.head()

Unnamed: 0_level_0,Pclass,SibSp,Parch,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
332,1,0,0,False,True
734,2,0,0,False,True
383,3,0,0,False,True
705,3,1,0,False,True
814,3,4,2,True,False


In [12]:
# Fit the model on the training split, predict on the held-out features,
# and display the predictions.
# NOTE(review): presumably parameters["model_name"] / parameters["model"]
# select the estimator and its hyperparameters — confirm in train.py.
clf = train_model(X_train, y_train, parameters)
y_pred = model_predict(clf, X_test, parameters)
y_pred

PassengerId
710    0
440    0
841    0
721    1
40     0
      ..
434    0
774    0
26     0
85     1
11     1
Name: Survived, Length: 179, dtype: int64

In [13]:
# Score the held-out predictions; the returned dict holds accuracy, mae, mse,
# rmse and r2, and is later passed to mlflow.log_metrics.
# NOTE(review): the arguments are passed as (y_pred, y_test). Confirm this
# matches evaluate_model's parameter order — r2 is not symmetric in its
# arguments, so a swapped order would silently change that metric.
# NOTE(review): on 0/1 labels mae and mse coincide with the error rate
# (1 - accuracy), so they add no information beyond accuracy here.
metrics = evaluate_model(y_pred, y_test)
metrics

{'accuracy': 0.7821229050279329,
 'mae': 0.21787709497206703,
 'mse': 0.21787709497206703,
 'rmse': 0.46677306581685607,
 'r2': 0.10154440154440147}

In [None]:
# Point MLflow at the tracking server and experiment declared in parameters.yml.
# Without these two calls the run is recorded under MLflow's defaults (unless
# the environment configures them), leaving the TRACKING_URI and
# EXPERIMENT_NAME settings unused.
mlflow_cfg = parameters["mlflow"]
mlflow.set_tracking_uri(mlflow_cfg["TRACKING_URI"])
mlflow.set_experiment(mlflow_cfg["EXPERIMENT_NAME"])

# Link to the exact source revision configured for this run.
project_url = f"{mlflow_cfg['PROJECT_URI']}/-/tree/{mlflow_cfg['VERSION']}"

with mlflow.start_run() as run:
    # Kept at notebook scope so the run can be looked up after the cell finishes.
    run_id = run.info.run_id

    # Record the preprocessed frame as the run's input dataset.
    # NOTE(review): df_prep is the frame *before* the train/test split, yet it
    # is logged with context="training" — confirm that is intended.
    dataset = mlflow.data.from_pandas(df_prep, targets=parameters["target"])
    mlflow.log_input(dataset, context="training")

    # Log the parameters used for the model fit
    mlflow.log_params(parameters["model"])

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Infer the model signature from the held-out features and their predictions.
    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(clf, parameters["model_name"], signature=signature)

    # Attach the source link both as a plain tag and via the reserved
    # `mlflow.note.content` tag, which MLflow uses for the run description.
    mlflow.set_tag(key='Source URL', value=project_url)
    mlflow.set_tag(key='mlflow.note.content', value=project_url)