### Import Modules

In [None]:
import pandas as pd

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [None]:
import mlflow

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import pickle

### Set Experiment and Tracking URI

In [None]:
# Point MLflow at a SQLite tracking database (mlruns.db, addressed with a
# relative path one directory above the working directory).
mlflow.set_tracking_uri("sqlite:///../mlruns.db")

In [None]:
# Select the "DU-AI" experiment; the runs started below are recorded under it.
mlflow.set_experiment("DU-AI")

### Read and Preprocess data

In [None]:
def read_data(filename):
    """Load a trip parquet file and keep trips lasting 1 to 60 minutes.

    A ``duration`` column (minutes) is derived from the difference between
    ``tpep_dropoff_datetime`` and ``tpep_pickup_datetime``.

    Args:
        filename: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A copy of the rows whose ``duration`` lies in ``[1, 60]`` minutes,
        including the new ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    # Timedelta between drop-off and pickup, expressed in minutes
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Both bounds are inclusive
    within_bounds = (trips.duration >= 1) & (trips.duration <= 60)
    return trips[within_bounds].copy()

In [None]:
# Parquet files used for fitting (2023-01) and for validation (2023-02)
train_data_path = '../data/yellow_tripdata_2023-01.parquet'
val_data_path = '../data/yellow_tripdata_2023-02.parquet'

In [None]:
# Load both files through read_data, which adds `duration` and keeps only
# trips lasting between 1 and 60 minutes.
df_train = read_data(train_data_path)
df_val = read_data(val_data_path)

In [None]:
def compute_features(df):
    """Add the combined pickup/drop-off feature ``PU_DO`` to ``df``.

    Both location ID columns are cast to strings and joined as
    ``"<PULocationID>_<DOLocationID>"``. The frame is modified in place and
    is also returned.

    Args:
        df: Trip DataFrame with ``PULocationID`` and ``DOLocationID`` columns.

    Returns:
        The same DataFrame object, with both ID columns converted to strings
        and a new ``PU_DO`` column.
    """
    str_columns = ["PULocationID", "DOLocationID"]
    df[str_columns] = df[str_columns].astype('str')
    # Build the pair from the frame that was passed in. Reading a global
    # training frame here would give every other dataset index-aligned
    # training values (or NaN) instead of its own locations.
    df['PU_DO'] = df["PULocationID"] + "_" + df["DOLocationID"]

    return df

def extract_features_target(df):
    """Split a trip frame into model inputs and the regression target.

    The frame is first passed through ``compute_features``.

    Args:
        df: Trip DataFrame with a ``duration`` column, a ``trip_distance``
            column and whatever ``compute_features`` requires.

    Returns:
        A ``(features, target)`` tuple: the ``PU_DO`` and ``trip_distance``
        columns as a DataFrame, and the ``duration`` values as an array.
    """
    enriched = compute_features(df)

    # Categorical column first, then the numerical one
    feature_columns = ['PU_DO', 'trip_distance']
    features = enriched[feature_columns]
    target = enriched.duration.values

    return features, target

In [None]:
# Feature frames (PU_DO, trip_distance) and duration targets for both splits
X_train, y_train = extract_features_target(df_train)
X_val, y_val = extract_features_target(df_val)

In [None]:
## Fit-Transform vectorizer

# DictVectorizer works on one dict per row, so convert both frames to records
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse the fitted
# mapping for validation. X_train / X_val are rebound from DataFrames to the
# vectorized matrices here.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
# Show the dimensions of the two vectorized feature matrices
X_train.shape, X_val.shape

In [None]:
# Targets: the `duration` column of each frame as an array.
# NOTE(review): extract_features_target already returned these same values as
# y_train / y_val, so this re-assignment looks redundant.
y_train = df_train.duration.values
y_val = df_val.duration.values

In [None]:
# Show the dimensions of the two target arrays
y_train.shape, y_val.shape

### Train and evaluate

In [None]:
# Turn on MLflow autologging, with dataset logging switched off
mlflow.autolog(log_datasets=False)

In [None]:
with mlflow.start_run():
    mlflow.set_tag("test", "test1")

    # Define model: ridge regression with regularisation strength `alpha`
    alpha = 0.5
    lr = Ridge(alpha=alpha)

    # Train model
    lr.fit(X_train, y_train)

    # Eval model. RMSE is taken as the square root of the MSE instead of
    # passing squared=False: that keyword was deprecated in scikit-learn 1.4
    # and later removed, whereas this form works on old and new versions.
    y_train_pred = lr.predict(X_train)
    train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5

    y_val_pred = lr.predict(X_val)
    val_rmse = mean_squared_error(y_val, y_val_pred) ** 0.5

In [None]:
# Report both scores rounded to two decimals
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")

### Tracking with MLflow

In [None]:
# Call autolog again with default arguments (the earlier call passed
# log_datasets=False).
mlflow.autolog()

In [None]:
with mlflow.start_run():
    # Tag the run with the kind of model it describes
    mlflow.set_tag("model type", "ridge")

    # Data locations and the hyper-parameter, sent as one batch
    mlflow.log_params({
        "train dp": train_data_path,
        "val dp": val_data_path,
        "alpha": alpha,
    })

    # Scores computed by the training cell
    mlflow.log_metrics({
        "Train RMSE": train_rmse,
        "Val RMSE": val_rmse,
    })

### Save model and vectorizers as artifact

In [None]:
# Serialize the fitted vectorizer and the model together as one (dv, lr) tuple
with open('../models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [None]:
with mlflow.start_run():
    # Tags describing the model and how it was saved
    mlflow.set_tags({
        "model type": "ridge",
        "save type": "both as artifact",
    })

    # Data locations and the hyper-parameter, sent as one batch
    mlflow.log_params({
        "train dp": train_data_path,
        "val dp": val_data_path,
        "alpha": alpha,
    })

    # Scores computed by the training cell
    mlflow.log_metrics({
        "Train RMSE": train_rmse,
        "Val RMSE": val_rmse,
    })

    # Upload the pickle holding the (vectorizer, model) tuple
    mlflow.log_artifact("../models/lin_reg.bin")

### Save vectorizer only as artifact

In [None]:
from mlflow.models import infer_signature

In [None]:
# Serialize only the fitted vectorizer; the model is logged separately below
with open('../models/vectorizer.bin', 'wb') as f_out:
    pickle.dump(dv, f_out)

In [None]:
# Notebook shell escape: list the contents of the models directory
!ls -l ../models

In [None]:
with mlflow.start_run():
    # Log tag
    mlflow.set_tag("model type", "ridge")
    mlflow.set_tag("save type", "vect only as artifact")

    # Log params
    mlflow.log_param("train dp", train_data_path)
    mlflow.log_param("val dp", val_data_path)
    mlflow.log_param("alpha", alpha)

    # Log metrics
    mlflow.log_metric("Train RMSE", train_rmse)
    mlflow.log_metric("Val RMSE", val_rmse)

    # Log vectorizer as a plain file artifact
    mlflow.log_artifact("../models/vectorizer.bin")

    # Log model. The signature is still inferred from the full training
    # matrix, but only a few rows are kept as the input example: the example
    # is stored with the model, so passing all of X_train would copy the
    # whole training set into the artifact store.
    signature = infer_signature(X_train, y_train_pred)
    mlflow.sklearn.log_model(
        lr,
        "models",
        signature=signature,
        input_example=X_train[:5],
    )

### Load model

#### Case: model and vectorizer logged as artifact 

In [None]:
# Run to read from and the artifact file name inside that run.
# NOTE(review): the cell that logs the combined pickle in this notebook uploads
# it as "lin_reg.bin"; "vect.bin" is not logged anywhere here. Confirm that the
# hard-coded run really contains an artifact with this name.
run_id = "a19792b92e3e4e54a389d609cd868fc6"
artifact_name = "vect.bin"

In [None]:
# Download the artifact addressed by the runs:/ URI; the returned value is
# used below as the local file path to open.
artifact_path = mlflow.artifacts.download_artifacts(artifact_uri=f"runs:/{run_id}/{artifact_name}")

In [None]:
# Display the downloaded artifact's path
artifact_path

In [None]:
# The pickle is expected to hold a (vectorizer, model) pair, matching how the
# combined file is written earlier in this notebook.
# NOTE: pickle.load executes code from the file - only load artifacts from a
# tracking store you trust.
with open(artifact_path, 'rb') as f_in:
    loaded_dv, loaded_lr = pickle.load(f_in)

In [None]:
# Display the restored vectorizer and model
loaded_dv, loaded_lr

#### Case: only vectorizer as artifact

In [None]:
# Run to read from and the vectorizer file name ("vectorizer.bin" is the name
# the vectorizer-only pickle is uploaded under earlier in this notebook).
# NOTE(review): the run id is hard-coded - replace it with the id of your own run.
run_id = "a19792b92e3e4e54a389d609cd868fc6"
artifact_name = "vectorizer.bin"

In [None]:
# Download the vectorizer pickle from the run; the returned value is used
# below as the local file path to open.
artifact_path = mlflow.artifacts.download_artifacts(artifact_uri=f"runs:/{run_id}/{artifact_name}")

In [None]:
# Display the downloaded artifact's path
artifact_path

In [None]:
# Restore the vectorizer on its own.
# NOTE: pickle.load executes code from the file - only load artifacts from a
# tracking store you trust.
with open(artifact_path, 'rb') as f_in:
    loaded_dv = pickle.load(f_in)

In [None]:
# Display the restored vectorizer
loaded_dv

In [None]:
# URI of the logged model: "models" is the artifact path passed to
# mlflow.sklearn.log_model earlier in this notebook.
model_uri = f"runs:/{run_id}/models"
model_uri

In [None]:
# Load the logged model through MLflow's generic pyfunc interface
loaded_model = mlflow.pyfunc.load_model(model_uri)

In [None]:
# Display the object returned by get_raw_model()
# (presumably the underlying scikit-learn estimator - confirm against the MLflow docs)
loaded_model.get_raw_model()

In [None]:
# Predict on the validation matrix, which was already vectorized above
loaded_model.predict(X_val)

##### What is wrong with this approach?

In [None]:
# A third file (2023-03) serves as unseen test data
test_data_path = '../data/yellow_tripdata_2023-03.parquet'

df_test = read_data(test_data_path)

In [None]:
# Raw features and target for the test frame
X_test, y_test = extract_features_target(df_test)

# The model was logged without its vectorizer, so the separately downloaded
# vectorizer has to be applied by hand before predicting.
test_dicts = X_test.to_dict(orient='records')
X_test = loaded_dv.transform(test_dicts)

In [None]:
# Predict on the vectorized test matrix and display the predictions
y_pred = loaded_model.predict(X_test)
y_pred

In [None]:
# RMSE as the square root of the MSE. The squared=False keyword was deprecated
# in scikit-learn 1.4 and later removed; this form works on old and new versions.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Test RMSE {rmse:.2f}")

### Save vectorizer alongside the model

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain vectorization and regression so both are fitted, saved and loaded as
# one object. `alpha` is the value set in the training cell above.
pipeline = Pipeline([
    ('vectorizer', DictVectorizer()),
    ('model', Ridge(alpha))
])

In [None]:
# Display the pipeline
pipeline

In [None]:
### Prepare data
# Re-read the training file: X_train was rebound to the vectorized matrix
# earlier, while the pipeline needs the raw feature columns.
df_train = read_data(train_data_path)
X_train, y_train = extract_features_target(df_train)

# Prepare data for vectorizer: the pipeline's DictVectorizer step takes one
# dict per row.
X_train = X_train.to_dict(orient="records")

In [None]:
# Fit the vectorizer and the model in one call on the training records
pipeline.fit(X_train, y_train)

In [None]:
# Predict straight from the record dicts; the pipeline vectorizes internally
y_pred = pipeline.predict(X_train)

In [None]:
# Training RMSE of the pipeline, as the square root of the MSE. The
# squared=False keyword was deprecated in scikit-learn 1.4 and later removed;
# this form works on old and new versions.
print(mean_squared_error(y_train, y_pred) ** 0.5)

In [None]:
with mlflow.start_run():
    # Log tag. This run stores the vectorizer and the model together as one
    # pipeline, so the "save type" tag says so instead of repeating the
    # "vect only as artifact" value used by the previous run.
    mlflow.set_tag("model type", "ridge")
    mlflow.set_tag("save type", "vect and model as pipeline")

    # Log params
    mlflow.log_param("train dp", train_data_path)
    mlflow.log_param("val dp", val_data_path)
    mlflow.log_param("alpha", alpha)

    # Log metrics
    # NOTE(review): train_rmse / val_rmse were computed by the earlier
    # standalone Ridge cell, not by this pipeline - confirm that is intended.
    mlflow.log_metric("Train RMSE", train_rmse)
    mlflow.log_metric("Val RMSE", val_rmse)

    # Log model: the whole pipeline, so no separate vectorizer artifact is needed
    signature = infer_signature(X_train, y_pred)
    mlflow.sklearn.log_model(
        pipeline,
        "models",
        signature=signature,
    )