In [4]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

Python 3.10.16


In [5]:
import pickle
import tempfile
from pathlib import Path

import mlflow
import pandas as pd

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

In [7]:
def read_dataframe(filename):
    """Load trip records from a parquet file and prepare them for modelling.

    The function derives the trip ``duration`` in minutes from the
    ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime`` columns, keeps only
    trips lasting between 1 and 60 minutes (inclusive), casts the pickup and
    drop-off location IDs to strings and adds a combined ``PU_DO`` column.
    Record counts and shapes are printed before and after preparation.

    Parameters
    ----------
    filename : str
        Path or URL handed to ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame (original index labels retained) with the extra
        ``duration`` and ``PU_DO`` columns appended.
    """
    df = pd.read_parquet(filename)

    print(f"Number of records loaded: {len(df)}")
    print(f"Dataset shape: {df.shape}")

    # Vectorised timedelta -> minutes conversion; avoids a Python-level call
    # per row and always yields a float column, even when there are no rows.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignments
    # below write to an independent frame instead of a slice of the raw one
    # (which triggers SettingWithCopyWarning on pandas without copy-on-write).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    # Print the result for Question 4
    print(f"After data preparation - Number of records: {len(df)}")
    print(f"After data preparation - Dataset shape: {df.shape}")

    return df

In [8]:
def train_model(df_train):
    """Fit a linear regression on one-hot encoded pickup/drop-off locations.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; the ``PULocationID``, ``DOLocationID`` and
        ``duration`` columns are read from it.

    Returns
    -------
    tuple
        ``(dv, lr)`` - the fitted ``DictVectorizer`` and the fitted
        ``LinearRegression``. The model intercept is printed as a side effect.
    """
    feature_columns = ['PULocationID', 'DOLocationID']

    # One dict per row so DictVectorizer can build the feature matrix.
    records = df_train[feature_columns].to_dict(orient='records')
    vectorizer = DictVectorizer()
    features = vectorizer.fit_transform(records)

    durations = df_train['duration'].values

    regressor = LinearRegression()
    regressor.fit(features, durations)

    print(f"Model intercept: {regressor.intercept_:.2f}")

    return vectorizer, regressor

In [9]:
def log_model_to_mlflow(dv, lr):
    """Record the trained model and its vectorizer in a local MLflow run.

    The tracking URI is set to the ``./mlruns`` file store. Within a single
    run the regressor is logged and registered as ``taxi-duration-model``,
    the vectorizer is pickled and attached as an artifact, and a few
    descriptive params plus the intercept metric are logged.

    Parameters
    ----------
    dv : DictVectorizer
        Fitted vectorizer that produced the training matrix for ``lr``.
    lr : LinearRegression
        Fitted regression model.

    Returns
    -------
    str
        The ID of the MLflow run that was created.
    """
    mlflow.set_tracking_uri("file:./mlruns")

    with mlflow.start_run() as run:
        mlflow.sklearn.log_model(
            sk_model=lr,
            artifact_path="model",
            registered_model_name="taxi-duration-model"
        )

        # `lr` was fitted on matrices produced by `dv`, so keep the
        # vectorizer with the run; previously it was accepted but never
        # stored. The temp directory is removed once the upload is done.
        with tempfile.TemporaryDirectory() as tmp_dir:
            dv_path = Path(tmp_dir) / "dict_vectorizer.pkl"
            with open(dv_path, "wb") as f_out:
                pickle.dump(dv, f_out)
            mlflow.log_artifact(str(dv_path), artifact_path="preprocessor")

        mlflow.log_param("model_type", "LinearRegression")
        mlflow.log_param("features", "PULocationID, DOLocationID")
        mlflow.log_metric("intercept", lr.intercept_)

        # Take the ID from the run handle returned by start_run().
        run_id = run.info.run_id

    print(f"Model logged with run_id: {run_id}")

    return run_id

In [10]:
# Run the full pipeline on the 2023-03 yellow trip data: load, train, log.
TRIP_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet'

df = read_dataframe(TRIP_DATA_URL)
dv, lr = train_model(df)
run_id = log_model_to_mlflow(dv, lr)

Number of records loaded: 3403766
Dataset shape: (3403766, 19)
After data preparation - Number of records: 3316216
After data preparation - Dataset shape: (3316216, 21)
Model intercept: 24.77




Model logged with run_id: 577e793d467b4edd948bb47ad0d25400


Successfully registered model 'taxi-duration-model'.
Created version '1' of model 'taxi-duration-model'.
