In [2]:
!python -V

Python 3.12.2


In [1]:
import pandas as pd

In [2]:
import pickle

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

from sklearn.metrics import root_mean_squared_error

In [5]:
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Year interpolated into the input parquet file names and the saved model's file name.
DATA_EXPERIMENT_YEAR = '2023'
# Dataset prefix interpolated into the input parquet file names.
DATASET_COLOR = 'yellow'

In [8]:
%pwd

'/Users/maksim.fediushkin/PyPetProjects/mlops_zoomcamp/03_orchestration'

In [9]:
# Load the raw, unfiltered trip records (the `-03` file for the configured colour and year).
df = pd.read_parquet(f"../data/input/{DATASET_COLOR}_tripdata_{DATA_EXPERIMENT_YEAR}-03.parquet")

In [10]:
# (rows, columns) of the raw frame, before any filtering.
df.shape

(3403766, 19)

In [11]:
def read_dataframe(filename, pickup_col='tpep_pickup_datetime', dropoff_col='tpep_dropoff_datetime'):
    """Load a trip-data parquet file and prepare it for duration modelling.

    Steps:
      1. read the parquet file,
      2. add a ``duration`` column (dropoff minus pickup, in minutes),
      3. keep only trips lasting between 1 and 60 minutes inclusive,
      4. cast the pickup/dropoff location IDs to strings so they can be
         used as categorical features.

    Args:
        filename: Path (or any source accepted by ``pd.read_parquet``) of the
            parquet file to load.
        pickup_col: Name of the pickup timestamp column. Defaults to the
            ``tpep_`` name used by the original implementation.
        dropoff_col: Name of the dropoff timestamp column. Defaults to the
            ``tpep_`` name used by the original implementation.

    Returns:
        A new ``pd.DataFrame`` with the filtered rows, the extra ``duration``
        column and string-typed ``PULocationID`` / ``DOLocationID`` columns.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion; avoids a Python-level
    # function call per row.
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    # `.copy()` gives an independent frame, so the column assignment below
    # does not write into a slice of the unfiltered frame
    # (the pattern that triggers SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


In [12]:
# Re-load the same file through `read_dataframe`; this rebinds `df` to the cleaned frame.
df = read_dataframe(f'../data/input/{DATASET_COLOR}_tripdata_{DATA_EXPERIMENT_YEAR}-03.parquet')

In [13]:
# (rows, columns) after `read_dataframe`: rows filtered by duration, one extra `duration` column.
df.shape

(3316216, 20)

In [14]:

# Name of the MLflow experiment used both for logging runs and for looking them up later.
EXPERIMENT_NAME = "LinearRegression"

# Point MLflow at the tracking server — assumes one is already listening on this local address.
mlflow.set_tracking_uri("http://127.0.0.1:8080")
# Make this the active experiment for subsequent runs.
mlflow.set_experiment(EXPERIMENT_NAME)

2025/06/07 19:42:49 INFO mlflow.tracking.fluent: Experiment with name 'LinearRegression' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1749318169426, experiment_id='4', last_update_time=1749318169426, lifecycle_stage='active', name='LinearRegression', tags={}>

In [15]:
def perprocess_and_train(df: pd.DataFrame, df_split: float = 0.2, random_state: int = 0, target: str = 'duration'):
    """Vectorise the trip features, fit a linear regression and log the run to MLflow.

    The frame is split into train/validation parts, the categorical location
    IDs and the numerical trip distance are one-hot/numeric encoded with a
    ``DictVectorizer`` fitted on the training part only, and a
    ``LinearRegression`` is trained inside an MLflow run with sklearn
    autologging enabled.

    Args:
        df: Frame containing ``PULocationID``, ``DOLocationID``,
            ``trip_distance`` and the ``target`` column.
        df_split: Fraction of rows held out for validation
            (passed as ``test_size`` to ``train_test_split``).
        random_state: Seed for the train/validation split.
        target: Name of the column to predict.

    Returns:
        ``(dv, lr)`` — the fitted ``DictVectorizer`` and the fitted
        ``LinearRegression``.

    Side effects:
        Enables ``mlflow.sklearn`` autologging, creates one MLflow run, logs
        the validation RMSE to it as ``test_rmse`` and prints the RMSE and the
        model intercept.
    """
    mlflow.sklearn.autolog()

    df_train, df_val = train_test_split(df, test_size=df_split, random_state=random_state)

    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']
    features = categorical + numerical

    dv = DictVectorizer()

    # Fit the vectoriser on the training rows only, then reuse it for validation.
    train_dicts = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)

    val_dicts = df_val[features].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    y_train = df_train[target].values
    y_val = df_val[target].values

    with mlflow.start_run():
        lr = LinearRegression()
        lr.fit(X_train, y_train)

        y_pred = lr.predict(X_val)

        loss = root_mean_squared_error(y_val, y_pred)
        # Log the hold-out RMSE explicitly under the key that the best-run
        # lookup in this notebook sorts on (`metrics.test_rmse`); otherwise
        # the value is only printed and runs cannot be ranked by it.
        mlflow.log_metric("test_rmse", loss)

    print(f"rmse: {loss}")
    print(f"Model intercept: {lr.intercept_}")
    return dv, lr

In [32]:
# Train first: `perprocess_and_train` returns the fitted vectoriser and model.
# Its internal `dv` / `lr` names are local to the function and do not exist at
# notebook level, so the dump below must use the returned objects.
vectoriser, model = perprocess_and_train(df=df)

# Persist the (vectoriser, model) pair together so the same feature encoding
# can be reapplied when the model is loaded.
with open(f'../models/lin_reg_{DATA_EXPERIMENT_YEAR}.pkl', 'wb') as f_out:
    pickle.dump((vectoriser, model), f_out)


🏃 View run painted-bee-586 at: http://127.0.0.1:8080/#/experiments/4/runs/75b35186fa3b4db6ab84aef3d2706861
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/4
rmse: 8.186053757935595
Model intercept: 23.349372161763142


In [17]:
client = MlflowClient()

# Look the experiment up by name; `get_experiment_by_name` returns None when it
# does not exist, which would otherwise surface as an opaque AttributeError.
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(f"MLflow experiment {EXPERIMENT_NAME!r} does not exist")

# Only the top-ranked run is needed, so ask the server for a single result.
# NOTE(review): the ranking is only meaningful for runs that actually carry a
# `test_rmse` metric — confirm that training logs it under this key.
candidate_runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.test_rmse ASC"],
)
if not candidate_runs:
    raise ValueError(f"No active runs found in MLflow experiment {EXPERIMENT_NAME!r}")
best_run = candidate_runs[0]

# Register the model artifact of the selected run; the returned ModelVersion
# is the cell's displayed output.
mlflow.register_model(f"runs:/{best_run.info.run_id}/model", "sk-learn-Linear-Regression")


Successfully registered model 'sk-learn-Linear-Regression'.
2025/06/07 19:44:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-Linear-Regression, version 1
Created version '1' of model 'sk-learn-Linear-Regression'.


<ModelVersion: aliases=[], creation_timestamp=1749318241916, current_stage='None', description='', last_updated_timestamp=1749318241916, name='sk-learn-Linear-Regression', run_id='75b35186fa3b4db6ab84aef3d2706861', run_link='', source='mlflow-artifacts:/4/75b35186fa3b4db6ab84aef3d2706861/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='1'>