# SQLite is used to validate models, and GitHub Codespaces is used for the runs.

# Question 3. Creating a pipeline and counting records

In [6]:
import pandas as pd

# Remote parquet file with the yellow-taxi trip data for 2023-03 (per the file name).
URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"

# Download and parse the file into a DataFrame in one step.
df = pd.read_parquet(path=URL)

# Row count of the raw, unfiltered frame.
print(f"Number of records: {len(df)}")

Number of records: 3403766


# Question 4. Data preparation

In [7]:
# Make sure both timestamp columns are datetimes so they can be subtracted.
# Bracket assignment is used instead of attribute assignment so the intent
# (overwrite an existing column) is explicit.
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

# Trip duration in minutes (seconds / 60).
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Keep only trips lasting between 1 and 60 minutes, both bounds inclusive.
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignment below does not raise SettingWithCopyWarning.
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

# Cast the location IDs to strings so the DictVectorizer used later treats
# them as categorical labels rather than numeric values.
df[['PULocationID', 'DOLocationID']] = df[['PULocationID', 'DOLocationID']].astype(str)

print(f"Filtered records: {len(df)}")

Filtered records: 3316216


# Question 5. Train a model

In [8]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

# Only the pickup and drop-off location IDs are used as features.
categorical = ['PULocationID', 'DOLocationID']

# One dict per row, e.g. {'PULocationID': ..., 'DOLocationID': ...}.
train_dicts = df.loc[:, categorical].to_dict('records')

# Fit the vectorizer on the row dicts and encode them in the same call.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Target: the trip duration column as a plain array.
y_train = df.duration.values

# Plain linear regression with default settings; fit() returns the estimator.
model = LinearRegression().fit(X_train, y_train)

print(f"Model intercept: {round(model.intercept_, 2)}")

Model intercept: 24.77


# Question 6. MLflow

In [1]:
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
import pickle
import pandas as pd

# Load and prepare data
# This cell repeats the load/prepare steps so it can run on its own.
df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet")

# Make sure both timestamp columns are datetimes so they can be subtracted.
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

# Trip duration in minutes (seconds / 60).
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Keep trips of 1 to 60 minutes (inclusive). The explicit .copy() detaches the
# filtered frame from the original so the assignment below does not raise
# SettingWithCopyWarning.
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

# Location IDs become strings so the DictVectorizer below treats them as
# categorical labels rather than numeric values.
df[['PULocationID', 'DOLocationID']] = df[['PULocationID', 'DOLocationID']].astype(str)

# Features and labels
# Features are the two location-ID columns, turned into one dict per row.
categorical = ['PULocationID', 'DOLocationID']
feature_frame = df[categorical]
train_dicts = feature_frame.to_dict(orient='records')

# Vectorize the row dicts; the fitted vectorizer is kept in `dv` because it is
# pickled and logged as an artifact further down.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Label: trip duration.
y_train = df.duration.values

# Train model
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# MLflow tracking setup
# Send everything to the tracking server at localhost:5000 and group the run
# under the "nyc-taxi-experiment" experiment.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("nyc-taxi-experiment")

with mlflow.start_run():
    # Parameters.
    # NOTE(review): "normalize" is logged as a hard-coded False rather than a
    # value read from `model` -- confirm it is meant to be recorded.
    mlflow.log_param("fit_intercept", model.fit_intercept)
    mlflow.log_param("normalize", False)

    # Metric: the estimator's score() on the training data itself
    # (for scikit-learn regressors this is R^2).
    train_score = model.score(X_train, y_train)
    mlflow.log_metric("train_score", train_score)

    # Model: log it under the "model" artifact path and register it
    # (a new version is created if the registered name already exists).
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="lin_reg_model_hw3",
    )

    # Vectorizer: pickle it to the working directory, then attach that file
    # to the run as an artifact.
    with open("dv.pkl", "wb") as dv_file:
        pickle.dump(dv, dv_file)
    mlflow.log_artifact("dv.pkl")

    print(f"✅ Model logged with train_score: {train_score}, fit_intercept: {model.fit_intercept}")

Registered model 'lin_reg_model_hw3' already exists. Creating a new version of this model...
2025/06/30 08:43:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lin_reg_model_hw3, version 2
Created version '2' of model 'lin_reg_model_hw3'.


✅ Model logged with train_score: 0.40810086281558566, fit_intercept: True
🏃 View run zealous-hog-301 at: http://localhost:5000/#/experiments/1/runs/f78d100da94648e0850985f665259d12
🧪 View experiment at: http://localhost:5000/#/experiments/1
