In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("my-new-experiment ")

2025/11/14 12:58:26 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/11/14 12:58:26 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02experiment_tracking/mlruns/1', creation_time=1762560669311, experiment_id='1', last_update_time=1762560669311, lifecycle_stage='active', name='my-new-experiment ', tags={}>

In [3]:
pip install pyarrow


Note: you may need to restart the kernel to use updated packages.


In [4]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet')

In [5]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [7]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)    
    # convert duration to minutes
    df["duration"] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df["duration"] = df["duration"].dt.total_seconds() / 60

    # filter trips
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    # define features
    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']

    # convert categorical to string
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val   = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')


In [9]:
len(df_train) , len(df_val)

(73908, 61921)

In [10]:
df_train["PU_DO"] = df_train["PULocationID"] + "_" + df_train["DOLocationID"]
df_val["PU_DO"] = df_train["PULocationID"] + "_" + df_train["DOLocationID"]

In [11]:
df_train

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,PU_DO
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.50,...,0.00,0.00,,0.3,6.80,2.0,1.0,0.00,3.933333,43_151
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.00,...,2.81,0.00,,0.3,16.86,1.0,1.0,2.75,8.750000,166_239
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.00,...,1.00,0.00,,0.3,8.30,1.0,1.0,0.00,5.966667,41_42
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.00,...,0.00,0.00,,0.3,9.30,2.0,1.0,0.00,7.083333,168_75
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.50,...,0.96,0.00,,0.3,5.76,1.0,1.0,0.00,2.316667,75_75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,2,2021-01-31 21:38:00,2021-01-31 22:16:00,,,81,90,,17.63,56.23,...,0.00,6.12,,0.3,65.40,,,,38.000000,81_90
76514,2,2021-01-31 22:43:00,2021-01-31 23:21:00,,,35,213,,18.36,46.66,...,12.20,6.12,,0.3,65.28,,,,38.000000,35_213
76515,2,2021-01-31 22:16:00,2021-01-31 22:27:00,,,74,69,,2.50,18.95,...,0.00,0.00,,0.3,22.00,,,,11.000000,74_69
76516,2,2021-01-31 23:10:00,2021-01-31 23:37:00,,,168,215,,14.48,48.87,...,0.00,6.12,,0.3,58.04,,,,27.000000,168_215


In [12]:

categorical = ["PU_DO"]
numerical = ['trip_distance']

dv = DictVectorizer()

    
train_dic =df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dic)


val_dics= df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dics)


In [13]:
target = 'duration'
y_train = df_train[target].values
y_val= df_val[target].values

In [None]:
with mlflow.start_run():

    mlflow.set_tag("developer", "omar")
    # mlflow.log_param("model_type", "linear_regression")


    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(rmse)

    mlflow.log_metric("rmse" , rmse)




15.313541411272809




TypeError: argument of type 'LinearRegression' is not iterable

In [15]:
with open("/workspaces/modelslin_reg.bin" , "wb") as f_out :
    pickle.dump((dv, lr) , f_out)

