In [2]:
import pickle
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [5]:
# Data-wrangling helper shared by the training and evaluation cells

# Location-ID columns that are fed to the model as categorical (string) features
categorical = ["PULocationID", "DOLocationID"]

def read_dataframe(filename):
    """Load a trip-data parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), converts
    the columns listed in ``categorical`` to strings, and keeps only the trips
    whose duration lies between 1 and 60 minutes (both bounds inclusive).

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, including the added ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Compute trip durations in minutes. The vectorised ``.dt`` accessor
    # replaces a per-row Python lambda and always produces a float column,
    # so the range filter below also works when the frame has no rows.
    elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = elapsed.dt.total_seconds() / 60

    # Missing location IDs become "-1"; the int round-trip drops any float
    # suffix (e.g. 10.0 -> "10") before the IDs are turned into strings.
    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')

    # Remove outliers: keep durations from 1 to 60 minutes, bounds included
    df = df[df["duration"].between(1, 60)]

    return df

In [3]:
# Training data: load and clean the trips stored in the 2023-01 parquet file
df_train = read_dataframe(filename='data/yellow_tripdata_2023-01.parquet')

In [4]:
# Print the index range, column names and dtypes of the cleaned training frame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3009173 entries, 0 to 3066765
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           object        
 8   DOLocationID           object        
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [5]:
# Target vector: the trip duration column of the training frame
target = 'duration'
y_train = df_train[target].values

# One dict per row, restricted to the categorical columns
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the vectorizer on the training records and build the feature matrix
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [6]:
### Build Model

# Fit an ordinary least-squares regression on the vectorised training features
lr = LinearRegression()
lr.fit(X_train, y_train)

# In-sample predictions (the model is scored on the data it was fitted on)
y_pred = lr.predict(X_train)

# Root-mean-squared error. The square root is taken explicitly instead of
# passing `squared=False`, which scikit-learn deprecated in 1.4 and removed in
# later releases, so this line works on both old and new versions.
rmse = mean_squared_error(y_train, y_pred) ** 0.5

# Alias kept so anything that still reads `MSE` keeps working; note that the
# value is the ROOT mean squared error, not the MSE.
MSE = rmse
rmse

7.649262236295703

In [7]:
# Persist the fitted vectorizer together with the regressor as a single tuple
with open('model.bin', 'wb') as model_file:
    pickle.dump((dv, lr), model_file)

In [3]:
# Restore the (vectorizer, regressor) tuple stored in model.bin.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

In [9]:
### Model Evaluation

# Validation data: the 2023-03 trip file, cleaned the same way as training
df_val = read_dataframe('data/yellow_tripdata_2023-03.parquet')

# Target vector
target = 'duration'
y_val = df_val[target].values

# Encode the categorical columns with the already-fitted vectorizer
# (transform only - the vectorizer is not refitted here)
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Predicted durations for the validation trips
y_pred = model.predict(X_val)

In [10]:
# Standard deviation of the predictions currently held in y_pred
y_pred.std()

6.247499281793546

In [6]:
### Model Evaluation

# Test data: the 2023-04 trip file, cleaned the same way as training
df_test = read_dataframe('data/yellow_tripdata_2023-04.parquet')

# Target vector
target = 'duration'
y = df_test[target].values

# Encode the categorical columns with the already-fitted vectorizer
# (transform only - the vectorizer is not refitted here)
dicts = df_test[categorical].to_dict(orient='records')
X = dv.transform(dicts)

# Predicted durations for the test trips
y_pred = model.predict(X)

# Average predicted duration
y_pred.mean()

14.292283387821614