In [3]:
import os
import pickle
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
from numpy import ndarray

# Exercise Requirements

- Update the dependencies

```
pip install -r requirements.txt

```

- Download the files for Jan and Feb 2022 by running these commands:

```
python3 discovery.py --url https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet --prefix yellow
python3 discovery.py --url https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet --prefix yellow
```



##  Downloading the data

Read the data for January 2022. 
How many columns are there?

In [4]:
def read_parquet(file_path : str, engine : str = "fastparquet") -> pd.DataFrame:
    """Load a parquet file into a DataFrame.

    Args:
        file_path: Location of the parquet file to read.
        engine: Parquet backend handed to ``pd.read_parquet``. Defaults to
            ``"fastparquet"``, which is what this notebook originally
            hard-coded; pass e.g. ``"pyarrow"`` when fastparquet is not
            installed.

    Returns:
        The contents of the file as a DataFrame.
    """
    return pd.read_parquet(file_path, engine=engine)

# Load the January 2022 yellow-taxi file and report how many columns it has.
# df_jan is reused (and extended with a duration column) by the cells below.
file_jan = "../data/yellow_tripdata_2022-01.parquet"
df_jan = read_parquet(file_jan)    
print(F" Jan 2022 Columns {df_jan.columns.size}")

 Jan 2022 Columns 19


##  Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.
What's the standard deviation of the trip durations in January?

In [5]:

def add_duration(df : pd.DataFrame) -> pd.DataFrame:
    """Add a ``duration`` column holding each trip's length in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` converted
    from a timedelta to fractional minutes.

    Note:
        The column is written onto ``df`` itself (the input is modified in
        place) and that same object is returned.

    Args:
        df: Trips with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
            datetime columns.

    Returns:
        ``df``, now carrying a float ``duration`` column.
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # Vectorized conversion: avoids calling a Python lambda once per row and
    # always yields a float column, even when the frame is empty.
    df['duration'] = elapsed.dt.total_seconds() / 60
    return df

def get_duration_std_dev(df : pd.DataFrame) -> float:
    """Return the standard deviation of the ``duration`` column.

    Uses ``Series.std`` with its default arguments, so the result is in the
    same unit as the column itself.
    """
    return df['duration'].std()

# Add the duration column to the January trips, then report its spread.
df_jan = add_duration(df_jan)
std_dev_jan = get_duration_std_dev(df_jan)
# Durations are in minutes, so the standard deviation is too (it is not a percentage).
print("Standard Deviation of Jan 2022 Trip Durations: {:.2f} minutes".format(std_dev_jan))

Standard Deviation of Jan 2022 Trip Durations: 46.45%


##  Dropping outliers

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records is left after you drop the outliers?

In [6]:
def drop_outliers(df : pd.DataFrame, min = 1, max = 60) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``duration`` lies in ``[min, max]``.

    Both bounds are inclusive. ``df`` itself is not modified; the result keeps
    the original index labels of the surviving rows.

    Args:
        df: Frame with a numeric ``duration`` column.
        min: Smallest duration to keep.
        max: Largest duration to keep.
    """
    durations = df['duration']
    within_bounds = (durations >= min) & (durations <= max)
    return df[within_bounds]

def get_percentage_change(df_original : pd.DataFrame, df_filtered : pd.DataFrame) -> float:
    """Return the row count of ``df_filtered`` as a percentage of ``df_original``.

    Despite the name, this is the share of rows that remain, not a change:
    identical inputs give 100.0.

    Raises:
        ZeroDivisionError: If ``df_original`` has no rows.
    """
    rows_kept = len(df_filtered)
    rows_total = len(df_original)
    share_kept = rows_kept / rows_total
    return share_kept * 100

# Keep only the January trips inside drop_outliers' default bounds (1 to 60 minutes, inclusive).
df_jan_filtered = drop_outliers(df_jan)
# Share of the January rows that survived the filter, as a percentage.
percentage_remaining = get_percentage_change(df_jan, df_jan_filtered)
print("percentage of Rows Remaining: {:.2f}%".format(percentage_remaining))


percentage of Rows Remaining: 98.28%


## One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it
- What's the dimensionality of this matrix (number of columns)?

In [15]:
from sklearn.feature_extraction import DictVectorizer

# A single DictVectorizer instance is shared by the training and validation data:
# it is fitted on the training rows (fit_transform) and then only applied
# (transform) to the validation rows.
dv = DictVectorizer()

def get_feature_matrix(dv:DictVectorizer, df : pd.DataFrame, features: List[str], training = True) -> ndarray:
    """Build the model's feature matrix from the selected columns of ``df``.

    The chosen columns are cast to strings, turned into one dict per row and
    passed through ``dv``.

    Args:
        dv: Vectorizer used for the encoding. It is (re)fitted when
            ``training`` is true and only applied otherwise.
        df: Source rows.
        features: Names of the columns to encode.
        training: True for training data (``fit_transform``); False for
            validation/new data (``transform`` with the already fitted ``dv``).

    Returns:
        Whatever ``dv`` produces: one row per record of ``df``.
        NOTE(review): the annotation says ``ndarray``, but a DictVectorizer
        built with default arguments presumably returns a SciPy sparse matrix;
        confirm before relying on ndarray-only methods.
    """
    # Values are stringified first so the location IDs are treated as
    # categories rather than numbers.
    records = df[features].astype(str).to_dict(orient='records')

    if training:
        return dv.fit_transform(records)
    return dv.transform(records)

# Columns that are one-hot encoded into the feature matrix.
features = ["PULocationID", "DOLocationID"]
# Name of the label column. The cells visible here read "duration" directly
# rather than through this list.
target = ["duration"]

In [16]:

# Fit the shared vectorizer on the filtered January trips (training defaults to True).
X_train = get_feature_matrix(dv, df_jan_filtered, features)

# Target values: trip duration in minutes for the same rows.
y_train = df_jan_filtered["duration"].values

# Dimensionality of the feature matrix: rows are trips, columns are encoded features.
num_rows, num_cols = X_train.shape

# print("number of rows:", num_rows)
print("number of cols:", num_cols)


number of cols: 515


## Training a model

Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters
- Calculate the Root Mean Square Error (RMSE) of the model on the training data

What's the RMSE on train?

In [9]:
import joblib
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def calculate_rmse(model: LinearRegression, X: ndarray, y: List[int]) -> float:
    """Return the root mean squared error of ``model``'s predictions on ``X``.

    Works for any dataset (training or validation), not just test data.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X: Feature matrix to predict on.
        y: True target values, aligned with the rows of ``X``.

    Returns:
        The RMSE, expressed in the same unit as ``y``.
    """
    y_pred = model.predict(X)

    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    return float(np.sqrt(mean_squared_error(y, y_pred)))

def get_model(model_name: str, X: ndarray, y: List[int] ) -> LinearRegression:
    """Train a linear regression on ``(X, y)`` and save it under ``../models``.

    Despite the name, nothing is loaded from disk: a new model is fitted on
    every call and any existing file with the same name is overwritten.

    Args:
        model_name: Base name of the output file; the model is written to
            ``../models/<model_name>.pkl``.
        X: Training feature matrix.
        y: Training target values.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    model_path = F'../models/{model_name}.pkl'
    # joblib.dump does not create missing folders, so make sure the target
    # directory exists before writing.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    model = LinearRegression().fit(X, y)
    joblib.dump(model, model_path)

    return model


# Train (and save) the model on the January features, then score it on that same data.
model = get_model("yellow_2022_linear_model", X_train, y_train)

rmse_train = calculate_rmse(model, X_train, y_train)
# RMSE carries the target's unit (minutes of trip duration); it is not a percentage.
print("RMSE Train of Jan 2022 Trip Durations: {:.2f} minutes".format(rmse_train))

RMSE Train of Jan 2022 Trip Durations: 6.99%


## Evaluating the model

Now let's apply this model to the validation dataset (February 2022).

What's the RMSE on validation?

In [10]:
# Prepare the February 2022 validation set with the same steps used for January:
# load the file, add the duration column, then drop trips outside 1-60 minutes.
file_feb = "../data/yellow_tripdata_2022-02.parquet"
df_feb = read_parquet(file_feb)   
df_feb = add_duration(df_feb)
df_feb_filtered = drop_outliers(df_feb)


In [17]:
# Encode the February trips with the vectorizer already fitted on January
# (training=False, so only transform is applied).
X_test = get_feature_matrix(dv, df_feb_filtered, features, False)

# Validation target values: trip duration in minutes.
y_test = df_feb_filtered["duration"].values

# RMSE on February 2022; the value is in minutes, not a percentage.
rmse_test = calculate_rmse(model, X_test, y_test)
print("RMSE Test of Feb 2022 Trip Durations: {:.2f} minutes".format(rmse_test))

RMSE Test of Feb 2022 Trip Durations: 7.79%
