# Week-01: Homework

In [15]:
import pickle
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error

## 1. Downloading and loading the data
We'll use the "Yellow Taxi Trip Records" from the NYC taxi dataset.

Download the data for January and February 2023, then load the two parquet files from the local `data` directory.

In [2]:
# Local parquet files holding the January and February 2023 trip records.
jan_filepath = "/workspaces/mlops-zoomcamp/data/yellow_tripdata_2023-01.parquet"
feb_filepath = "/workspaces/mlops-zoomcamp/data/yellow_tripdata_2023-02.parquet"

# Read both months with the same call; January is loaded first, then February.
jan_df, feb_df = (pd.read_parquet(path) for path in (jan_filepath, feb_filepath))

In [3]:
# Show how many columns the January frame has, followed by their names.
print(jan_df.shape[1], jan_df.columns)

19 Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')


## 2. Computing duration
Let's compute the duration variable. It should contain the duration of a ride in minutes.
`duration = tpep_dropoff_datetime - tpep_pickup_datetime`

In [4]:
# Inspect the column dtypes to check that the pickup/drop-off columns are
# datetimes before they are subtracted to compute the ride duration.
jan_df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [5]:
def calculateDuration(df):
    """Return a copy of ``df`` with a ``duration`` column in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` converted
    from a timedelta to fractional minutes. The value is negative when the
    drop-off timestamp precedes the pickup timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` datetime columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``duration`` column added (or replaced). The
        input frame is not modified, so re-running the calling cell is safe.
    """
    ride_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    # Use the vectorised ``.dt`` accessor rather than a per-row Python lambda;
    # the result is the same but it avoids one Python call per record.
    return df.assign(duration=ride_time.dt.total_seconds() / 60)

In [6]:
# Add the ride duration (in minutes) to both monthly frames, January first.
jan_df, feb_df = (calculateDuration(frame) for frame in (jan_df, feb_df))

In [7]:
# Summary statistics of the January ride durations (minutes).
jan_df["duration"].describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [8]:
# Summary statistics of the February ride durations (minutes).
feb_df["duration"].describe()

count    2.913955e+06
mean     1.601591e+01
std      4.284210e+01
min     -4.361667e+01
25%      7.250000e+00
50%      1.180000e+01
75%      1.876667e+01
max      7.053617e+03
Name: duration, dtype: float64

## 3. Dropping outliers
Next, let's check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the `duration` was between 1 and 60 minutes (inclusive).

In [9]:
def drop_outliers_by_duration(df, minD, maxD):
    """Keep only the rows whose ``duration`` lies in ``[minD, maxD]``.

    Both bounds are inclusive. The fraction of rows that survive the filter
    is printed before the filtered frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``duration`` column.
    minD, maxD :
        Lower and upper bound of the durations to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that satisfy the bounds, with their original index
        labels.
    """
    # Build the boolean mask once and reuse it for both the report and the
    # selection instead of evaluating the two comparisons twice.
    in_range = (df.duration >= minD) & (df.duration <= maxD)
    print("Fraction left: ", in_range.mean())
    return df[in_range]

In [10]:
# Keep rides lasting between 1 and 60 minutes (inclusive) in both months.
duration_bounds = {"minD": 1, "maxD": 60}
jan_df = drop_outliers_by_duration(jan_df, **duration_bounds)
feb_df = drop_outliers_by_duration(feb_df, **duration_bounds)

Fraction left:  0.9812202822125979
Fraction left:  0.9800944077722545


## 4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries (remember to re-cast the IDs to strings; otherwise `DictVectorizer` will treat them as numeric features instead of one-hot encoding them)
- Fit a dictionary vectorizer
- Get a feature matrix from it

In [11]:
def transform_locationID_toStr(df):
    """Return a copy of ``df`` with the location ID columns cast to strings.

    ``PULocationID`` and ``DOLocationID`` are converted with ``astype(str)``;
    every other column is carried over unchanged.

    The conversion builds a new frame instead of assigning columns on the
    argument. In this notebook the argument is the boolean-filtered frame
    returned by ``drop_outliers_by_duration``, and assigning columns on such
    a slice is the pattern pandas reports as ``SettingWithCopyWarning``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with both ID columns holding strings; ``df`` is not modified.
    """
    return df.astype({"PULocationID": str, "DOLocationID": str})

In [12]:
# Cast the pickup/drop-off location IDs to strings in both months, January first.
jan_df, feb_df = map(transform_locationID_toStr, (jan_df, feb_df))

In [13]:
# Feature configuration read by train_val_data: the model uses only the two
# location ID columns as features and predicts the ride duration.
categorical = ["PULocationID", "DOLocationID"]
numerical = []
target = "duration"


def train_val_data(train_df, val_df):
    """Build the feature matrices and target arrays for training and validation.

    The columns listed in ``categorical + numerical`` are turned into record
    dictionaries and vectorized with a ``DictVectorizer`` that is fitted on
    the training frame only and then applied to the validation frame. The
    shape of the training matrix is printed.

    Note that the fitted vectorizer is local to this function and is not
    part of the return value.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame
        Frames containing the feature columns and the ``target`` column.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_val, y_val)``.
    """
    feature_columns = categorical + numerical

    def as_records(frame):
        # One dict per row, restricted to the feature columns.
        return frame[feature_columns].to_dict(orient="records")

    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(as_records(train_df))
    print("Dimensionality of feature matrix:", x_train.shape)
    # Validation data reuses the vocabulary learned from the training data.
    x_val = vectorizer.transform(as_records(val_df))

    return x_train, train_df[target].values, x_val, val_df[target].values

In [14]:
# January is the training month, February the validation month.
x_train, y_train, x_val, y_val = train_val_data(jan_df, feb_df)

Dimensionality of feature matrix: (3009173, 515)


## 5. Training a model
Let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters
- Calculate the RMSE of the model on the training data

In [16]:
# Plain linear regression with default parameters, fitted on the January matrix.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [17]:
# Score the fitted model on the data it was trained on.
y_pred_on_train = model.predict(x_train)
train_rmse = root_mean_squared_error(y_train, y_pred_on_train)
print("RMSE on train: ", train_rmse)

RMSE on train:  7.649261932106969


In [18]:
# Score the same model on the validation (February) matrix.
y_pred_on_val = model.predict(x_val)
val_rmse = root_mean_squared_error(y_val, y_pred_on_val)
print("RMSE on validation: ", val_rmse)

RMSE on validation:  7.811818743246608


## 6. Saving the model
Finally, let's use `pickle` to store our trained model.

In [19]:
# Destination of the serialized model.
model_path = Path("/workspaces/mlops-zoomcamp/models/lin_reg.bin")
# Create the target directory when it is missing; otherwise opening the file
# for writing fails with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: only the regression model is pickled. The DictVectorizer fitted inside
# train_val_data is local to that function and is not saved here.
with model_path.open("wb") as f_out:
    pickle.dump(model, f_out)