Homework from 01-Introduction: [link](https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/01-intro/homework.md)

# Imports

In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

# Helper Functions

In [2]:
def create_trip_duration_column(df):
    # trip duration
    df["trip_duration"] = df.dropOff_datetime - df.pickup_datetime
    # convert trip_duration to minutes
    df["trip_duration"] = df["trip_duration"].apply(lambda x: x.total_seconds()/60)

    return df

def create_pickup_plus_dropoff_id(df):
    df["PU_DO_ID"] = df["PUlocationID"].astype(str) + "_" + df["DOlocationID"].astype(str)

    return df

def filter_duration(df):
    print(f"""
        Number of records to drop: 
        {len(df) - 
        len(
            df[(df.trip_duration >= 1) & (df.trip_duration <= 60)]
        )}""")
    df = df[(df.trip_duration >= 1) & (df.trip_duration <= 60)]

    return df

# Load Data

In [3]:
df_train = pd.read_parquet("../data/fhv_tripdata_2021-01.parquet")
df_val = pd.read_parquet("../data/fhv_tripdata_2021-02.parquet")

# Q1

In [4]:
print(f"January data - Number of records: {len(df_train)}")

January data - Number of records: 1154112


# Feature Engineering

In [5]:
df_train = create_trip_duration_column(df_train)
df_val = create_trip_duration_column(df_val)

df_train = create_pickup_plus_dropoff_id(df_train)
df_val = create_pickup_plus_dropoff_id(df_val)

# Q2

In [6]:
print(f"January data - Average trip duration: {df_train.trip_duration.mean()}")

January data - Average trip duration: 19.167224093791006


# Univariate Analysis

In [11]:
# distribution of trip_duration
sns.histplot(df_train, x = "trip_duration");

KeyboardInterrupt: 

In [7]:
df_train.trip_duration.describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: trip_duration, dtype: float64

In [8]:
# due to some strange trip durations and to filter some data, lets filter based on trip_duration
df_train = filter_duration(df_train)
df_val = filter_duration(df_val)


        Number of records to drop: 
        44286

        Number of records to drop: 
        47579


# Select Features

In [15]:
# select only some features for the example
categorical_features = ["PUlocationID", "DOlocationID"]
numerical_features = ["trip_distance"]

# Q3

In [17]:
df_train[categorical_features] = df_train[categorical_features].fillna(-1)

In [20]:
print(f"""
January data - % of missing values for pickup location ID: 
{len(df_train[df_train.PUlocationID == -1]) / len(df_train)}
""")


January data - % of missing values for pickup location ID: 
0.8352732770722617



In [19]:
len(df_train)

1109826

# Prep Features

In [22]:
# tranform the categorical features with one hot encoding
# first we transform the categorial features in str type
df_train[categorical_features] = df_train[categorical_features].astype(str)
# then we transform the df into a matrix using DictVectorizer, 
# which does the OHE with cat features
train_dicts = df_train[categorical_features].to_dict(orient="records")
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical_features].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [23]:
y_train = df_train.trip_duration.values
y_val = df_val.trip_duration.values

# Q4

In [24]:
X_train.shape

(1109826, 525)

# Train Model

## Linear Regression

In [25]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

# Q5 and Q6

In [26]:
print(f"Training RMSE: {mean_squared_error(y_train, y_pred, squared = False)}")

Training RMSE: 10.528519388409808


In [27]:
y_pred_val = lr.predict(X_val)
print(f"Validation RMSE: {mean_squared_error(y_val, y_pred_val, squared = False)}")

Validation RMSE: 12.853397099552826
