In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Model inputs: the column to predict and the categorical feature columns.
TARGET_VAL = 'duration'
CATEGORICAL_VAL = ['PULocationID', 'DOLocationID']

# Inclusive window (in minutes) of trip durations kept by drop_outliers.
LOWER_BOUNDARY, UPPER_BOUNDARY = 1, 60

# Parquet inputs: 2022-01 is used for training, 2022-02 for validation.
TRAIN_FILE_PATH = "/content/drive/MyDrive/MLOps/ytd/yellow_tripdata_2022-01.parquet"
TEST_FILE_PATH = "/content/drive/MyDrive/MLOps/ytd/yellow_tripdata_2022-02.parquet"

In [4]:
def read_parquet(file_path, print_status=False):
    """Q1. Downloading the data.

    Load the parquet file at ``file_path`` into a DataFrame and return it.
    When ``print_status`` is true, the number of columns is printed as well.
    """
    frame = pd.read_parquet(file_path)

    # Quiet mode: hand the frame back without reporting anything.
    if not print_status:
        return frame

    column_count = len(frame.columns)
    print(f"Q1. Number of columns: {column_count}.")
    return frame


def compute_duration(df, print_status):
    """ Q2. Computing duration """

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    if print_status:
        print(f"Q2. The standard deviation of the trip duration in Junuary is {df['duration'].std().round(2)}.")

    return df


def drop_outliers(df, lower_limit, higher_limit, print_status=False):
    """Q3. Dropping outliers.

    Keeps only the rows whose ``duration`` lies in the inclusive range
    ``[lower_limit, higher_limit]``. The input frame is not modified.

    Args:
        df: DataFrame with a numeric ``duration`` column.
        lower_limit: smallest duration to keep (inclusive).
        higher_limit: largest duration to keep (inclusive).
        print_status: when true, print the share of rows kept as a
            percentage rounded to a whole number.

    Returns:
        The filtered DataFrame.
    """
    prev_num_of_records = len(df)
    within_limits = (df.duration >= lower_limit) & (df.duration <= higher_limit)
    kept = df[within_limits]

    if print_status:
        # An empty input has no meaningful fraction; report 0.0 instead of
        # failing with ZeroDivisionError.
        if prev_num_of_records:
            fraction_of_left_records = len(kept) / prev_num_of_records
        else:
            fraction_of_left_records = 0.0
        # Round after scaling to percent: rounding first and multiplying by
        # 100 afterwards leaks float artefacts such as 56.99999999999999.
        print(f"Q3. The fraction of records left: {round(fraction_of_left_records * 100, 0)}.")

    return kept


def read_dataframe(file_path, categorical, lower_boundary, upper_boundary, print_status=False):
    """Load a trip file and prepare it for modelling.

    Runs the three preparation steps in order: read the parquet file, add the
    ``duration`` column, drop rows whose duration falls outside
    ``[lower_boundary, upper_boundary]``. The ``categorical`` columns are then
    cast to ``str``.

    Args:
        file_path: location of the parquet file to read.
        categorical: column name, or list of column names, to cast to str.
        lower_boundary: smallest duration to keep (inclusive).
        upper_boundary: largest duration to keep (inclusive).
        print_status: forwarded to each step so it prints its Q1-Q3 summary.

    Returns:
        The prepared DataFrame.
    """
    df = read_parquet(file_path, print_status=print_status)
    df = compute_duration(df, print_status=print_status)
    df = drop_outliers(df, lower_boundary, upper_boundary, print_status)

    # drop_outliers returns a boolean-mask selection of the original frame;
    # assigning columns onto it triggers pandas' SettingWithCopyWarning.
    # astype with a column -> dtype mapping builds a new frame instead.
    columns = [categorical] if isinstance(categorical, str) else list(categorical)
    return df.astype({column: str for column in columns})

In [5]:
def train_model(df, categorical, target, dv, lr, print_status=False):
    """Fit the vectorizer and the regression model on ``df`` (Q4 and Q5).

    ``dv`` and ``lr`` are fitted in place; nothing is returned.

    Args:
        df: training DataFrame containing the ``categorical`` columns and the
            ``target`` column.
        categorical: list of feature column names turned into record dicts.
        target: name of the column to predict.
        dv: vectorizer exposing ``fit_transform`` (a DictVectorizer in this
            notebook).
        lr: estimator exposing ``fit`` and ``predict`` (a LinearRegression in
            this notebook).
        print_status: when true, print the feature-matrix width and the RMSE
            on the training data.
    """
    # Q4. One-hot encoding
    train_dicts = df[categorical].to_dict(orient="records")
    X_train = dv.fit_transform(train_dicts)
    if print_status:
        print(f"Q4. The dimensionality of this matrix is {X_train.shape[1]}.")

    # Q5. Training a model
    y_train = df[target].values
    lr.fit(X_train, y_train)

    if print_status:
        # Predictions are only needed for the report, so skip them otherwise.
        y_pred = lr.predict(X_train)
        # Take the root of the MSE explicitly: the ``squared=False`` keyword
        # was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_train, y_pred) ** 0.5
        print(f"Q5. RMSE on train data: {round(rmse, 2)}")

In [6]:
def predict(df, categorical, target, dv, lr, print_status=False):
    """Q6. Evaluating the model.

    Transforms the ``categorical`` columns of ``df`` with the already-fitted
    ``dv``, predicts with ``lr`` and, when ``print_status`` is true, prints
    the RMSE against ``df[target]``. Nothing is returned.

    Args:
        df: validation DataFrame containing the ``categorical`` columns and
            the ``target`` column.
        categorical: list of feature column names turned into record dicts.
        target: name of the column holding the true values.
        dv: fitted vectorizer exposing ``transform``.
        lr: fitted estimator exposing ``predict``.
        print_status: when true, print the validation RMSE.
    """
    val_dicts = df[categorical].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    y_val = df[target].values
    y_pred = lr.predict(X_val)
    if print_status:
        # Take the root of the MSE explicitly: the ``squared=False`` keyword
        # was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        print(f"Q6. RMSE on validation data: {round(rmse, 2)}")

In [7]:
# Load both months with the same cleaning settings; only the training load
# prints the Q1-Q3 summaries.
train_df = read_dataframe(
    file_path=TRAIN_FILE_PATH,
    categorical=CATEGORICAL_VAL,
    lower_boundary=LOWER_BOUNDARY,
    upper_boundary=UPPER_BOUNDARY,
    print_status=True,
)
val_df = read_dataframe(
    file_path=TEST_FILE_PATH,
    categorical=CATEGORICAL_VAL,
    lower_boundary=LOWER_BOUNDARY,
    upper_boundary=UPPER_BOUNDARY,
    print_status=False,
)

Q1. Number of columns: 19.
Q2. The standard deviation of the trip duration in Junuary is 46.45.
Q3. The fraction of records left: 98.0.


In [8]:
# Create an unfitted vectorizer and regression model, then fit both on the
# training frame (train_model fits them in place).
dv, lr = DictVectorizer(), LinearRegression()
train_model(
    df=train_df,
    categorical=CATEGORICAL_VAL,
    target=TARGET_VAL,
    dv=dv,
    lr=lr,
    print_status=True,
)

Q4. The dimensionality of this matrix is 515.
Q5. RMSE on train data: 6.99


In [9]:
# Score the fitted vectorizer/model pair on the validation frame and print
# the resulting RMSE.
predict(
    df=val_df,
    categorical=CATEGORICAL_VAL,
    target=TARGET_VAL,
    dv=dv,
    lr=lr,
    print_status=True,
)

Q6. RMSE on validation data: 7.79
