In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error as mse

# Task 1

In [2]:
df = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')
df.shape

(2463931, 19)

# Task 2

In [3]:
def calculate_duration_minutes(df):
    df["duration"] = (
    df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    )
    df["duration"] = df["duration"].apply(
        lambda x: x.total_seconds()/60
    )
    return df

In [4]:
df = calculate_duration_minutes(df)

In [5]:
df["duration"].std()

46.44530513776499

# Task 3

In [6]:
def drop_outliers_duration(df):
    df.loc[df["duration"]<1,"duration"] = None
    df.loc[df["duration"]>60, "duration"] = None
    df = df.dropna(subset=["duration"])
    return df

In [7]:
drop_outliers_duration(df).shape[0]/df["duration"].shape[0]

0.9827547930522406

# Task 4

In [8]:
df = df.dropna(subset = ["duration"])
df["DOLocationID"] = df["DOLocationID"].astype('str')
df["PULocationID"] = df["DOLocationID"].astype('str')

In [9]:
data_records = df[
    ["DOLocationID", "PULocationID"]
].to_dict(orient='records')

In [10]:
len(data_records)

2421440

In [11]:
dv = DictVectorizer()

In [12]:
X_train = dv.fit_transform(data_records)

In [13]:
X_train.shape

(2421440, 522)

In [14]:
y_train = df["duration"].values

In [15]:
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [16]:
np.sqrt(mse(y_pred=model.predict(X_train), y_true=y_train))

7.808762189927155

# Task 5

In [17]:
df_val = pd.read_parquet('./data/yellow_tripdata_2022-02.parquet')
df_val = calculate_duration_minutes(df_val)
df_val = drop_outliers_duration(df_val)

In [18]:
df_val["DOLocationID"] = df_val["DOLocationID"].astype('str')
df_val["PULocationID"] = df_val["DOLocationID"].astype('str')

In [19]:
data_records_val = df_val[
    ["DOLocationID", "PULocationID"]
].to_dict(orient='records')
y_val = df_val["duration"]

In [20]:
X_val = dv.transform(data_records_val)

In [22]:
np.sqrt(mse(y_pred=model.predict(X_val), y_true=y_val))

8.576722546318786