In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Columns used as model inputs, and the column the model predicts.
features_ = ['PULocationID', 'DOLocationID']
target = 'duration'

def load_preprocess_data(data_path):
    """Load a month of trip records and turn them into model inputs.

    Reads the parquet file at ``data_path``, derives the trip duration in
    minutes from the pickup/dropoff timestamps, keeps only trips lasting
    between 1 and 60 minutes (both bounds inclusive) and casts the columns
    listed in ``features_`` to strings.

    Parameters
    ----------
    data_path : str or path-like
        Parquet file containing ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime`` and every column named in ``features_``.

    Returns
    -------
    X_ : list of dict
        One ``{feature_name: string_value}`` record per remaining trip.
    y_ : numpy.ndarray
        Values of the ``target`` column (duration in minutes), aligned
        with ``X_``.
    """
    # Read the parquet file
    df_ = pd.read_parquet(data_path)
    # Duration in minutes; the vectorized .dt accessor avoids a Python-level
    # call per row.
    trip_time = df_.tpep_dropoff_datetime - df_.tpep_pickup_datetime
    df_['duration'] = trip_time.dt.total_seconds() / 60
    # Keep trips within the duration limits. The explicit .copy() yields an
    # independent frame, so the column assignment below does not write into
    # a slice of the original (no SettingWithCopyWarning).
    within_limits = (df_['duration'] <= 60) & (df_['duration'] >= 1)
    df_ = df_[within_limits].copy()
    df_[features_] = df_[features_].astype(str)
    # Create the X values in dictionary form
    X_ = df_[features_].to_dict(orient='records')
    # Create the y values
    y_ = df_[target].values

    return X_, y_

### Q1. Downloading the data
Read the data for January. How many columns are there?

In [4]:
# Load the January trip records; the second entry of .shape is the column count.
df = pd.read_parquet('data/yellow_tripdata_2022-01.parquet')
df.shape[1]

19

### Q2. Computing duration
What's the standard deviation of the trip durations in January?

In [5]:
# Trip duration in minutes. The vectorized .dt accessor replaces a per-row
# Python lambda, which is slow on a frame with millions of rows.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
np.round(df['duration'].std(), 2)

46.45

### Q3. Dropping outliers
What fraction of the records are left after you drop the outliers?

In [6]:
# Share of trips lasting between 1 and 60 minutes (both bounds inclusive).
within_limits = df['duration'].between(1, 60)
int(within_limits.sum()) / len(df)

0.9827547930522406

### Q4. One-hot encoding
What's the dimensionality of the one-hot encoded feature matrix (number of columns)?

In [7]:
# Filtering data
df = df[(df['duration'] <=60) & (df['duration'] >=1)]
df[features_] = df[features_].astype(str)
X_dict = df[features_].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(X_dict)
X_train.shape

(2421440, 515)

### Q5. Training a model
What's the RMSE on train?

In [8]:
# Build the training inputs from the January data.
train_dicts, y_train = load_preprocess_data('data/yellow_tripdata_2022-01.parquet')

# Fit the vectorizer on the training records only; the same fitted `dv` is
# what any later data must be transformed with.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE as the square root of the MSE: the `squared=False` keyword is
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form runs on
# both old and new releases.
np.sqrt(mean_squared_error(y_train, y_pred))

6.986190933162326

### Q6. Evaluating the model
What's the RMSE on validation?

In [9]:
# Build the validation inputs from the February data.
X_valid, y_valid = load_preprocess_data('data/yellow_tripdata_2022-02.parquet')

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# validation matrix has the same columns the model was trained on.
y_pred_valid = lr.predict(dv.transform(X_valid))

# RMSE as the square root of the MSE: the `squared=False` keyword is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_valid, y_pred_valid))

7.786412831235261