# 01-Intro Homework
NYC taxi data: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [156]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

### 1. EDA

In [157]:
# Load the January 2022 yellow-taxi trip file, report its (rows, columns)
# shape, and display the first three rows as a quick sanity check.
df = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (2463931, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0


### 2. Calculate derived column 'duration'

In [158]:
# Trip duration in minutes: drop-off time minus pickup time. The vectorized
# `.dt.total_seconds()` accessor converts the whole timedelta column at once
# instead of calling a Python lambda once per row.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
# Find the standard deviation of the "duration"
print(f"Standard div of the 'duration' column: {round(df['duration'].std(),2)}")

Standard div of the 'duration' column: 46.45


### 3. One-hot encoding using DictVectorizer

In [159]:
# Columns to treat as categorical features for the one-hot encoding below.
cat_vars = ["PULocationID", "DOLocationID"]

In [160]:
# Preview only: show the dtypes produced by casting the categorical columns
# to str. The result is not assigned, so `df` itself is left unchanged here.
df[cat_vars].astype('str').dtypes

PULocationID    object
DOLocationID    object
dtype: object

In [161]:
# Overwrite the categorical columns in `df` with their string form.
# DictVectorizer one-hot encodes string values, whereas numeric values would
# be passed through as plain numbers.
df[cat_vars] = df[cat_vars].astype(str)

In [162]:
# Display every column's dtype to confirm the cast above took effect.
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                     object
DOLocationID                     object
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

### 4. Refactoring: Reusable pipeline

#### 4.1 Read the parquet file

In [163]:
def read_dataframe(filename):
    """Load a taxi trip file and prepare it for trip-duration modelling.

    The file is read, a ``duration`` column (drop-off minus pickup, in
    minutes) is derived, trips outside the 1-60 minute range (inclusive) are
    dropped, and the location-ID columns are cast to ``str`` so they can be
    treated as categorical features. Progress information is printed along
    the way.

    The pickup/drop-off timestamp columns may use either the ``tpep_`` or the
    ``lpep_`` prefix; whichever complete pair is present in the file is used.

    Parameters
    ----------
    filename : str
        Path to the trip file. Must end in ``.csv`` or ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent (copied) frame holding only the rows that passed the
        duration filter, with the extra ``duration`` column and string-typed
        ``PULocationID`` / ``DOLocationID`` columns.

    Raises
    ------
    ValueError
        If ``filename`` has an unsupported extension.
    KeyError
        If the file has neither a ``tpep_*`` nor an ``lpep_*`` timestamp pair.
    """
    print(f"Input filename: {filename}")
    path = str(filename)
    if path.endswith('.csv'):
        df = pd.read_csv(path)
    elif path.endswith('.parquet'):
        df = pd.read_parquet(path)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported file type for {path!r}: expected a .csv or .parquet file"
        )

    # Pick whichever timestamp naming scheme the file actually uses, so the
    # same code path serves both the CSV and the parquet inputs.
    for prefix in ("tpep", "lpep"):
        pickup_col = f"{prefix}_pickup_datetime"
        dropoff_col = f"{prefix}_dropoff_datetime"
        if pickup_col in df.columns and dropoff_col in df.columns:
            break
    else:
        raise KeyError(
            "Expected tpep_/lpep_ pickup and dropoff datetime columns, "
            f"found: {list(df.columns)}"
        )

    # CSV timestamps arrive as text; for already-parsed datetime columns this
    # conversion leaves the values unchanged.
    df[pickup_col] = pd.to_datetime(df[pickup_col])
    df[dropoff_col] = pd.to_datetime(df[dropoff_col])

    # Vectorized conversion of the timedelta column to minutes.
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    original_shape = df.shape
    print(f"before filtering shape: {original_shape}")
    # .copy() makes the filtered frame independent of the full one, so the
    # column assignment below does not trigger SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    modified_shape = df.shape
    print(f"after filtering shape: {modified_shape}")

    # Guard against an empty input file (would otherwise divide by zero).
    if original_shape[0]:
        left_percentage = round(modified_shape[0]*100/original_shape[0],2)
    else:
        left_percentage = 0.0
    print(f"Percentage of records left in df after removing outliers: {left_percentage}%")

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)
    print()
    return df

In [164]:
# Build the training frame from the January 2022 file and the validation
# frame from the February 2022 file, using the same preprocessing for both.
df_train = read_dataframe('./data/yellow_tripdata_2022-01.parquet')
df_val   = read_dataframe('./data/yellow_tripdata_2022-02.parquet')

# Report the post-filter sizes of both splits.
print(f"shape of df_train: {df_train.shape}")
print(f"shape of df_val. : {df_val.shape}")

Input filename: ./data/yellow_tripdata_2022-01.parquet
before filtering shape: (2463931, 20)
after filtering shape: (2421440, 20)
Percentage of records left in df after removing outliers: 98.28%

Input filename: ./data/yellow_tripdata_2022-02.parquet
before filtering shape: (2979431, 20)
after filtering shape: (2918187, 20)
Percentage of records left in df after removing outliers: 97.94%

shape of df_train: (2421440, 20)
shape of df_val. : (2918187, 20)


In [165]:
# Feature columns fed to the DictVectorizer; these are the same columns that
# read_dataframe casts to str.
categorical = ["PULocationID", "DOLocationID"] 

In [166]:
# Turn each split's categorical columns into a list of per-row dicts
# (one {column: value} mapping per trip), training split first.
train_dicts, validation_dicts = (
    split[categorical].to_dict(orient="records")
    for split in (df_train, df_val)
)

#### 4.2 Configure categorical features and output column

In [167]:
dv = DictVectorizer()
# Fit the vectorizer on the training records and build the model's training
# feature matrix in one step.
X_train = dv.fit_transform(train_dicts)
# Display the resulting matrix summary.
X_train

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

In [168]:
# Encode the validation records with the vectorizer already fitted on the
# training data (transform only, no refit), so both matrices share the same
# feature columns.
X_validation = dv.transform(validation_dicts)
X_validation

<2918187x515 sparse matrix of type '<class 'numpy.float64'>'
	with 5836368 stored elements in Compressed Sparse Row format>

In [169]:
# Name of the column the model predicts.
target = 'duration'
# Pull the target out of each split as a plain array, training split first.
y_train, y_val = df_train[target].values, df_val[target].values

#### 4.3 Build model

In [170]:
# Fit a linear regression on the encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the same data the model was trained on to get the training error.
y_pred = lr.predict(X_train)

# RMSE computed as the square root of the MSE. The `squared=False` shortcut is
# avoided because that argument is deprecated and later removed in newer
# scikit-learn releases; this form works on every version.
train_RMSE = round(mean_squared_error(y_train, y_pred) ** 0.5, 2)
print(f"Train RMSE: {train_RMSE}")

Train RMSE: 6.99


In [171]:
# Predict on the held-out validation features with the already-fitted model.
y_val_pred = lr.predict(X_validation)

# RMSE computed as the square root of the MSE. The `squared=False` shortcut is
# avoided because that argument is deprecated and later removed in newer
# scikit-learn releases; this form works on every version.
validation_RMSE = round(mean_squared_error(y_val, y_val_pred) ** 0.5, 2)
print(f"Validation RMSE: {validation_RMSE}")

Validation RMSE: 7.79
