In [3]:
# !pip install pandas
# !pip install pyarrow
# !pip install seaborn
# !pip install scikit-learn

In [4]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Work from the data directory so the cells below can refer to files by bare name.
os.chdir('../data')

In [5]:
# List the files in the current working directory.
!ls

fhv_tripdata_2021-01.parquet  fhv_tripdata_2021-02.parquet


In [6]:
def process_df(path: str, dv = None):
    """Load a trip-record file and turn it into a feature matrix and target.

    The file is read, a ``duration`` column (minutes between
    ``pickup_datetime`` and ``dropOff_datetime``) is added, trips outside
    the 1-60 minute range are discarded, and the pickup/drop-off location
    ids are encoded as string categories (missing ids become ``"-1"``)
    before being passed through a dict vectorizer.

    Progress figures (shapes, mean duration, number of dropped rows, share
    of missing pickup ids) are printed as a side effect.

    Parameters
    ----------
    path : str
        Path to a ``.csv`` or ``.parquet`` file containing the columns
        ``pickup_datetime``, ``dropOff_datetime``, ``PUlocationID`` and
        ``DOlocationID``.
    dv : optional
        An already fitted vectorizer exposing ``transform``. When ``None``
        a new ``DictVectorizer`` is created and fitted on this file.

    Returns
    -------
    tuple
        ``(X, y, dv)``: the vectorized features, the durations in minutes
        as a NumPy array, and the vectorizer that produced ``X``.

    Raises
    ------
    ValueError
        If ``path`` does not end in ``.csv`` or ``.parquet``.
    """
    if path.endswith('.csv'):
        df = pd.read_csv(path)
    elif path.endswith('.parquet'):
        df = pd.read_parquet(path)
    else:
        # Without this branch `df` would be unbound and fail with a confusing error.
        raise ValueError(
            f'Unsupported file type for {path!r}: expected .csv or .parquet'
        )

    print(f'Initial shape is {df.shape}')

    # CSV files deliver timestamps as text, so parse explicitly; this is a
    # no-op for columns that are already datetimes (the parquet case).
    pickup = pd.to_datetime(df['pickup_datetime'])
    dropoff = pd.to_datetime(df['dropOff_datetime'])
    # Vectorised conversion to minutes instead of a Python-level apply per row.
    df['duration'] = (dropoff - pickup).dt.total_seconds() / 60

    print(f'Average trip duration: {df.duration.mean()}')

    num_records_0 = df.shape[0]
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice of the original frame.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    num_records_1 = df.shape[0]
    # before - after, so the reported count of dropped rows is non-negative.
    print(f'Dropped records = {num_records_0 - num_records_1}')
    print(f'New shape is {df.shape}')

    categorical = ['PUlocationID', 'DOlocationID']
    for col in categorical:
        # Missing ids become -1; the int cast strips the ".0" left by the
        # float representation before converting to string categories.
        df[col] = df[col].fillna(-1).astype('int32').astype('str')

    # Guard against an empty frame (every trip filtered out) to avoid a
    # division by zero when computing the percentage.
    if num_records_1:
        num_missing = int((df.PUlocationID == '-1').sum())
        missing_share = 100 * num_missing / num_records_1
    else:
        missing_share = 0.0
    print('The share of missing PUlocationID is', end = ' ')
    print(f'{round(missing_share, 2)}%')

    records = df[categorical].to_dict(orient='records')
    # Compare against None explicitly rather than relying on object truthiness.
    if dv is None:
        dv = DictVectorizer()
        X = dv.fit_transform(records)
    else:
        X = dv.transform(records)

    print(f'Shape of X is {X.shape}')

    y = df.duration.values
    return X, y, dv

In [7]:
# Build the training set from the 2021-01 file. No vectorizer is passed, so
# process_df fits a new DictVectorizer and returns it as `dv`.
X_train, y_train, dv = process_df('fhv_tripdata_2021-01.parquet')

Initial shape is (1154112, 7)
Average trip duration: 19.167224093791006
Dropped records = -44286
New shape is (1109826, 8)
The share of missing PUlocationID is 83.53%
Shape of X is (1109826, 525)


In [8]:
# Build the validation set from the 2021-02 file, passing the vectorizer fitted
# on the training file so it is only used to transform (not re-fitted); the
# vectorizer returned by the call is discarded.
X_val, y_val, _ = process_df('fhv_tripdata_2021-02.parquet', dv)

Initial shape is (1037692, 7)
Average trip duration: 20.706986225199763
Dropped records = -47579
New shape is (990113, 8)
The share of missing PUlocationID is 85.71%
Shape of X is (990113, 525)


In [9]:
# Fit a linear regression with default settings on the training features and durations.
model = LinearRegression()
model.fit(X_train, y_train)

In [10]:
# Training error: root mean squared error of the model's predictions on the
# same data it was fitted on. The bare last expression is the cell's output.
preds = model.predict(X_train)
mse = mean_squared_error(y_train, preds)
np.sqrt(mse)

10.528519389548583

In [11]:
# Validation error: root mean squared error on the validation set.
# Note that `preds` and `mse` from the training-error cell are overwritten here.
preds = model.predict(X_val)
mse = mean_squared_error(y_val, preds)
np.sqrt(mse)

11.014286585021035