In [2]:
# Show the version of the `python` executable found on the shell PATH.
# The leading `!` hands the rest of the line to the system shell.
# NOTE(review): this may not be the interpreter the kernel runs on — confirm.
!python -V

Python 3.10.6


In [5]:
# Display the installed Prefect version as the cell's output.
# NOTE(review): no other visible cell uses `prefect`; presumably this only
# records the environment — confirm.
import prefect
prefect.__version__

'2.10.8'

In [1]:
import pandas as pd

import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [7]:
# Load the January 2022 yellow-taxi trip file; the path is relative to the
# directory the notebook is run from.
jan_df = pd.read_parquet('data/yellow_tripdata_2022-01.parquet')

In [5]:
# Row count of the January frame at this point; a later cell divides by it
# to report how many rows survive the duration filter.
old_len = jan_df.shape[0]
print(old_len)

2463931


In [6]:
# Number 1: how many columns the January frame has.
n_columns = len(jan_df.columns)
print('The are', n_columns, 'columns in the January dataset')

The are 19 columns in the January dataset


In [7]:
# Trip duration in minutes. The `.dt` accessor converts the whole timedelta
# column in one vectorised step instead of calling a Python lambda per row,
# and the column is written once rather than created and then overwritten.
jan_df['duration'] = (
    jan_df['tpep_dropoff_datetime'] - jan_df['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [8]:
# Number 2: spread of trip durations (minutes) before outliers are removed.
duration_std = jan_df['duration'].std()
print('The standard deviation of the trips duration in January', duration_std)

The standard deviation of the trips duration in January 46.44530513776802


In [9]:
# Keep trips lasting 1 to 60 minutes; `between` includes both bounds.
# The explicit copy makes the result an independent frame, so the column
# assignment a later cell performs on `jan_df` is unambiguous and does not
# raise SettingWithCopyWarning.
jan_df = jan_df[jan_df['duration'].between(1, 60)].copy()

In [10]:
# Row count after the duration filter has been applied.
new_len = jan_df.shape[0]
print(new_len)

2421440


In [11]:
# Number 3: share of rows that survived the duration filter.
# NOTE(review): the message says "fraction" but the value is scaled by 100,
# i.e. it is a percentage.
kept_share = (new_len / old_len) * 100
print('The fraction of the records left after we dropped the outliers is', kept_share)

The fraction of the records left after we dropped the outliers is 98.27547930522405


In [12]:
# Pickup / drop-off zone IDs are used as categorical features, so they are
# cast to strings before being turned into feature dicts.
categorical = ['PULocationID', 'DOLocationID']

jan_df[categorical] = jan_df[categorical].astype({column: str for column in categorical})

In [13]:
# Build one {column: value} dict per trip, then fit the vectoriser on those
# records and encode them in the same call.
dv = DictVectorizer()

train_dicts = jan_df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [14]:
# Number 4: number of columns in the encoded training matrix.
n_features = X_train.shape[1]
print('The dimensionality of the matrix is', n_features)

The dimensionality of the matrix is 515


In [15]:
# Training target: the duration column as a NumPy array.
target = 'duration'
y_train = jan_df[target].to_numpy()

# Plain linear regression on the encoded location features.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [16]:
# Number 5: RMSE of the model on the data it was trained on.
y_pred = lr.predict(X_train)

# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 (scheduled for removal in 1.6), whereas this
# form gives the same value on both old and new releases.
train_rmse = mean_squared_error(y_train, y_pred) ** 0.5

In [17]:
# Report the training-set RMSE stored in `train_rmse`.
print('The RMSE on train dataset is', train_rmse)

The RMSE on train dataset is 6.986191065500608


In [18]:
def read_dataframe(filename):
    """Load a trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file that contains the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only the trips that lasted between
        1 and 60 minutes (inclusive), with an added float ``duration``
        column in minutes and both location ID columns cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV stores timestamps as text, so convert them before subtracting.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `df`; fail early with a clear message instead.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised conversion of the timedelta column to minutes.
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Copy after filtering so the assignment below writes to an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    df = df[df['duration'].between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [19]:
# Build the validation frame from the February 2022 file using the
# `read_dataframe` helper defined above.
df_val = read_dataframe('data/yellow_tripdata_2022-02.parquet')

In [20]:
# Preview the first rows of the prepared validation frame.
df_val.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,1,2022-02-01 00:06:58,2022-02-01 00:19:24,1.0,5.4,1.0,N,138,252,1,17.0,1.75,0.5,3.9,0.0,0.3,23.45,0.0,1.25,12.433333
1,1,2022-02-01 00:38:22,2022-02-01 00:55:55,1.0,6.4,1.0,N,138,41,2,21.0,1.75,0.5,0.0,6.55,0.3,30.1,0.0,1.25,17.55
2,1,2022-02-01 00:03:20,2022-02-01 00:26:59,1.0,12.5,1.0,N,138,200,2,35.5,1.75,0.5,0.0,6.55,0.3,44.6,0.0,1.25,23.65
3,2,2022-02-01 00:08:00,2022-02-01 00:28:05,1.0,9.88,1.0,N,239,200,2,28.0,0.5,0.5,0.0,3.0,0.3,34.8,2.5,0.0,20.083333
4,2,2022-02-01 00:06:48,2022-02-01 00:33:07,1.0,12.16,1.0,N,138,125,1,35.5,0.5,0.5,8.11,0.0,0.3,48.66,2.5,1.25,26.316667


In [21]:
# Encode the validation trips with the vectoriser that was already fitted
# (`transform`, not `fit_transform`), so X_val gets the training columns.
# `categorical` here is the notebook-level list from an earlier cell, not the
# local variable of the same name inside `read_dataframe`.
val_dicts = df_val[categorical].to_dict(orient='records')

X_val = dv.transform(val_dicts)

In [22]:
# Sanity check: (rows, columns) of the encoded validation matrix.
X_val.shape

(2918187, 515)

In [23]:
# Validation target: the duration column of the validation frame as a NumPy array.
target = 'duration'
y_val = df_val[target].to_numpy()

In [24]:
# Number 6: RMSE of the fitted model on the validation data.
y_pred_val = lr.predict(X_val)

# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 (scheduled for removal in 1.6), whereas this
# form gives the same value on both old and new releases.
val_rmse = mean_squared_error(y_val, y_pred_val) ** 0.5

In [25]:
# Report the validation-set RMSE stored in `val_rmse`.
print('The RMSE on validation dataset is', val_rmse)

The RMSE on validation dataset is 7.786408015215065
