In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [3]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

##  Downloading the data

In [21]:
jan_df = pd.read_parquet("data/yellow_tripdata_2023-01.parquet")
print("the data has size", jan_df.shape)
jan_df.head(2)

the data has size (3066766, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0


## Computing duration

In [22]:
jan_df['duration'] = jan_df.tpep_dropoff_datetime - jan_df.tpep_pickup_datetime

In [23]:
jan_df.duration = jan_df['duration'].dt.total_seconds().div(60).astype(int)

In [24]:
jan_df.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,8
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,6


In [25]:
jan_df.duration.describe()

count   3066766.000
mean         15.183
std          42.594
min         -29.000
25%           7.000
50%          11.000
75%          18.000
max       10029.000
Name: duration, dtype: float64

## Dropping outliers

In [26]:
jan_df.shape

(3066766, 20)

In [27]:
jan_df_out = jan_df[(jan_df.duration >= 1) & (jan_df.duration <= 60)]

In [28]:
jan_df_out.shape

(3011044, 20)

In [29]:
((3011044)/3066766)*100

98.18303711466737

## One-hot encoding

In [30]:
categorical = ['PULocationID', 'DOLocationID']
jan_df_out[categorical] = jan_df_out[categorical].astype(str)

In [31]:
train_dicts = jan_df_out[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [32]:
X_train.shape

(3011044, 515)

## Training a model. RMSE on train 

In [33]:
target = 'duration'
y_train = jan_df_out[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

In [34]:
y_pred = lr.predict(X_train)
print(mean_squared_error(y_train, y_pred, squared=False))

7.688653738618748


---

---

In [19]:
import pickle

In [6]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [9]:
df_train = read_dataframe('data/yellow_tripdata_2023-01.parquet')

In [10]:
df_val = read_dataframe('data/yellow_tripdata_2023-02.parquet')

In [11]:
len(df_train), len(df_val)

(3009173, 2855951)

In [12]:
#df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
#df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [13]:
categorical = ['PULocationID', 'DOLocationID']

dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [15]:
print(X_val.shape)
print(X_train.shape)

(2855951, 515)
(3009173, 515)


In [16]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [17]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.8118236307935

In [20]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

---