In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def read_data(path):
    """Load a parquet file of trip records and prepare it for modelling.

    Steps, in order:
      1. Read the parquet file at ``path``.
      2. Add a ``duration`` column: minutes between ``pickup_datetime`` and
         ``dropOff_datetime``.
      3. Keep only trips whose duration is between 1 and 60 minutes
         (both bounds inclusive).
      4. Replace missing ``PUlocationID`` values with ``-1``.
      5. Cast ``PUlocationID`` and ``DOlocationID`` to ``str`` so they are
         handled as categories rather than numbers downstream.

    Note that missing ``DOlocationID`` values are *not* filled; after the
    cast they appear as the string ``"nan"``.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column and
        string-typed location ID columns.
    """
    trips = pd.read_parquet(path)

    # Trip length in minutes. The `.dt` accessor converts the whole timedelta
    # column in one vectorized call instead of running a Python lambda per
    # row, and it yields a float column even when the frame is empty, so the
    # numeric range filter below always applies.
    elapsed = trips["dropOff_datetime"] - trips["pickup_datetime"]
    trips["duration"] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends: 1 <= duration <= 60.
    trips = trips[trips["duration"].between(1, 60)]

    # Missing values: -1 marks an unknown pickup location.
    trips = trips.fillna(value={"PUlocationID": -1})

    # Categorical features: stringify the IDs (this is only the cast; any
    # one-hot encoding happens later, outside this function).
    categorical = ["PUlocationID", "DOlocationID"]
    trips[categorical] = trips[categorical].astype(str)

    return trips

In [3]:
# Load the raw January 2021 trip records; the cells below prepare them
# step by step and use them as the training set.
df = pd.read_parquet("fhv_tripdata_2021-01.parquet")


In [4]:
# Compute the duration variable (trip length in minutes).

# Subtracting the two datetime columns gives a timedelta column; the `.dt`
# accessor converts all of it in one vectorized call instead of invoking a
# Python lambda once per row.
td = df["dropOff_datetime"] - df["pickup_datetime"]
df["duration"] = td.dt.total_seconds() / 60
print(df["duration"].mean())
print(len(df))
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df = df[(df.duration >= 1) & (df.duration <= 60)]
print(len(df))

# Missing values: scan the pickup column once and reuse the count for both
# the absolute number and the ratio.
missing_pickups = df.PUlocationID.isna().sum()

print("Missing values count for PUlocationID is", missing_pickups)

print("Ratio is", missing_pickups / len(df))

# -1 marks an unknown pickup location.
df = df.fillna(value={"PUlocationID":-1})

19.1672240937939
1154112
1109826
Missing values count for PUlocationID is 927008
Ratio is 0.8352732770722617


In [6]:
# Categorical features: cast the location IDs to strings so they are treated
# as categories rather than numbers. This is only the cast; the one-hot
# encoding itself is done later by DictVectorizer.

categorical = ["PUlocationID", "DOlocationID"]
df[categorical] = df[categorical].astype(str)

In [7]:
# Regression target: the trip duration column, as a plain array.
target = "duration"
y_train = df[target].to_numpy()

# One dict per trip, keyed by the categorical column names.
train_dicts = df[categorical].to_dict(orient="records")

# Fit the vectorizer on the training rows and encode them in the same call;
# the fitted `dv` is kept so other data can be encoded with the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [8]:
# Fit a linear regression on the encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Training-set RMSE. The `squared=False` shortcut of mean_squared_error was
# deprecated in scikit-learn 1.4 and is slated for removal, so take the
# square root of the MSE explicitly; this works on every version.
y_pred = lr.predict(X_train)
mean_squared_error(y_train, y_pred) ** 0.5

10.528519107212382

In [9]:
# Build the validation frame from the February 2021 file using the
# read_data() helper defined above.
df_val = read_data("fhv_tripdata_2021-02.parquet")

In [10]:
# NOTE: read_data() already casts these two columns to str, so this repeats
# that cast on string data and leaves the values unchanged.
df_val[categorical] = df_val[categorical].astype(str)

In [11]:
# Validation target, read from the same column as the training target.
target = "duration"
y_val = df_val[target].to_numpy()

# One dict per validation trip, keyed by the categorical column names.
val_dicts = df_val[categorical].to_dict(orient="records")

# Encode with the vectorizer fitted on the training data (transform only,
# no refit), so the columns line up with X_train.
X_val = dv.transform(val_dicts)

In [12]:
# Display the prepared validation frame (pandas abbreviates the middle rows).
df_val

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.950000
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1.0,225.0,,B00037,13.800000
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1.0,61.0,,B00037,8.966667
...,...,...,...,...,...,...,...,...
1037687,B03282,2021-02-28 23:01:16,2021-02-28 23:14:48,-1.0,31.0,,B01717,13.533333
1037688,B03282,2021-02-28 23:36:10,2021-02-28 23:47:38,-1.0,169.0,,B01717,11.466667
1037689,B03285,2021-02-28 23:18:36,2021-02-28 23:43:59,28.0,171.0,,B03285,25.383333
1037690,B03285,2021-02-28 23:26:34,2021-02-28 23:44:37,16.0,252.0,,B03285,18.050000


In [13]:
# Display the prepared training frame (pandas abbreviates the middle rows).
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,,,B00009,17.000000
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,,,B00009,17.000000
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,-1.0,71.0,,B00037,9.050000
...,...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266,8.750000
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,,57.600000
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285,16.200000
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285,19.433333


In [16]:
# Validation-set RMSE. The `squared=False` shortcut of mean_squared_error was
# deprecated in scikit-learn 1.4 and is slated for removal, so take the
# square root of the MSE explicitly; this works on every version.
y_pred_val = lr.predict(X_val)
mean_squared_error(y_val, y_pred_val) ** 0.5

11.014283209452627