In [19]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Monthly yellow_tripdata parquet files for 2023: the January frame is used
# to fit the model further down, the February frame to evaluate it.
january_path = "data/yellow_tripdata_2023-01.parquet"
february_path = "data/yellow_tripdata_2023-02.parquet"

df_january = pd.read_parquet(january_path)
df_february = pd.read_parquet(february_path)

### Q1. Downloading the data

In [4]:
# Number of columns in the raw January frame.
len(df_january.columns)

19

### Q2. Computing duration

In [5]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp.
# The timedelta column is converted with the vectorised `.dt.total_seconds()`
# accessor instead of a per-row Python lambda, which yields the same values
# without looping over every row in the interpreter.
df_january['duration'] = (
    df_january.tpep_dropoff_datetime - df_january.tpep_pickup_datetime
).dt.total_seconds() / 60

In [6]:
# Standard deviation of the January trip durations (minutes), computed
# before any outlier filtering is applied.
df_january['duration'].std()

42.594351241920904

### Q3. Dropping outliers

In [7]:
# Fraction of January trips whose duration lies in [1, 60] minutes
# (`between` includes both endpoints by default).
df_january.duration.between(1, 60).mean()

0.9812202822125979

In [8]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes the filtered result an independent frame, so assigning
# columns on it afterwards does not raise pandas' SettingWithCopyWarning.
df_january = df_january[(df_january.duration >= 1) & (df_january.duration <= 60)].copy()

### Q4. One-hot encoding

In [9]:
# Feature columns fed to the model (pickup / drop-off location IDs, going by
# the column names).
columns = [
    'PULocationID',
    'DOLocationID',
]

In [10]:
# Cast every feature column to str, one column at a time, so the IDs are
# handled as categorical labels rather than numbers by the vectorizer.
for column_name in columns:
    df_january[column_name] = df_january[column_name].astype(str)

In [11]:
# One {column name: value} dict per January trip, restricted to the feature
# columns; this list is what the DictVectorizer is fitted on.
train_dicts = (
    df_january
    .loc[:, columns]
    .to_dict(orient='records')
)

In [12]:
# Learn the one-hot feature mapping from the January records and encode them
# in the same call. `dv` is kept so other data can be encoded identically.
dv = DictVectorizer(sparse=True)  # sparse output is the default; stated explicitly
X_train = dv.fit_transform(train_dicts)

In [13]:
# Display the encoded training matrix; its repr reports the shape (rows x
# one-hot columns), dtype and number of stored non-zero entries.
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

### Q5. Training a model

In [14]:
# Regression target: the trip duration column, extracted as a NumPy array
# aligned row-for-row with the filtered January frame.
target = 'duration'
y_train = df_january[target].to_numpy()

In [15]:
# Fit a linear regression on the encoded January features.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the trailing expression displays the fitted model as before.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [17]:
# Predict on the same matrix the model was fitted on; these in-sample
# predictions are compared against y_train in the next cell.
y_pred = lr.predict(X_train)

In [21]:
# Training RMSE, in minutes (same unit as `duration`).
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root of the MSE is taken
# explicitly; this form works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

7.649261932106969

### Q6. Evaluating the model

In [29]:
# Trip duration in minutes for February: drop-off minus pick-up timestamp.
# Uses the vectorised `.dt.total_seconds()` accessor rather than a per-row
# Python lambda; the resulting values are the same.
df_february['duration'] = (
    df_february.tpep_dropoff_datetime - df_february.tpep_pickup_datetime
).dt.total_seconds() / 60

In [30]:
# Apply the same 1-60 minute (inclusive) duration filter used for January.
# `.copy()` detaches the filtered frame so that assigning columns on it
# afterwards does not raise pandas' SettingWithCopyWarning.
df_february = df_february[(df_february.duration >= 1) & (df_february.duration <= 60)].copy()

In [31]:
# Re-declares the same two feature columns that were used for the January
# (training) data; the value is unchanged.
columns = [
    'PULocationID',
    'DOLocationID',
]

In [32]:
# Cast every feature column to str, one column at a time, matching the
# string-typed IDs the vectorizer was fitted on.
for column_name in columns:
    df_february[column_name] = df_february[column_name].astype(str)

In [33]:
# One {column name: value} dict per February trip, restricted to the feature
# columns; this is the input for the already-fitted vectorizer.
test_dicts = (
    df_february
    .loc[:, columns]
    .to_dict(orient='records')
)

In [34]:
# Encode February with the vectorizer fitted on January (transform only, no
# refit), so X_test has the same columns the model was trained on.
# NOTE: per the DictVectorizer documentation, features not encountered
# during fit are silently ignored, so unseen location values add no column.
X_test = dv.transform(test_dicts)

In [35]:
# Observed durations for the filtered February trips, as a NumPy array.
y_test = df_february[target].to_numpy()

In [36]:
# Validation RMSE, in minutes: score the January-trained model on February.
# NOTE: this rebinds `y_pred`, replacing the training-set predictions.
y_pred = lr.predict(X_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
# the square root of the MSE is taken explicitly (works on every version).
mean_squared_error(y_test, y_pred) ** 0.5

7.811818743246608