In [19]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [20]:
# Parquet files passed to load() below: January is loaded as the training
# frame (df_train), February as the validation frame (df_validation).
Jan_2023 = '../data/yellow_tripdata_2023-01.parquet'
Feb_2023 = '../data/yellow_tripdata_2023-02.parquet'

In [21]:
# Location-ID columns used as model features; load() casts them to str
# before they are handed to DictVectorizer.
categorical = ['PULocationID', 'DOLocationID']

def load(file: str) -> pd.DataFrame:
  """Read a trip-data parquet file and prepare it for modelling.

  Adds a ``duration`` column (drop-off minus pick-up, in minutes) and casts
  the columns listed in ``categorical`` to ``str``.

  Args:
    file: Path to a parquet file containing ``tpep_pickup_datetime``,
      ``tpep_dropoff_datetime`` and every column named in ``categorical``.

  Returns:
    The loaded frame with ``duration`` added and the categorical columns
    converted to strings. No rows are filtered out here.
  """
  df = pd.read_parquet(file)

  trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
  # Vectorised .dt accessor instead of a per-row Python lambda: same values,
  # but far cheaper on multi-million-row frames.
  df['duration'] = trip_time.dt.total_seconds() / 60

  df[categorical] = df[categorical].astype(str)

  return df

### Q1: How many columns are there?

In [22]:
# January data, with the duration column added by load(); not yet filtered.
df_train = load(Jan_2023)

In [23]:
# Number of columns in the loaded frame (including the derived `duration`).
df_train.shape[1]

20

### Q2: What's the standard deviation of the trip duration in January?

In [24]:
# Standard deviation of the unfiltered January durations, in minutes.
df_train['duration'].describe().loc['std']

42.594351241920904

### Q3: What fraction of the records is left after you drop the outliers?

In [25]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
cleaned_df_train = df_train.loc[
    (df_train['duration'] >= 1) & (df_train['duration'] <= 60)
]

In [26]:
# Share of rows that survive the duration filter, expressed as a percentage.
cleaned_df_train.shape[0] / df_train.shape[0] * 100

98.1220282212598

### Q4: What's the dimensionality of this matrix (number of columns)?

In [27]:
# One {column: value} dict per trip, built from the string-typed location IDs.
train_dict = cleaned_df_train.loc[:, categorical].to_dict(orient='records')
# Target: trip duration in minutes, as a plain array.
y_train = cleaned_df_train['duration'].values

# Fit the vectorizer on the training dicts; the fitted `dv` is reused later
# to encode the validation data with the same feature columns.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dict)

In [28]:
# (rows, feature columns) of the encoded training matrix; the second entry answers Q4.
x_train.shape

(3009173, 515)

### Q5: What's the RMSE on train?

In [29]:
# Linear regression with default settings; fitted in the next cell.
lr = LinearRegression()

In [30]:
# Fit on the encoded January features against the trip durations.
lr.fit(x_train, y_train)

LinearRegression()

In [31]:
# Predictions on the same data the model was fitted on (used for the training RMSE).
y_train_pred = lr.predict(x_train)

In [32]:
# RMSE on the training data. `squared=False` was deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE works on every version.
mean_squared_error(y_train, y_train_pred) ** 0.5

7.649261027855596

In [33]:
# Release the January objects before the February data is loaded;
# `dv` and `lr` are kept because the validation cells reuse them.
# NOTE(review): `cleaned_df_train` is not deleted here — confirm whether it is still needed.
del df_train, train_dict, x_train, y_train, y_train_pred

### Q6: What's the RMSE on validation?

In [39]:
# February data, filtered with the same 1-60 minute (inclusive) rule as the training set.
df_validation = load(Feb_2023)
cleaned_df_validation = df_validation.loc[
    (df_validation['duration'] >= 1) & (df_validation['duration'] <= 60)
]

In [40]:
# 'records' is the documented orient value; the abbreviation 'record' was
# deprecated (it triggered a FutureWarning) and is rejected by pandas >= 2.0.
val_dict = cleaned_df_validation[categorical].to_dict(orient='records')
y_val = cleaned_df_validation.duration.values

# The filtered frame is no longer needed once the feature dicts and target exist.
del cleaned_df_validation

# transform (not fit_transform): reuse the feature columns learned from the training data.
x_val = dv.transform(val_dict)

  val_dict = cleaned_df_validation[categorical].to_dict(orient='record')


In [41]:
# Predictions of the January-fitted model on the encoded February features.
y_val_pred = lr.predict(x_val)

In [42]:
# RMSE on the validation data. `squared=False` was deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE works on every version.
mean_squared_error(y_val, y_val_pred) ** 0.5

7.8118325109416915