In [52]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Read the data

In [53]:
jan_2021_trip_df = pd.read_parquet("/home/mkd/work/MLOps Zoomcamp/data/fhv_tripdata_2021-01.parquet")
feb_2021_trip_df = pd.read_parquet("/home/mkd/work/MLOps Zoomcamp/data/fhv_tripdata_2021-02.parquet")

## Downloading the data
Q1. Read the data for January. How many records are there?

In [54]:
len(jan_2021_trip_df)

1154112

## Computing duration
Q2. What's the average trip duration in January?

In [55]:
jan_2021_trip_df['duration'] = jan_2021_trip_df.dropOff_datetime - jan_2021_trip_df.pickup_datetime
jan_2021_trip_df['duration'] = jan_2021_trip_df['duration'].dt.total_seconds() / 60

In [56]:
jan_2021_trip_df['duration'].mean()

19.1672240937939

## Data preparation
Keep only the records where the duration was between 1 and 60 minutes (inclusive). How many records did you drop?

In [57]:
jan_2021_trip_df_filtered = jan_2021_trip_df[(jan_2021_trip_df['duration'] >= 1.0) & (jan_2021_trip_df['duration'] <= 60.0)]

In [58]:
len(jan_2021_trip_df) - len(jan_2021_trip_df_filtered)

44286

## Missing values
Q3. What's the fractions of missing values for the pickup location ID?

In [59]:
jan_2021_trip_df_filtered['PUlocationID'].isna().sum() / len(jan_2021_trip_df_filtered)

0.8352732770722617

In [60]:
locationIDs = ['PUlocationID', 'DOlocationID']
jan_2021_trip_df_filtered[locationIDs] = jan_2021_trip_df_filtered[locationIDs].fillna(-1).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_2021_trip_df_filtered[locationIDs] = jan_2021_trip_df_filtered[locationIDs].fillna(-1).astype('int')


## One-hot encoding
Q4. What's the dimension of the one-hot encoded matrix?

In [61]:
jan_2021_trip_df_filtered[locationIDs] = jan_2021_trip_df_filtered[locationIDs].astype('str')
train_dicts = jan_2021_trip_df_filtered[locationIDs].to_dict(orient='records')
train_dicts_dict_vectorizer = DictVectorizer()
X_train = train_dicts_dict_vectorizer.fit_transform(train_dicts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_2021_trip_df_filtered[locationIDs] = jan_2021_trip_df_filtered[locationIDs].astype('str')


## Training a model
Q5. Train a plain linear regressor on the data and calculate RMSE on train data.

In [62]:
y_train = jan_2021_trip_df_filtered['duration']

In [63]:
duration_predictor = LinearRegression()
duration_predictor.fit(X_train, y_train)

LinearRegression()

In [64]:
y_train_pred = duration_predictor.predict(X_train)

In [65]:
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
train_rmse

10.528519107212292

## Evaluating the model
Q6. What's the RMSE on validation?

In [66]:
feb_2021_trip_df['duration'] = feb_2021_trip_df.dropOff_datetime - feb_2021_trip_df.pickup_datetime
feb_2021_trip_df['duration'] = feb_2021_trip_df['duration'].dt.total_seconds() / 60
feb_2021_trip_df_filtered = feb_2021_trip_df[(feb_2021_trip_df['duration'] >= 1.0) & (feb_2021_trip_df['duration'] <= 60.0)]
feb_2021_trip_df_filtered[locationIDs] = feb_2021_trip_df_filtered[locationIDs].fillna(-1).astype('int')
feb_2021_trip_df_filtered[locationIDs] = feb_2021_trip_df_filtered[locationIDs].astype('str')
val_dicts = feb_2021_trip_df_filtered[locationIDs].to_dict(orient='records')
X_val = train_dicts_dict_vectorizer.transform(val_dicts)
y_val = feb_2021_trip_df_filtered['duration']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feb_2021_trip_df_filtered[locationIDs] = feb_2021_trip_df_filtered[locationIDs].fillna(-1).astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feb_2021_trip_df_filtered[locationIDs] = feb_2021_trip_df_filtered[locationIDs].astype('str')


In [67]:
y_val_pred = duration_predictor.predict(X_val)

In [68]:
val_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
val_rmse

11.014283211122269