In [48]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [10]:
# Read the January (training) and February (validation) trip files in one go.
df_jan, df_feb = map(
    pd.read_parquet,
    (
        '/content/yellow_tripdata_2023-01.parquet',
        '/content/yellow_tripdata_2023-02.parquet',
    ),
)

## Q1. Downloading the data
###  How many columns are there?

In [11]:
# (rows, columns) of the raw January frame; the second entry answers Q1.
df_jan.shape

(3066766, 19)

There are 3,066,766 rows (records) and 19 columns.

## Q2. Computing duration
### What's the standard deviation of the trip durations in January?

In [17]:
# Trip duration in minutes: drop-off minus pick-up, converted from seconds.
df_jan['duration'] = (
    df_jan['tpep_dropoff_datetime']
    .sub(df_jan['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [19]:
# Standard deviation of the duration column (minutes), computed before any outlier filtering.
df_jan['duration'].std()

42.594351241920904

## Q3. Dropping outliers
### What fraction of the records is left after you drop the outliers?

In [20]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df_jan_withou_outliers = df_jan[df_jan['duration'].between(1, 60)]

In [22]:
# Share of January rows that survive the duration filter, shown as a percentage (ratio * 100).
print(f'The fraction of the records left after you dropped the outliers : {len(df_jan_withou_outliers) / len(df_jan) * 100:.2f}%')

The fraction of the records left after you dropped the outliers : 98.12%


## Q4. One-hot encoding
### What's the dimensionality of this matrix (number of columns)?

In [23]:
# List the available columns before choosing the model features.
df_jan_withou_outliers.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [24]:
# Only the pickup and dropoff location IDs are used as model features.
categorical_features = ['PULocationID', 'DOLocationID']
# Kept as a one-element list so it can be concatenated with the feature list
# when selecting columns.
numerical_target = ['duration']

In [27]:
# Take an explicit copy of the feature/target columns: the source frame is a
# filtered slice, and assigning into a frame derived from it is the chained
# assignment pattern that pandas flags with SettingWithCopyWarning.
model_data = df_jan_withou_outliers.loc[:, categorical_features + numerical_target].copy()

# Cast the location IDs to strings so DictVectorizer one-hot encodes them.
# Numeric values would be accepted too, but each would become a single
# numeric feature instead of one indicator column per location.
model_data[categorical_features] = model_data[categorical_features].astype(str)

In [30]:
# One dict per trip ({column name: value}), the input format DictVectorizer consumes.
train_dicts = model_data.loc[:, categorical_features].to_dict(orient='records')

In [33]:
# Fit the vectorizer on the training records and encode them in one step.
# `dv` is reused later to encode the validation records with the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [45]:
# The number of learned feature names is the number of columns in the encoded matrix.
print(f'dimensionality of this matrix (number of columns) after applying One-hot encoding : {len(dv.feature_names_)}')

dimensionality of this matrix (number of columns) after applying One-hot encoding : 515


## Q5. Training a model
### What's the RMSE on train?

In [47]:
# `numerical_target` is a list, so this selection is a single-column DataFrame
# (2-D target), not a Series.
y_train = model_data[numerical_target]
# Fit a plain linear regression on the one-hot encoded location features.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [49]:
# Score the fitted model on the same data it was trained on.
train_predictions = lr_model.predict(X_train)
train_RMSE = root_mean_squared_error(y_train, train_predictions)
print(f'RMSE on train : {train_RMSE:.2f}')

RMSE on train : 7.65


## Q6. Evaluating the model
### What's the RMSE on validation?

In [55]:
# Preprocess the February (validation) data with the same steps used for January.
# The February frame is loaded from a yellow-taxi file, so its timestamp columns
# use the `tpep_` prefix, as in the January frame; indexing it with `lpep_`
# names raises a KeyError.
df_feb['duration'] = (df_feb['tpep_dropoff_datetime'] - df_feb['tpep_pickup_datetime']).dt.total_seconds() / 60

# Remove the outliers: keep trips lasting between 1 and 60 minutes inclusive.
df_feb_without_outliers = df_feb[(df_feb['duration'] >= 1) & (df_feb['duration'] <= 60)]

# Keep only the pickup/dropoff location IDs and the target. The explicit copy
# avoids assigning into a frame derived from a filtered slice
# (SettingWithCopyWarning).
validation_data = df_feb_without_outliers.loc[:, categorical_features + numerical_target].copy()

# Cast the location IDs to strings so the vectorizer treats them as categories
# (one indicator column per location) exactly as it did for the training data.
validation_data[categorical_features] = validation_data[categorical_features].astype(str)

# Encode with the vectorizer already fitted on the training data (transform only,
# no refit) so the validation matrix has the same columns as X_train.
val_dicts = validation_data[categorical_features].to_dict(orient='records')
X_val = dv.transform(val_dicts)
# Single-column DataFrame, mirroring how the training target is selected.
y_val = validation_data[numerical_target]

In [56]:
# Score the January-trained model on the held-out February trips.
val_predictions = lr_model.predict(X_val)
val_RMSE = root_mean_squared_error(y_val, val_predictions)
print(f'RMSE on validation : {val_RMSE:.2f}')

RMSE on validation : 10.35
