# IMPORTS ===============

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error as rmse

# HELPERS ===============

# Homework

1. Number of columns for Jan 2023: `19` columns
2. Duration standard deviation for 2023/01: `42.59` minutes
3. Records left after dropout (1--60, inclusive): `~98%`
4. One-hot encoding dimensions for categorical data: `515`
5. RMSE on train: 7.65
6. RMSE on validation: 7.81

# CODE =================

In [2]:
df_jan = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
df_feb = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

df = df_jan.copy()

In [3]:
print(df.shape)
df.head()

(3066766, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [4]:
# Q2 - Duration std
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).apply(pd.Timedelta.total_seconds)/60
duration_mean, duration_std = df.duration.mean(), df.duration.std()
print(duration_mean, duration_std)

15.668995167330452 42.59435124195458


In [5]:
# Q3 - Dropping outliers (1--60, inclusive)
duration_keep_mask = (df.duration >= 1) & (df.duration <= 60)
records_left_fraction = sum(duration_keep_mask) / len(duration_keep_mask)
df = df[duration_keep_mask]

print(records_left_fraction)

0.9812202822125979


In [6]:
# Q4 - One-hot encoding
categorical = ['PULocationID', 'DOLocationID']
# numerical = ['trip_distance']
df[categorical] = df[categorical].astype(str)
train_dicts = df[categorical].to_dict(orient="records")

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

X_train.shape  # 515 for these categorical variables

(3009173, 515)

In [7]:
# Q5 - RMSE on train
y_train = df["duration"]
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)
rmse_train = rmse(y_train, y_pred)
print(rmse_train)

7.649261934850555


In [None]:
# Q6 - RMSE on validation

# Prepare validation data
df_valid = df_feb
df_valid["duration"] = (df_valid.tpep_dropoff_datetime - df_valid.tpep_pickup_datetime).apply(pd.Timedelta.total_seconds)/60
df_valid = df_valid[(df_valid.duration >=1) & (df_valid.duration <=60)]
df_valid[categorical] = df_valid[categorical].astype(str)
# Transform data
X_valid = dv.transform(df_valid[categorical].to_dict(orient="records"))
y_valid = df_valid.duration
# Predict and calculate RMSE
y_pred2 = lr.predict(X_valid)
rmse_valid = rmse(y_valid, y_pred2)
print(rmse_valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid[categorical] = df_valid[categorical].astype(str)


7.811817745843695


In [9]:
print(f"1. Comumns: {df.shape[1]:d}")
print(f"2. Duration std: {duration_std:.2f}")
print(f"3. Records left after outlier dropped: {records_left_fraction:.1%}")
print(f"4. One-hot encoding dimension of categorical: {X_train.shape[1]}")
print(f"5. RMSE on train: {rmse_train:.2f}")
print(f"6. RMSE on validation: {rmse_valid:.2f}")

1. Comumns: 20
2. Duration std: 42.59
3. Records left after outlier dropped: 98.1%
4. One-hot encoding dimension of categorical: 515
5. RMSE on train: 7.65
6. RMSE on validation: 7.81


# MISCELLANEOUS =========