Homework 1

january_trips source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-january_trips.page

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__, np.__version__

('2.2.3', '2.2.5')

In [3]:
january_trips = pd.read_parquet(
    '../data/yellow_tripdata_2023-01.parquet'
)
february_trips = pd.read_parquet(
    '../data/yellow_tripdata_2023-02.parquet'
)

In [4]:
january_trips.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [5]:
january_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [6]:
# Q1: how many columns are there
print(f"# rows = {january_trips.shape[0]} and # columns = {january_trips.shape[1]}")

# rows = 3066766 and # columns = 19


In [7]:
# Q2: computing duration in minutes
# what is the standard deviation of the trips duration in january
january_trips['tpep_pickup_datetime'] = pd.to_datetime(january_trips['tpep_pickup_datetime'])
january_trips['tpep_dropoff_datetime'] = pd.to_datetime(january_trips['tpep_dropoff_datetime'])
january_trips['duration'] = january_trips['tpep_dropoff_datetime'] - january_trips['tpep_pickup_datetime']
january_trips['duration'] = january_trips['duration'].dt.total_seconds().div(60.)

print(f"Standard deviation of trips duration in January : {january_trips.duration.std()}")
print(f"Average of trips duration in January : {january_trips.duration.mean()}")
print(f"Mode of trips duration in January : {january_trips.duration.mode().iloc[0]}")

Standard deviation of trips duration in January : 42.59435124195458
Average of trips duration in January : 15.668995167330452
Mode of trips duration in January : 7.0


In [8]:
# Q3: drop outliers (any trip not between 1 and 60 minutes inclusive)
filtered_january_trips = january_trips[
    (1 <= january_trips.duration) & (january_trips.duration <= 60.)
]
print(f"Fraction of records left after filtering : {filtered_january_trips.shape[0] / january_trips.shape[0]:.2f}")

Fraction of records left after filtering : 0.98


In [9]:
#Q4: one-hot encoding
display(filtered_january_trips[['PULocationID', 'DOLocationID']].head())
# check for NAs
print(f"# NA :\n{filtered_january_trips[['PULocationID', 'DOLocationID']].isna().sum()}")

Unnamed: 0,PULocationID,DOLocationID
0,161,141
1,43,237
2,48,238
3,138,7
4,107,79


# NA :
PULocationID    0
DOLocationID    0
dtype: int64


In [10]:
training_feats_dict = filtered_january_trips[['PULocationID', 'DOLocationID']].astype(str).to_dict('records')
training_feats_dict[:2]

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'}]

In [11]:
from sklearn.feature_extraction import DictVectorizer

In [12]:
vectorizer = DictVectorizer(sparse=False)

vectorizer.fit(training_feats_dict)
X = vectorizer.transform(training_feats_dict)
X[:2]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(2, 515))

In [13]:
print(f"# features = {X.shape[1]}")

# features = 515


In [14]:
# Q5: Training a model
# For computation issue, choose a subset of data randomly
# so that I can run it on my laptop
N = 100_000
np.random.seed(42)
train_indices = np.random.choice(range(X.shape[0]), N, replace=False)
train_indices[:5]

array([1475569,  183006, 2788577,  394364, 1123812])

In [15]:
# target is duration
X = X[train_indices]
y = filtered_january_trips['duration'].to_numpy()[train_indices]

In [16]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X, y)

In [17]:
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(
    y, model.predict(X)
)
print(f'RMSE on train = {rmse:.2f}')

RMSE on train = 7.61


In [18]:
# Q5: evaluating the model
# validation data: february 2023
# here again, I keep N rows
february_trips['duration'] = (
    pd.to_datetime(february_trips['tpep_dropoff_datetime']) - 
    pd.to_datetime(february_trips['tpep_pickup_datetime'])
).dt.total_seconds().div(60.)

february_trips = february_trips[
    (1 <= february_trips.duration) & (february_trips.duration <= 60.)
]

In [19]:
validation_features_dict = february_trips[['PULocationID', 'DOLocationID']].astype(str).to_dict('records')

In [20]:
print(f"Standard deviation of trips duration in February : {february_trips.duration.std()}")
print(f"Average of trips duration in February : {february_trips.duration.mean()}")
print(f"Mode of trips duration in February : {february_trips.duration.mode().iloc[0]}")

Standard deviation of trips duration in February : 10.064227216640147
Average of trips duration in February : 14.468110313050428
Mode of trips duration in February : 7.0


In [21]:
X_valid = vectorizer.transform(validation_features_dict)

In [22]:
# check dimension
print(f'Validation set dimension: {X_valid.shape}')
print(f'Does it match training feature dimension ? {X.shape[1] == X_valid.shape[1]}')

Validation set dimension: (2855951, 515)
Does it match training feature dimension ? True


In [23]:
X_valid[:2]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(2, 515))

In [24]:
valid_indices = np.random.choice(range(X_valid.shape[0]), N, replace=False)
X_valid = X_valid[valid_indices]
y_valid = february_trips.duration.to_numpy()[valid_indices]

In [25]:
valid_rmse = root_mean_squared_error(
    y_valid, model.predict(X_valid)
)
print(f'Validation set RMSE: {valid_rmse:.2f}')

Validation set RMSE: 7.86
