In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

### Q1. Downloading the data

In [2]:
# January 2023 trips are the training set; February 2023 trips are held out
# for validation. Both files are read from the current working directory.
df_train = pd.read_parquet('yellow_tripdata_2023-01.parquet')
df_valid = pd.read_parquet('yellow_tripdata_2023-02.parquet')

In [3]:
# Preview the first five rows of the January frame to check its columns.
df_train.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [4]:
# How many columns are there in the data for January 2023?
# (Counted on the frame as loaded, before the derived `duration` column is added.)

len(df_train.columns)

19

### Q2. Computing duration

In [5]:
def process_duration(df):
    """Add a ``duration`` column holding each trip's length in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime``,
    converted from a timedelta to fractional minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
        datetime columns. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying a float ``duration`` column.
    """
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # Vectorised conversion: avoids invoking a Python lambda once per row,
    # which is slow on multi-million-row frames.
    df['duration'] = trip_length.dt.total_seconds() / 60
    return df

In [6]:
# Add the `duration` column (in minutes) to the January frame.
# process_duration modifies the frame it is given and returns that same frame.
df_train = process_duration(df_train)

In [7]:
# What's the standard deviation of the trip durations in January?
# Durations are in minutes; this is computed before any outliers are removed.

df_train['duration'].std()

np.float64(42.59435124195458)

### Q3. Dropping outliers

In [8]:
len_before = len(df_train)
print(f'Before removing outlier: {len_before} rows')
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# The explicit .copy() detaches the filtered frame from the original one, so
# later column assignments on df_train do not trigger SettingWithCopyWarning.
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)].copy()
print(f'After removing outlier: {len(df_train)} rows')
print(f'Fraction of the records left after you dropped the outliers: {len(df_train)/len_before}')

Before removing outlier: 3066766 rows
After removing outlier: 3009173 rows
Fraction of the records left after you dropped the outliers: 0.9812202822125979


### Q4. One-hot encoding

In [9]:
# Pickup and dropoff location IDs are the only model features.
categorical = ['PULocationID', 'DOLocationID']

# Cast the IDs to strings so DictVectorizer one-hot encodes them instead of
# passing them through as numeric values.
df_train[categorical] = df_train[categorical].astype(str)

# One {column: value} dict per trip, which is the input DictVectorizer expects.
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the vectorizer on the training data; `dv` is reused later to encode
# the validation data with the same columns.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)

# Regression target: trip duration in minutes.
target = 'duration'
y_train = df_train[target].values

In [10]:
# What's the dimensionality of this matrix (number of columns)?
# Each column is one distinct `feature=value` pair seen by the DictVectorizer.

x_train.shape[1]

515

### Q5. Training a model

In [11]:
# Fit a linear regression with default settings on the one-hot encoded
# location features to predict trip duration.
lr = LinearRegression()
lr.fit(x_train, y_train)

In [12]:
# What's the RMSE on train?
# Predictions are made on the same rows the model was fitted on, so this is
# the in-sample error (in minutes).

y_pred = lr.predict(x_train)
root_mean_squared_error(y_train, y_pred)

7.649261932106969

### Q6. Evaluating the model

In [13]:
# Apply the same preprocessing to the February data as was applied to January.
df_valid = process_duration(df_valid)
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# The explicit .copy() detaches the filtered frame from the original one, so
# the column assignment below does not trigger SettingWithCopyWarning.
df_valid = df_valid[(df_valid.duration >= 1) & (df_valid.duration <= 60)].copy()
df_valid[categorical] = df_valid[categorical].astype(str)

valid_dicts = df_valid[categorical].to_dict(orient='records')

# Transform validation data using the fitted DictVectorizer (do not fit again)
# so the columns line up with x_train; feature values not seen during fit are
# ignored by transform.
x_valid = dv.transform(valid_dicts)

y_valid = df_valid[target].values

In [14]:
# Sanity check: the validation matrix should have as many columns as x_train
# and exactly one target value per row.
x_valid.shape, y_valid.shape

((2855951, 515), (2855951,))

In [15]:
# What's the RMSE on validation?
# Note: this rebinds `y_pred`, replacing the training-set predictions.

y_pred = lr.predict(x_valid)
root_mean_squared_error(y_valid, y_pred)

7.811818743246608