In [2]:
# Upload the January and February 2022 yellow-taxi parquet files via Colab.
# NOTE(review): the recorded output shows the uploads being saved with a " (1)"
# suffix, while later cells read the unsuffixed file names — presumably copies
# from an earlier upload were already present; confirm on a fresh runtime.
from google.colab import files
uploaded = files.upload()

Saving yellow_tripdata_2022-01.parquet to yellow_tripdata_2022-01 (1).parquet
Saving yellow_tripdata_2022-02.parquet to yellow_tripdata_2022-02 (1).parquet


In [3]:
!pip install pyarrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [5]:
# Read the two monthly trip files: January is used for training below,
# February for evaluation.
df_january, df_february = map(
    pd.read_parquet,
    ('yellow_tripdata_2022-01.parquet', 'yellow_tripdata_2022-02.parquet'),
)

In [6]:
# Q1
# The second entry of .shape is the number of columns.
n_columns = df_january.shape[1]
print(f"Number of cols for January Data: {n_columns}")

Number of cols for January Data: 19


In [7]:
# Inspect the column names and dtypes of the January data.
df_january.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463931 entries, 0 to 2463930
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [8]:
# Q2
# Trip duration in minutes (drop-off minus pick-up), stored as a new column
# on the January frame.
df_january['duration'] = (
    (df_january.tpep_dropoff_datetime - df_january.tpep_pickup_datetime)
    .dt.total_seconds() / 60
)
std_dev = df_january['duration'].std()
print(f"Standard deviation of the trips duration in January: {round(std_dev, 2)}")

Standard deviation of the trips duration in January: 46.45


In [9]:
# Q3
# Keep only trips lasting between 1 and 60 minutes (inclusive); everything
# else is treated as an outlier.
# Filter first and copy afterwards: copying the full frame before filtering
# duplicates every row only to throw part of them away.
is_valid_duration = (df_january.duration >= 1) & (df_january.duration <= 60)
filtered = df_january[is_valid_duration].copy()
original_rows = df_january.shape[0]
filtered_rows = filtered.shape[0]

removed_rows = original_rows - filtered_rows
fraction_removed = removed_rows / original_rows * 100
# round() instead of int(): int() truncates, so e.g. 97.9% would be reported as 97%.
print(f"Fraction of the records left after you dropped the outliers: {round(100 - fraction_removed)}%")

Fraction of the records left after you dropped the outliers: 98%


In [10]:
# Q4

# Work on a copy of the outlier-free January data.
df = filtered.copy()
# Pick-up / drop-off location IDs are the categorical features: missing values
# become -1, then every ID is cast to int and finally to str.
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')

In [11]:
# One dict per trip (column names as keys), holding only the categorical features.
train_dicts = df.loc[:, categorical].to_dict(orient='records')

In [12]:
# Sanity check: number of training records.
len(train_dicts)

2421440

In [13]:
# Create the dictionary vectorizer (it is only instantiated here; fitting
# happens in a later cell via fit_transform).
dv = DictVectorizer()

In [14]:
# Fit the vectorizer on the training dicts and build the feature matrix
# used to train the model.
X_train = dv.fit_transform(train_dicts) 
# (rows, columns) of the training feature matrix
print(X_train.shape)

(2421440, 515)


In [15]:
# Number of features learned by the fitted vectorizer.
n_features = len(dv.feature_names_)
print(f"Dimensionality of this matrix (number of columns): {n_features}")

Dimensionality of this matrix (number of columns): 515


In [16]:
# Q5
# Training target: trip duration in minutes, as a plain array.
y_train = df['duration'].values

In [17]:
# Fit a linear regression on the training feature matrix, with trip duration
# as the target.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [18]:
# Evaluate the model on the data it was trained on.
y_pred = lr.predict(X_train)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so this form works on both old and new versions.
# Note: despite its name, `mse` holds the *root* mean squared error.
mse = mean_squared_error(y_train, y_pred) ** 0.5
print(f"RMSE on train: {round(mse, 2)}")

RMSE on train: 6.99


In [24]:
def process_data(df, categorical=('PULocationID', 'DOLocationID')):
    """Prepare a raw trip DataFrame for modelling without modifying the input.

    Steps:
      1. Compute the trip duration in minutes from the pick-up and drop-off
         timestamps.
      2. Keep only trips whose duration is between 1 and 60 minutes (inclusive).
      3. In the categorical columns, replace missing values with -1 and cast
         the values to int and then to str.

    Args:
        df: DataFrame with ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` datetime columns plus the categorical
            columns. It is left unchanged.
        categorical: Names of the columns to convert to strings. Defaults to
            the pick-up and drop-off location ID columns.

    Returns:
        A new DataFrame containing the retained rows, a ``duration`` column
        (minutes) and string-typed categorical columns.
    """
    # Work on a local Series so the caller's frame never gains a 'duration' column.
    duration = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
    keep = (duration >= 1) & (duration <= 60)

    # Copy only the surviving rows; the copy is independent of the input.
    out = df[keep].copy()
    out['duration'] = duration[keep]

    # A list is required for multi-column selection (a tuple would be read as one key).
    cols = list(categorical)
    out[cols] = out[cols].fillna(-1).astype('int').astype('str')
    return out

In [26]:
# Apply the same preprocessing to the February data and score the
# January-trained model on it.
test_df = df_february.copy()  # copy so the raw February frame stays untouched
df_val = process_data(test_df)
val_dicts = df_val[categorical].to_dict(orient='records')
# transform only (no refit): reuse the features learned from the training data
X_val = dv.transform(val_dicts)
y_pred = lr.predict(X_val)
y_val = df_val.duration.values
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
# Note: despite its name, `mse_test` holds the *root* mean squared error.
mse_test = mean_squared_error(y_val, y_pred) ** 0.5

In [27]:
# Report the RMSE obtained on the February data.
print(f"RMSE on test: {round(mse_test, 2)}")

RMSE on test: 7.79
