In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw January 2023 yellow-taxi trip records.
df = pd.read_parquet(path='./data/yellow_tripdata_2023-01.parquet')

## Read the data for January. How many columns are there?

In [3]:
# Number of columns in the raw frame.
len(df.columns)

19

## Now let's compute the duration variable.

In [4]:
# Trip duration as a timedelta: drop-off time minus pick-up time.
pickup_time = df['tpep_pickup_datetime']
dropoff_time = df['tpep_dropoff_datetime']
df['duration'] = dropoff_time - pickup_time

In [5]:
# Convert the timedelta column to minutes. The vectorized `.dt` accessor
# replaces a Python-level lambda call per row, which is slow on millions of rows.
df['duration'] = df['duration'].dt.total_seconds() / 60

## Standard deviation of the trip durations in January

In [6]:
# Sample standard deviation (pandas default, ddof=1) of the duration in minutes.
df['duration'].std(ddof=1)

42.594351241920904

## What fraction of the records is left after dropping the outliers (shown as a percentage)?

In [7]:
# Share of trips lasting between 1 and 60 minutes (inclusive), as a percentage.
within_bounds = df['duration'].between(1, 60)
within_bounds.sum() / len(df) * 100

98.1220282212598

In [8]:
# Keep only trips lasting 1 to 60 minutes (inclusive).
# .copy() detaches the result from the original frame, so later column
# assignments write to a real frame and not a slice (no SettingWithCopyWarning).
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

## Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- ### Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
- ### Fit a dictionary vectorizer
- ### Get a feature matrix from it
## What's the dimensionality of this matrix (number of columns)?



In [9]:
# Location IDs are identifiers, not quantities: cast them to strings so the
# vectorizer one-hot encodes them and does not treat them as numeric values.
categorical = ['PULocationID', 'DOLocationID']
as_text = df[categorical].astype(str)
df[categorical] = as_text

In [10]:
# One dict per trip, e.g. {'PULocationID': ..., 'DOLocationID': ...}.
location_columns = df[categorical]
train_dict = location_columns.to_dict(orient='records')

## X_train

In [11]:
# Learn the feature vocabulary from the training records and encode them in one pass.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(X=train_dict)
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

## y_train

In [12]:
# Target vector: trip duration in minutes.
y_train = df['duration'].to_numpy()
y_train

array([ 8.43333333,  6.31666667, 12.75      , ..., 24.51666667,
       13.        , 14.4       ])

## Now let's use the feature matrix from the previous step to train a model.

- ### Train a plain linear regression model with default parameters, where duration is the response variable
- ### Calculate the RMSE of the model on the training data
## What's the RMSE on train?

In [13]:
# Plain least-squares regression with default parameters; fit() returns the estimator.
lr = LinearRegression().fit(X_train, y_train)
lr

In [14]:
# In-sample predictions, used for the training RMSE.
y_pred_train = lr.predict(X=X_train)

In [15]:
# RMSE on the training data. The square root is taken explicitly because
# mean_squared_error's `squared=False` flag is deprecated in newer
# scikit-learn releases; this form works on old and new versions alike.
mean_squared_error(y_train, y_pred_train) ** 0.5

7.649261930819891

## Now let's apply this model to the validation dataset (February 2023).

## What's the RMSE on validation?

In [16]:
categorical = ['PULocationID', 'DOLocationID']
def prepare_df(file_name, min_duration=1, max_duration=60):
    """Load a trip parquet file and prepare it for modelling.

    Adds a 'duration' column (minutes between pick-up and drop-off), keeps
    only trips whose duration lies in [min_duration, max_duration], and casts
    the `categorical` columns to strings.

    Parameters
    ----------
    file_name : str or path-like
        Parquet file to read; it must contain 'tpep_pickup_datetime',
        'tpep_dropoff_datetime' and the `categorical` columns.
    min_duration, max_duration : numeric, default 1 and 60
        Inclusive duration bounds, in minutes.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the added 'duration' column.
    """
    df = pd.read_parquet(file_name)
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # Vectorized conversion to minutes (no per-row Python lambda).
    df['duration'] = elapsed.dt.total_seconds() / 60
    keep = (df['duration'] >= min_duration) & (df['duration'] <= max_duration)
    # .copy() so the assignment below targets a real frame, not a slice.
    df = df[keep].copy()
    df[categorical] = df[categorical].astype(str)
    return df

In [17]:
def get_X_and_y(df):
    dv = DictVectorizer()
    cat_dict = df[categorical].to_dict(orient='records')
    X_ = dv.fit_transform(cat_dict)
    y_ = df['duration'].values
    return X_, y_ 

In [18]:
train_df = prepare_df('./data/yellow_tripdata_2023-01.parquet')


## The Feb dataset has only 514 categorical features ('PULocationID', 'DOLocationID') as a result. So we need to make sure that we train our model with the same amount of features or we will see an error. Let's remove one PULocationID with the lowest accurance

In [26]:
train_df = train_df.drop(train_df[train_df['PULocationID'] == '42'].index)

In [27]:
X_train, y_train = get_X_and_y(train_df)

In [23]:
valid_df = prepare_df('./data/yellow_tripdata_2023-02.parquet')
X_valid, y_valid = get_X_and_y(valid_df)

In [29]:
# Fit on the training matrix, then predict durations for the validation trips.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X=X_valid)

In [30]:
# RMSE on the validation data. The square root is taken explicitly because
# mean_squared_error's `squared=False` flag is deprecated in newer
# scikit-learn releases; this form works on old and new versions alike.
mean_squared_error(y_valid, y_pred) ** 0.5

18.164762054860947