## Preparation

Import all necessary libraries

In [1]:
# data
import pandas as pd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# ml-related things
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# typing
import numpy as np
from typing import List, Tuple
from pandas.core.frame import DataFrame
from scipy.sparse.csr import csr_matrix

Data processing functions and pipeline:

In [25]:
def read_data(filename: str) -> DataFrame:
    """Load a trip-record parquet file and add a ``duration`` column.

    Args:
        filename: Path to a parquet file that contains the
            ``pickup_datetime`` and ``dropOff_datetime`` columns.

    Returns:
        The loaded frame with an extra ``duration`` column holding
        drop-off minus pickup time, expressed in minutes.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion; avoids one Python-level
    # lambda call per row on a frame with over a million records.
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60
    return df

def remove_outliers(df: DataFrame, t_min: int=1, t_max: int=60) -> DataFrame:
    """Keep only the rows whose ``duration`` lies within ``[t_min, t_max]``.

    Args:
        df: Frame with a numeric ``duration`` column (minutes).
        t_min: Smallest duration to keep (inclusive).
        t_max: Largest duration to keep (inclusive).

    Returns:
        The subset of ``df`` whose duration is inside the bounds. Rows with
        a missing duration are dropped, since they satisfy neither bound.
    """
    # Use the caller-supplied bounds; `between` is inclusive on both ends.
    within_bounds = df['duration'].between(t_min, t_max)
    return df[within_bounds]

def fill_na(df: DataFrame, columns: List[str]) -> DataFrame:
    """Return a copy of ``df`` with NaN replaced by -1 in the given columns.

    Args:
        df: Input frame; it is left untouched.
        columns: Names of the columns whose missing values are replaced.

    Returns:
        A new frame in which every listed column has NaN replaced by -1.
        All other columns are carried over unchanged.
    """
    # Work on a copy: the input is often a filtered slice of another frame,
    # and writing into it mutates the caller's data and triggers
    # SettingWithCopyWarning.
    filled = df.copy()
    for column in columns:
        filled[column] = filled[column].fillna(-1)
    return filled

def one_hot_encoding(df: DataFrame, columns: List[str], target: str='duration', mode: str='train', dv: "DictVectorizer"=None) -> Tuple[csr_matrix, np.ndarray, "DictVectorizer"]:
    """One-hot encode categorical columns and extract the target values.

    Args:
        df: Frame holding both the categorical columns and the target.
            It is not modified.
        columns: Categorical columns to encode; values are cast to ``str``
            before vectorising.
        target: Name of the column returned as ground truth.
        mode: ``'train'`` fits ``dv`` on the data and transforms it; any
            other value only transforms with the vectorizer passed in.
        dv: Vectorizer to use. When omitted in ``'train'`` mode a fresh
            ``DictVectorizer`` is created for this call.

    Returns:
        ``(X, y, dv)``: the encoded feature matrix, the target values and
        the vectorizer that produced ``X``.

    Raises:
        ValueError: If ``mode`` is not ``'train'`` and no ``dv`` is given.
    """
    if dv is None:
        if mode != 'train':
            raise ValueError(
                "a fitted DictVectorizer must be passed as `dv` when mode is not 'train'"
            )
        # Create the vectorizer per call: a default built in the signature
        # would be one shared instance whose fitted state leaks across calls.
        dv = DictVectorizer()

    # Cast a local selection so the caller's frame keeps its original dtypes.
    rec_dicts = df[columns].astype(str).to_dict(orient='records')

    if mode == 'train':
        # Learn the feature name -> index mapping and transform in one pass.
        X = dv.fit_transform(rec_dicts)
    else:
        # Reuse the mapping learned during training.
        X = dv.transform(rec_dicts)

    y = df[target].values
    return X, y, dv

def data_process(filename: str, mode: str, dv: DictVectorizer=None) -> Tuple[csr_matrix, np.array]:
    """Run the complete preprocessing pipeline on one parquet file.

    Steps: read the file and compute ``duration``, drop duration outliers,
    fill missing pickup/drop-off location IDs with -1, then one-hot encode
    those two columns.

    Args:
        filename: Path of the parquet file to process.
        mode: ``'train'`` fits ``dv`` on this file; any other value only
            transforms with the vectorizer passed in.
        dv: Vectorizer to fit or reuse. Only ``(X, y)`` is returned, so pass
            your own instance in ``'train'`` mode if you need the fitted
            vectorizer afterwards.

    Returns:
        ``(X, y)``: the encoded feature matrix and the duration targets.

    Raises:
        ValueError: If ``mode`` is not ``'train'`` and no ``dv`` is given.
    """
    if dv is None:
        if mode != 'train':
            raise ValueError(
                "a fitted DictVectorizer must be passed as `dv` when mode is not 'train'"
            )
        # Fresh instance per call instead of one shared signature default.
        dv = DictVectorizer()

    df = read_data(filename)
    df = remove_outliers(df)

    categorical = ['PUlocationID', 'DOlocationID']
    df = fill_na(df, categorical)
    X, y, _ = one_hot_encoding(df, categorical, 'duration', mode, dv)
    return X, y

ML model functions:

In [31]:
def train_LR(X: csr_matrix, y: np.array) -> LinearRegression:
    """Fit a linear regression with default parameters.

    Args:
        X: Feature matrix.
        y: Target values, one per row of ``X``.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # `fit` returns the estimator itself, so construct and fit in one step.
    return LinearRegression().fit(X, y)

def predict_LR(lr: LinearRegression, X: csr_matrix, y: np.array) -> float:
    """Predict with a fitted model and score the predictions by RMSE.

    Args:
        lr: Fitted linear regression model.
        X: Feature matrix to predict on.
        y: Ground-truth targets for ``X``.

    Returns:
        Root mean squared error between ``y`` and the predictions,
        rounded to two decimals.
    """
    y_pred = lr.predict(X)

    # The `squared=False` keyword of mean_squared_error was deprecated and
    # later removed from scikit-learn; taking the square root of the MSE
    # gives the same RMSE for a single target and works on every version.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return round(rmse, 2)

## Q1. Downloading the data

We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
but instead of "Green Taxi Trip Records", we'll use "For-Hire Vehicle Trip Records".

Download the data for January and February 2021.

Note that you need "For-Hire Vehicle Trip Records", not "High Volume For-Hire Vehicle Trip Records".

Read the data for January. **How many records are there?**

In [4]:
# January 2021 FHV trips, including the derived `duration` column
df = read_data('./data/fhv_tripdata_2021-01.parquet')

# Row count before any filtering; reused later to count dropped outliers
N_full = df.shape[0]

print(f'Total number of rows is {N_full}')

Total number of rows is 1154112


## Q2. Computing duration

Now let's compute the `duration` variable. It should contain the duration of a ride in minutes.

**What's the average trip duration in January?**

In [5]:
# Mean over every January trip, i.e. before outliers are removed
mean_duration = round(df['duration'].mean(), 2)
print(f'Average trip duration is {mean_duration}')

Average trip duration is 19.17


## Data preparation

Check the distribution of the duration variable. There are some outliers.

Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

**How many records did you drop?**

In [6]:
# Keep trips lasting between 1 and 60 minutes (the function defaults)
df = remove_outliers(df)

N_select = df.shape[0]
n_dropped = N_full - N_select

print(f'We removed {n_dropped} data outliers')

We removed 44286 data outliers


## Q3. Missing values

The features we'll use for our model are the pickup and dropoff location IDs.

But these columns have a lot of missing values. Let's replace them with "-1".

**What's the fraction of missing values for the pickup location ID?**

In [7]:
categorical = ['PUlocationID', 'DOlocationID']

df = fill_na(df, categorical)

# Percentage of rows carrying the -1 sentinel written by fill_na.
# Counting the sentinel directly does not depend on it being the most
# frequent value, unlike reading the first entry of value_counts().
rate_nan = round(df['PUlocationID'].eq(-1).mean() * 100., 2)
print(f'The fractions of missing values for the pickup location ID is {rate_nan}')

The fractions of missing values for the pickup location ID is 83.53


## Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

* Turn the dataframe into a list of dictionaries
* Fit a dictionary vectorizer
* Get a feature matrix from it

**What's the dimensionality of this matrix?** (The number of columns).

In [8]:
# Fit the vectorizer on January and keep it for the validation month
X_train, y_train, dv = one_hot_encoding(df, categorical, target='duration', mode='train')

feature_shape = X_train.shape
print(f'The dimensionality of the feature matrix is {feature_shape}')
print(f'Number of columns is {feature_shape[1]}')

The dimensionality of the feature matrix is (1109826, 525)
Number of columns is 525


## Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

* Train a plain linear regression model with default parameters
* Calculate the RMSE of the model on the training data

**What's the RMSE on train?**

In [10]:
# Fit on the January features, then score on the same data (train RMSE)
model = train_LR(X_train, y_train)
metric = predict_LR(lr=model, X=X_train, y=y_train)

print(f'RMSE on train is {metric}')

RMSE on train is 10.53


## Q6. Evaluating the model

Now let's apply this model to the validation dataset (Feb 2021).

**What's the RMSE on validation?**

In [26]:
# February 2021 is the validation month: reuse the vectorizer fitted on
# January so both feature matrices share the same columns.
X_test, y_test = data_process('./data/fhv_tripdata_2021-02.parquet', 'test', dv)

metric = predict_LR(model, X_test, y_test)

# This score is for the validation set, not the training set.
print(f'RMSE on validation is {metric}')


