In [1]:
!python --version

Python 3.10.4


In [2]:
import pandas as pd

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [4]:
from typing import List, Tuple, Optional

In [5]:
df_jan = pd.read_parquet("../data/fhv_tripdata_2021-01.parquet")
df_feb = pd.read_parquet("../data/fhv_tripdata_2021-02.parquet")

In [6]:
def get_closest(value: float, references: List[float]) -> float:
    closest_dist = float("inf")
    closest_value = float("inf")
    for ref in references:
        dist = abs(ref - value)
        if dist < closest_dist:
            closest_dist = dist
            closest_value = ref
    return closest_value

## Q1. Downloading the data

Read the data for January. How many records are there?

In [7]:
q1 = df_jan.size
q1

8078784

In [8]:
get_closest(q1, [1054112, 1154112, 1254112, 1354112])

1354112

## Q2. Computing duration

What's the average trip duration in January?

In [9]:
def get_duration(df: pd.DataFrame) -> pd.Series:
    duration = df.dropOff_datetime - df.pickup_datetime
    return duration.apply(lambda td: td.total_seconds() / 60)

df_jan["duration"] = get_duration(df_jan)
df_feb["duration"] = get_duration(df_feb)
q2 = df_jan["duration"].mean()
q2

19.167224093791006

In [10]:
get_closest(q2, [15.16, 19.16, 24.16, 29.16])

19.16

## Data preparation

In [11]:
def filter_outliers(df: pd.DataFrame) -> pd.DataFrame:
    return df[(df["duration"] >= 1) & (df["duration"] <= 60)]

df_jan = filter_outliers(df_jan)
df_feb = filter_outliers(df_feb)

## Q3. Missing values

What's the factions of missing values for the pickup location ID? (Or the fraction of "-1"s after you filled the NAs)

In [12]:
def get_missing_percent(df: pd.DataFrame, column: str) -> float:
    return df[column].isna().mean()

q3 = get_missing_percent(df_jan, "PUlocationID")
q3

0.8352732770722617

In [13]:
get_closest(q3, [0.53, 0.63, 0.73, 0.83])

0.83

## Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 

* Turn the dataframe into a list of dictionaries
* Fit a dictionary vectorizer 
* Get a feature matrix from it

What's the dimensionality of this matrix? (The number of columns)

In [14]:
def onehot(
    df: pd.DataFrame,
    columns: List[str],
    dv: Optional[DictVectorizer] = None,
) -> Tuple[pd.DataFrame, DictVectorizer]:
    df_dicts = df[columns].astype(str).to_dict(orient="records")
    if dv is None:
        dv = DictVectorizer()
        dv.fit(df_dicts)
    df_onehot = dv.transform(df_dicts)
    return df_onehot, dv

q4 = onehot(df_jan, ["PUlocationID", "DOlocationID"])[0].shape[1]
q4

525

In [15]:
get_closest(q4, [2, 152, 352, 525, 725])

525

## Q5. Training a model

Now let's use the feature matrix from the previous step to train a model. 

* Train a plain linear regression model with default parameters 
* Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [16]:
def preprocess(
    df: pd.DataFrame,
    dv: Optional[DictVectorizer] = None,
) -> Tuple[pd.DataFrame, pd.Series, DictVectorizer]:
    df_x, dv_ = onehot(df, columns=["PUlocationID", "DOlocationID"], dv=dv)
    return df_x, df["duration"], dv_

In [17]:
def train_lin_reg(
    df: pd.DataFrame,
) -> Tuple[LinearRegression, DictVectorizer, float]:
    X_train, y_train, dv = preprocess(df)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_train)
    rmse = mean_squared_error(y_train.values, y_pred, squared=False)

    return lr, dv, rmse

model, dv, rmse_train = train_lin_reg(df_jan)
q5 = rmse_train
q5

10.528519400124226

In [18]:
get_closest(q5, [5.52, 10.52, 15.52, 20.52])

10.52

## Q6. Evaluating the model

Now let's apply this model to the validation dataset. 

What's the RMSE on validation?

In [19]:
def eval_lin_reg(
    model: LinearRegression,
    dv: DictVectorizer,
    df: pd.DataFrame,
) -> float:
    X_val, y_val, _ = preprocess(df, dv)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val.values, y_pred, squared=False)

    return rmse

q6 = eval_lin_reg(model, dv, df_feb)
q6

11.014286484047036

In [20]:
get_closest(q5, [7.85, 12.85, 17.85, 22.85])

12.85