# 01-Intro Homework
NYC taxi data: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [1]:
!which python

/Users/mmukherjee/Documents/LearningAndDevelopment/DataTalksClub/mlops-zoomcamp/cohorts/2023/02-experiment-tracking/venv_02/bin/python


In [8]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

### 1. EDA

In [None]:
# Load the trip records from yellow_tripdata_2022-01.parquet, report the
# (rows, columns) shape, and display the first three rows as a quick sanity check.
df = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')
print(f"df.shape: {df.shape}")
df.head(3)

### 2. Calculate derived column 'Duration'

In [None]:
# Trip duration in minutes. The vectorized `.dt.total_seconds()` accessor
# replaces a row-by-row `.apply(lambda ...)`, which is far slower on a frame
# with millions of rows and yields the same values.
trip_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = trip_length.dt.total_seconds() / 60
# Find the standard deviation of the "duration"
print(f"Standard div of the 'duration' column: {round(df['duration'].std(),2)}")

### 3. One-hot encoding using DictVectorizer

In [None]:
# Columns treated as categorical features (pickup and dropoff location IDs).
cat_vars = ["PULocationID", "DOLocationID"]

In [None]:
# Preview the dtypes a string cast would produce; the result is not assigned,
# so `df` itself is left unchanged by this cell.
df[cat_vars].astype('str').dtypes

In [None]:
# Persist the string cast on the categorical columns of `df`.
df[cat_vars] = df[cat_vars].astype(str)

In [None]:
# Display every column's dtype to confirm the cast above took effect.
df.dtypes

## 4. Refactoring: Reusable pipeline

#### 4.1 Read the parquet file

In [5]:
def read_dataframe(filename):
    """Load a trip-record file and prepare it for trip-duration modelling.

    Reads a ``.csv`` or ``.parquet`` file, derives a ``duration`` column (in
    minutes) from the pickup/dropoff timestamps, keeps only trips lasting
    between 1 and 60 minutes inclusive, and casts the location-ID columns to
    strings. Row counts before and after filtering are printed.

    Args:
        filename: Path to a ``.csv`` or ``.parquet`` file. It must contain a
            matching ``tpep_*`` or ``lpep_*`` pair of ``*_pickup_datetime`` /
            ``*_dropoff_datetime`` columns, plus ``PULocationID`` and
            ``DOLocationID``.

    Returns:
        A new DataFrame holding the filtered rows, the added ``duration``
        column, and string-typed ``PULocationID`` / ``DOLocationID`` columns.

    Raises:
        ValueError: If the file extension is unsupported, or if no known
            pickup/dropoff datetime column pair is present.
    """
    print(f"Input filename: {filename}")
    is_csv = filename.endswith('.csv')
    if is_csv:
        df = pd.read_csv(filename)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed later with an unbound `df`.
        raise ValueError(
            f"Unsupported file type for {filename!r}; expected '.csv' or '.parquet'"
        )

    # Use whichever timestamp prefix the file actually has, so the columns that
    # get parsed are the same ones the duration is computed from.
    for prefix in ('tpep', 'lpep'):
        pickup_col = f"{prefix}_pickup_datetime"
        dropoff_col = f"{prefix}_dropoff_datetime"
        if pickup_col in df.columns and dropoff_col in df.columns:
            break
    else:
        raise ValueError(
            "Expected 'tpep_*' or 'lpep_*' pickup/dropoff datetime columns "
            f"in {filename!r}"
        )

    if is_csv:
        # CSV stores timestamps as text; parse them before doing arithmetic.
        df[pickup_col] = pd.to_datetime(df[pickup_col])
        df[dropoff_col] = pd.to_datetime(df[dropoff_col])

    # Vectorized conversion of the timedelta to minutes (no per-row lambda).
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    original_shape = df.shape
    print(f"before filtering shape: {original_shape}")

    # `.copy()` detaches the filtered rows from the source frame so the column
    # assignment below does not raise SettingWithCopyWarning.
    df = df[df.duration.between(1, 60)].copy()

    modified_shape = df.shape
    print(f"after filtering shape: {modified_shape}")

    # Guard against an empty input file (would otherwise divide by zero).
    if original_shape[0]:
        left_percentage = round(modified_shape[0]*100/original_shape[0],2)
    else:
        left_percentage = 0.0
    print(f"Percentage of records left in df after removing outliers: {left_percentage}%")

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)
    print()
    return df

In [9]:
# Prepare the training frame and then the validation frame with the shared
# loader, in that order, and report the resulting shapes.
train_path = './data/yellow_tripdata_2022-01.parquet'
val_path = './data/yellow_tripdata_2022-02.parquet'
df_train, df_val = (read_dataframe(path) for path in (train_path, val_path))

print(f"shape of df_train: {df_train.shape}")
print(f"shape of df_val. : {df_val.shape}")

Input filename: ./data/yellow_tripdata_2022-01.parquet
before filtering shape: (2463931, 20)
after filtering shape: (2421440, 20)
Percentage of records left in df after removing outliers: 98.28%

Input filename: ./data/yellow_tripdata_2022-02.parquet
before filtering shape: (2979431, 20)
after filtering shape: (2918187, 20)
Percentage of records left in df after removing outliers: 97.94%

shape of df_train: (2421440, 20)
shape of df_val. : (2918187, 20)


In [10]:
# Feature columns handed to the DictVectorizer (pickup / dropoff location IDs).
categorical = ["PULocationID", "DOLocationID"] 

In [11]:
# Turn the categorical columns of each frame into a list of per-row dicts,
# the input format DictVectorizer consumes.
train_dicts, validation_dicts = (
    frame[categorical].to_dict(orient="records")
    for frame in (df_train, df_val)
)

#### 4.2 Configure categorical features and output column

In [12]:
# Fit the vectorizer on the training records only, so the feature vocabulary
# is learned from training data, and build the training feature matrix.
dv = DictVectorizer()
# Model's feature matrix
X_train = dv.fit_transform(train_dicts)
X_train

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

In [13]:
# Encode the validation records with the already-fitted vectorizer
# (transform only, no refit) so the columns line up with X_train.
X_validation = dv.transform(validation_dicts)
X_validation

<2918187x515 sparse matrix of type '<class 'numpy.float64'>'
	with 5836368 stored elements in Compressed Sparse Row format>

In [14]:
# target variable 
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

#### 4.3 Build model

In [None]:
# Baseline model: linear regression on the one-hot encoded location IDs.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# Take the square root of the MSE explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases,
# while this form works on every version.
train_RMSE = round(mean_squared_error(y_train, y_pred) ** 0.5, 2)
print(f"Train RMSE: {train_RMSE}")

In [None]:
# Compare the distribution of predicted vs. actual durations on the training
# set. `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')`
# is its documented replacement. An explicit Axes keeps both layers together.
fig, ax = plt.subplots()
sns.histplot(y_pred, label='prediction', kde=True, stat='density', ax=ax)
sns.histplot(y_train, label='actual', kde=True, stat='density', ax=ax)

ax.set(xlabel='duration (minutes)', title='Training set: predicted vs. actual duration')
ax.legend();

In [None]:
# Score the fitted model on the held-out validation features.
y_val_pred = lr.predict(X_validation)

# Explicit square root instead of the deprecated/removed `squared=False`
# keyword of mean_squared_error.
validation_RMSE = round(mean_squared_error(y_val, y_val_pred) ** 0.5, 2)
print(f"Validation RMSE: {validation_RMSE}")

In [None]:
# Compare the distribution of predicted vs. actual durations on the validation
# set. `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')`
# is its documented replacement. An explicit Axes keeps both layers together.
fig, ax = plt.subplots()
sns.histplot(y_val_pred, label='prediction', kde=True, stat='density', ax=ax)
sns.histplot(y_val, label='actual', kde=True, stat='density', ax=ax)

ax.set(xlabel='duration (minutes)', title='Validation set: predicted vs. actual duration')
ax.legend();

### 5. Experiment Tracking using MLflow

In [19]:
!which mlflow

/Users/mmukherjee/Documents/LearningAndDevelopment/DataTalksClub/mlops-zoomcamp/cohorts/2023/02-experiment-tracking/venv_02/bin/mlflow


In [15]:
import mlflow

In [16]:
# Point MLflow at the SQLite database file `mlflow.db` (relative path) as the
# tracking backend for the runs logged below.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [17]:
# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment("yellow-taxi-exp")

<Experiment: artifact_location='/Users/mmukherjee/Documents/LearningAndDevelopment/DataTalksClub/mlops-zoomcamp/cohorts/2023/02-experiment-tracking/mlruns/1', creation_time=1684689271824, experiment_id='1', last_update_time=1684689271824, lifecycle_stage='active', name='yellow-taxi-exp', tags={}>

In [20]:
# Tracked run: linear regression baseline. Tags and params describe the run;
# the training RMSE is logged as its metric.
with mlflow.start_run():
    mlflow.set_tag("Developer", "Manas")
    mlflow.log_param("dataset", "'./data/yellow_tripdata_2022-01.parquet'")
    mlflow.log_param("model-algo", "LinearRegression")

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_train)

    # Explicit square root instead of the deprecated/removed `squared=False`
    # keyword of mean_squared_error.
    train_RMSE = round(mean_squared_error(y_train, y_pred) ** 0.5, 2)
    print(f"Train RMSE: {train_RMSE}")
    mlflow.log_metric("train_RMSE", train_RMSE)

Train RMSE: 6.99


In [21]:
# Tracked run: Lasso regression. Tags and params describe the run; the
# training RMSE is logged as its metric.
with mlflow.start_run():
    mlflow.set_tag("Developer", "Manas")
    mlflow.log_param("dataset", "'./data/yellow_tripdata_2022-01.parquet'")
    mlflow.log_param("model-algo", "Lasso")

    alpha = 0.01
    # Record the regularization strength so the run can be reproduced and
    # compared against runs that use other alpha values.
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_train)

    # Explicit square root instead of the deprecated/removed `squared=False`
    # keyword of mean_squared_error.
    train_RMSE = round(mean_squared_error(y_train, y_pred) ** 0.5, 2)
    print(f"Train RMSE: {train_RMSE}")
    mlflow.log_metric("train_RMSE", train_RMSE)

Train RMSE: 7.33
