In [1]:
import pandas as pd
import mlflow

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

import xgboost

In [2]:
jan_df = pd.read_parquet('data/yellow_tripdata_2022-01.parquet')
feb_df = pd.read_parquet('data/yellow_tripdata_2022-02.parquet')
mar_df = pd.read_parquet('data/yellow_tripdata_2022-03.parquet')

In [2]:
dv = DictVectorizer()
    
def preprocess(data: str, fit=False, dv=dv):
    df = pd.read_parquet(data)
    
    df['day'] = df.tpep_pickup_datetime.dt.day
    df['hour'] = df.tpep_pickup_datetime.dt.hour
    df['minute'] = df.tpep_pickup_datetime.dt.minute
    df['second'] = df.tpep_pickup_datetime.dt.second
    
    cols = ['trip_distance', 'PULocationID', 'DOLocationID', 'day', 'hour', 'minute', 'second']

    col_df= df[cols].astype('str')

    X_train = col_df.to_dict(orient='records')
    
    if fit:
        X_train = dv.fit_transform(X_train)
    else:
        X_train = dv.transform(X_train)

    Y_train = df.fare_amount.values
    
    return X_train, Y_train

In [3]:
# mlflow.end_run()

In [4]:
X_train, y_train = preprocess('data/yellow_tripdata_2022-01.parquet', fit=True)
# X_test, y_test = preprocess('data/yellow_tripdata_2022-02.parquet')
# X_val, y_val = preprocess('data/yellow_tripdata_2022-03.parquet')

In [8]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('nyc-taxi-experiment2')

with mlflow.start_run():
    
    mlflow.set_tag('Developer', 'Moses')
    print('Logging Started...')

    lr = LinearRegression(n_jobs=-1)
    print('LR Initiated')

    lr.fit(X_train, y_train)
    print('LR fitted')


    y_pred = lr.predict(X_train)

    rmse = mean_squared_error(y_train, y_pred, squared=False)
    mlflow.log_metric('Rmse', rmse)

Logging Started...
LR Initiated
LR fitted


In [10]:
from mlflow.tracking import MlflowClient

In [11]:
client = MlflowClient('http://127.0.0.1:5000')

In [15]:
client.get_registered_model(name='nyc-taxi-experiment2')

RestException: RESOURCE_DOES_NOT_EXIST: Registered Model with name=nyc-taxi-experiment2 not found

# XGB Experiment

In [None]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('nyc-taxi-experiment1')

mlflow.sklearn.autolog()
mlflow.set_tag('Developer', 'Moses')
print('Logging Started...')

xgb = xgboost.XGBRegressor()
print('XGB Initiated')
xgb.fit(X_train, y_train)
print('XGB fitted')

y_pred = xgb.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
# mlflow.log_metric('Rmse', rmse)

In [12]:
lr = LinearRegression()
lr.fit(X_train, Y_train)

In [6]:
xgb = xgboost.XGBRegressor()

In [8]:
xgb.fit(X_train, y_train)

In [17]:
Y_pred = xgb.predict(X_train)
rmse = mean_squared_error(Y_train, Y_pred, squared=False)
print(rmse)

23.3864374381039


In [24]:
Y_pred = xgb.predict(X_train)
rmse = mean_squared_error(Y_train, Y_pred, squared=False)
print(rmse)

126.35581662211472


In [10]:
Y_pred = xgb.predict(X_test)
rmse = mean_squared_error(y_test, Y_pred, squared=False)
print(rmse)

510.79901246982456


In [None]:
rf = RandomForestRegressor()
print('Instanntiated forest regressor')
rf.fit(X_train, y_train)

Instanntiated forest regressor


In [None]:
y_pred = rf.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(rmse)