In [1]:

#!pip uninstall numpy -y     # uninstall existing numpy
#!pip install "numpy<2.0"

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.feature_extraction import DictVectorizer
import pickle

In [2]:
#!pip install mlflow

import mlflow
# Send every run logged below to the MLflow tracking server at this address;
# a server must already be listening there for the logging calls to succeed.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Make this the active experiment for all subsequent runs in the notebook.
mlflow.set_experiment("NYCtaxi-ride-duration")




<Experiment: artifact_location='mlflow-artifacts:/965139444109075764', creation_time=1727092910601, experiment_id='965139444109075764', last_update_time=1727092910601, lifecycle_stage='active', name='NYCtaxi-ride-duration', tags={}>

In [3]:
def read_dataframe(filename):
    """Load a green-taxi trip file and return the columns used for modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file containing at least the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``trip_distance``, ``PULocationID``, ``DOLocationID`` and
        ``total_amount``.

    Returns
    -------
    pandas.DataFrame
        One row per trip whose duration is in the interval (0, 60] minutes,
        with the columns ``trip_distance``, ``pickup_hour``, ``pickup_day``,
        ``PULocationID``, ``DOLocationID``, ``total_amount`` and ``duration``
        (in that order, target last).

    Raises
    ------
    ValueError
        If ``filename`` ends in neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            f"Unsupported file type (expected .csv or .parquet): {filename!r}"
        )

    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    # Trip duration in minutes.
    df['duration'] = (df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']).dt.total_seconds() / 60.0

    # Keep trips lasting more than 0 and at most 60 minutes. The explicit
    # copy makes the column assignments below write to an independent frame
    # rather than to a slice of the unfiltered one.
    df = df[(df.duration > 0) & (df.duration <= 60)].copy()

    df['pickup_hour'] = df['lpep_pickup_datetime'].dt.hour
    df['pickup_day'] = df['lpep_pickup_datetime'].dt.dayofweek

    # 'duration' stays last: downstream cells split features from the target
    # by position (everything but the final column).
    features = ['trip_distance', 'pickup_hour', 'pickup_day', 'PULocationID', 'DOLocationID', 'total_amount', 'duration']

    return df[features]
    
    

In [4]:
# January 2021 trips are the training set; February 2021 trips are held out for validation.
df_train = read_dataframe(filename='/workspaces/MLOps-TaxiTrpDuration/green_tripdata_2021-01.csv')
df_val = read_dataframe(filename='/workspaces/MLOps-TaxiTrpDuration/green_tripdata_2021-02.csv')

  df = pd.read_csv(filename)


In [5]:
# Features are every column except the last one; the target is the 'duration' column.
X_train, y_train = df_train.iloc[:, :-1], df_train['duration']
X_val, y_val = df_val.iloc[:, :-1], df_val['duration']

In [6]:
### training data

# DictVectorizer consumes one {feature: value} dict per row.
X_dict = X_train.to_dict(orient='records')

# Learn the feature -> column mapping from the training rows only.
dict_vectorizer = DictVectorizer(sparse=False)
X_encoded_dict = dict_vectorizer.fit_transform(X_dict)

# `dv` is the name later cells pickle as the preprocessor. Point it at the
# fitted vectorizer so the saved object can actually transform new data
# (a freshly constructed DictVectorizer has learned no mapping).
dv = dict_vectorizer


### Validation data

X_val_dict = X_val.to_dict(orient='records')

# transform, not fit_transform: validation rows must be encoded with the
# mapping learned on the training set so the columns line up with
# X_encoded_dict and the vectorizer is not refitted on held-out data.
X_val_encoded_dict = dv.transform(X_val_dict)



In [8]:
# Fit a linear-regression baseline on the vectorised training features.
lr_model = LinearRegression()
lr_model.fit(X_encoded_dict, y_train)

# Score the baseline on the validation set.
y_pred_lr = lr_model.predict(X_val_encoded_dict)

# RMSE = sqrt(MSE). The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_val, y_pred_lr)))



In [9]:
# Display the validation RMSE of the linear-regression baseline.
rmse

7.701738647933279

In [10]:
# Persist the vectorizer and the linear model together as a single pickled tuple.
with open('/workspaces/MLOps-TaxiTrpDuration/models/lin_reg.bin', 'wb') as model_file:
    pickle.dump((dv, lr_model), model_file)

In [9]:
# Record the linear-regression baseline as its own MLflow run.
with mlflow.start_run(run_name="linear_regression"):
    # Tag key without the stray trailing colon.
    mlflow.set_tag("developer", "nithin")
    mlflow.log_param("train-data", "green_tripdata_2021-01.csv")
    mlflow.log_param("val-data", "green_tripdata_2021-02.csv")

    # `rmse` holds a root-mean-squared error, so log it under "rmse" -- the
    # same metric key the XGBoost runs in this notebook use -- instead of
    # mislabelling it as a mean squared error.
    mlflow.log_metric("rmse", rmse)

    # Attach the pickled (vectorizer, model) file to the run.
    mlflow.log_artifact(local_path="/workspaces/MLOps-TaxiTrpDuration/models/lin_reg.bin", artifact_path="models_pickle")

2024/10/15 07:47:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run linear_regression at: http://127.0.0.1:5000/#/experiments/965139444109075764/runs/623ebf10250a4eb5bb90d49a9ca94c2e.
2024/10/15 07:47:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/965139444109075764.


In [7]:
#! pip install xgboost
import xgboost as xgb

In [8]:
#!pip install hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [9]:
# Wrap the vectorised feature matrices and their targets in XGBoost DMatrix objects.
train = xgb.DMatrix(data=X_encoded_dict, label=y_train)
valid = xgb.DMatrix(data=X_val_encoded_dict, label=y_val)

In [13]:
# Define the objective function for Hyperopt
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE.

    Each call opens its own MLflow run, logs ``params`` and the resulting
    ``rmse`` metric, and trains on the module-level ``train`` DMatrix while
    monitoring the module-level ``valid`` DMatrix for early stopping.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled by Hyperopt.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE as float>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            # Stop once the validation metric has not improved for 10 rounds.
            early_stopping_rounds=10
        )
        # Predict on the validation set.
        y_pred = booster.predict(valid)

        # RMSE = sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4
        # and removed in 1.6. float() gives Hyperopt/MLflow a plain Python float.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [14]:
# Define the hyperparameter search space.
# hp.loguniform(label, low, high) draws exp(u) with u uniform on [low, high],
# e.g. learning_rate falls in [exp(-3), exp(0)].
search_space = {
    # quniform yields floats; scope.int casts them to the int XGBoost expects.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:squarederror' is the current name of the squared-error objective;
    # 'reg:linear' is its deprecated alias.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run the optimization: TPE proposes parameters, `objective` scores them.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=3,
    trials=Trials()
)



  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:11.82443                         
[1]	validation-rmse:11.36217                         
[2]	validation-rmse:10.93167                         
[3]	validation-rmse:10.53289                         
[4]	validation-rmse:10.16068                         
[5]	validation-rmse:9.81231                          
[6]	validation-rmse:9.48856                          
[7]	validation-rmse:9.18856                          
[8]	validation-rmse:8.91089                          
[9]	validation-rmse:8.65300                          
[10]	validation-rmse:8.41277                         
[11]	validation-rmse:8.19127                         
[12]	validation-rmse:7.98806                         
[13]	validation-rmse:7.79969                         
[14]	validation-rmse:7.62407                         
[15]	validation-rmse:7.46396                         
[16]	validation-rmse:7.31498                         
[17]	validation-rmse:7.17848                         
[18]	validation-rmse:7.05334


2024/10/15 07:47:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run puzzled-panda-334 at: http://127.0.0.1:5000/#/experiments/965139444109075764/runs/fa159a660f754c698b55983289c4455f.

2024/10/15 07:47:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/965139444109075764.



 33%|███▎      | 1/3 [00:09<00:18,  9.31s/trial, best loss: 5.478988060188435]




[0]	validation-rmse:6.99778                                                   
[1]	validation-rmse:7.08041                                                   
[2]	validation-rmse:7.12651                                                   
[3]	validation-rmse:7.14063                                                   
[4]	validation-rmse:7.14257                                                   
[5]	validation-rmse:7.14378                                                   
[6]	validation-rmse:7.14480                                                   
[7]	validation-rmse:7.14597                                                   
[8]	validation-rmse:7.14591                                                   
[9]	validation-rmse:7.14589                                                   
 33%|███▎      | 1/3 [00:12<00:18,  9.31s/trial, best loss: 5.478988060188435]


2024/10/15 07:47:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run enthused-quail-81 at: http://127.0.0.1:5000/#/experiments/965139444109075764/runs/defbab2f09c5456b803f4d668f5ce3c4.

2024/10/15 07:47:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/965139444109075764.



 67%|██████▋   | 2/3 [00:12<00:05,  5.90s/trial, best loss: 5.478988060188435]




[0]	validation-rmse:11.15351                                                  
[1]	validation-rmse:10.18068                                                  
[2]	validation-rmse:9.36289                                                   
[3]	validation-rmse:8.68828                                                   
[4]	validation-rmse:8.12586                                                   
[5]	validation-rmse:7.66435                                                   
[6]	validation-rmse:7.28972                                                   
[7]	validation-rmse:6.99095                                                   
[8]	validation-rmse:6.74443                                                   
[9]	validation-rmse:6.54686                                                   
[10]	validation-rmse:6.38843                                                  
[11]	validation-rmse:6.26286                                                  
[12]	validation-rmse:6.16181                        


2024/10/15 07:47:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run monumental-crow-759 at: http://127.0.0.1:5000/#/experiments/965139444109075764/runs/e7f3d6acfafe4b09bd966f233c60cbc3.

2024/10/15 07:47:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/965139444109075764.



100%|██████████| 3/3 [00:25<00:00,  8.35s/trial, best loss: 5.478988060188435]


In [22]:
# Train with the best parameters and log the model plus its preprocessor.

# Autologging is disabled: parameters, metric and model are logged explicitly below.
mlflow.xgboost.autolog(disable=True)
# Close any run still active in this kernel before opening a new one.
mlflow.end_run()
with mlflow.start_run(run_name="best-of-all"):

    # Hyperparameters copied by hand from an earlier search result.
    best_params = {
        "min_child_weight": 15.768193366204851,
        "max_depth": 32,
        # 'reg:squarederror' is the current name of the squared-error
        # objective; 'reg:linear' is its deprecated alias.
        "objective": 'reg:squarederror',
        "reg_alpha": 0.04944253425989358,
        "seed": 42,
        "reg_lambda": 0.008692830225124612,
        "learning_rate": 0.06790333875985635
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        # Stop once the validation metric has not improved for 10 rounds.
        early_stopping_rounds=10
    )

    y_pred = booster.predict(valid)
    # RMSE = sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4
    # and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
    mlflow.log_metric("rmse", rmse)

    # Save the preprocessor (the dictionary vectorizer) in the local repo.
    # NOTE(review): `dv` must be the vectorizer that was fitted on the
    # training data; an unfitted instance cannot transform inputs at
    # prediction time -- confirm where `dv` is assigned.
    with open("/workspaces/MLOps-TaxiTrpDuration/models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    # Attach the pickled vectorizer to the run as an artifact.
    mlflow.log_artifact("/workspaces/MLOps-TaxiTrpDuration/models/preprocessor.b", artifact_path="preprocessor")

    # Log the trained booster in MLflow's xgboost flavor.
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")


2024/10/15 08:04:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run persistent-crane-66 at: http://127.0.0.1:5000/#/experiments/965139444109075764/runs/317ff7f840e943dbb2d9bd12359c94df.
2024/10/15 08:04:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/965139444109075764.


[0]	validation-rmse:11.71455
[1]	validation-rmse:11.16001




[2]	validation-rmse:10.65057
[3]	validation-rmse:10.18475
[4]	validation-rmse:9.75834
[5]	validation-rmse:9.36743
[6]	validation-rmse:9.01312
[7]	validation-rmse:8.68849
[8]	validation-rmse:8.39393
[9]	validation-rmse:8.12393
[10]	validation-rmse:7.88188
[11]	validation-rmse:7.66182
[12]	validation-rmse:7.46320
[13]	validation-rmse:7.28012
[14]	validation-rmse:7.11633
[15]	validation-rmse:6.96672
[16]	validation-rmse:6.83364
[17]	validation-rmse:6.71377
[18]	validation-rmse:6.60284
[19]	validation-rmse:6.50461
[20]	validation-rmse:6.41383
[21]	validation-rmse:6.33371
[22]	validation-rmse:6.26205
[23]	validation-rmse:6.19772
[24]	validation-rmse:6.13672
[25]	validation-rmse:6.08318
[26]	validation-rmse:6.03157
[27]	validation-rmse:5.98895
[28]	validation-rmse:5.95000
[29]	validation-rmse:5.91232
[30]	validation-rmse:5.87718
[31]	validation-rmse:5.84704
[32]	validation-rmse:5.81888
[33]	validation-rmse:5.79194
[34]	validation-rmse:5.76801
[35]	validation-rmse:5.74433
[36]	validation-rmse

2024/10/15 08:04:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run best-of-all at: http://127.0.0.1:5000/#/experiments/965139444109075764/runs/49cde9daddef4338a3a23a12f1283a26.
2024/10/15 08:04:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/965139444109075764.


In [10]:
#prediction

# Model URI format: runs:/<run_id>/<artifact_path used in log_model>.
# Previously used run, kept for reference:
#logged_model = 'runs:/1908366088af476ab4bd2dfc733e4311/models_mlflow'
# NOTE(review): this run id is hardcoded and does not match any run id shown
# in this notebook's logged output -- confirm it points at the intended run.
logged_model = 'runs:/c36592810dac4653bc0dfd44374a3712/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# Display the loaded model's metadata (artifact path, flavor, run id).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: c36592810dac4653bc0dfd44374a3712

In [6]:
# Load the same logged model through MLflow's xgboost flavor instead of the
# generic pyfunc wrapper; the prediction cell calls it with a DMatrix.
xgboost_model = mlflow.xgboost.load_model(logged_model)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [17]:
# Testing the prediction: score the validation DMatrix with the reloaded
# model and display the predicted durations.
y_pred1 = xgboost_model.predict(valid)
y_pred1

array([20.39988  ,  6.1361213, 14.035301 , ..., 11.471176 ,  9.013139 ,
        6.6934114], dtype=float32)