In [1]:
# Record the Python interpreter version found on the shell PATH for this run.
!python -V

Python 3.9.21


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso

In [5]:
from sklearn.metrics import mean_squared_error,root_mean_squared_error
import math
from sklearn.feature_extraction import DictVectorizer

In [6]:
# pyarrow is the parquet engine used by pd.read_parquet below.
# %pip (instead of !pip) installs into the environment of the running kernel;
# a shell `pip` may belong to a different interpreter on PATH.
%pip install pyarrow



In [7]:
import mlflow

# Where MLflow stores run metadata, and the experiment all runs below go to.
TRACKING_DB_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_DB_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/2', creation_time=1742825619994, experiment_id='2', last_update_time=1742825619994, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [8]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (minutes between ``lpep_pickup_datetime`` and
    ``lpep_dropoff_datetime``), keeps only trips lasting between 1 and 60
    minutes inclusive, and casts ``PULocationID`` / ``DOLocationID`` to
    strings so they can be treated as categorical features.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent, filtered frame that callers may mutate freely.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes (replaces a per-row Python lambda).
    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the source frame, so the column
    # assignment below (and any later assignment by callers) writes to a real
    # frame instead of a slice and does not raise SettingWithCopyWarning.
    within_range = (df.duration >= 1) & (df.duration <= 60)
    df = df[within_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df
   

In [9]:
# January 2021 is the training split, February 2021 the validation split.
train_path = './data/green_tripdata_2021-01.parquet'
val_path = './data/green_tripdata_2021-02.parquet'

df_train, df_val = (read_dataframe(path) for path in (train_path, val_path))

In [10]:
# Row counts of the two splits after filtering.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [11]:
# Combine pickup and drop-off zone into a single "route" feature on both splits.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + "_" + frame['DOLocationID']

In [12]:
# Feature columns: one categorical (the pickup/drop-off pair) and one numeric.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# DictVectorizer consumes a list of {column: value} records per row.
train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vocabulary on the training split only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [13]:
# Regression target: trip duration, as computed in read_dataframe.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [14]:
# Baseline: ordinary least squares, scored by RMSE on the validation split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)
root_mean_squared_error(y_val, y_pred)

7.758715208009878

In [15]:
import os

# Create the output directory if needed so the dump below does not fail with
# FileNotFoundError when ./models has not been created yet.
os.makedirs("./models", exist_ok=True)

# Store the fitted vectorizer together with the model in one pickle file.
with open("./models/lin_reg.bin", "wb") as fout:
    pickle.dump((dv, lr), fout)

In [19]:
with mlflow.start_run():
    # Run metadata: author tag, data provenance and the regularisation strength.
    mlflow.set_tag("developer", "mudathir")

    alpha = 0.001
    mlflow.log_params({
        "train_data_path": "./data/green_tripdata_2021-01.parquet",
        "validation_data_path": "./data/green_tripdata_2021-02.parquet",
        "alpha": alpha,
    })

    # Fit a Lasso model and record its validation RMSE.
    lr = Lasso(alpha=alpha).fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Pickle vectorizer + model locally, then attach the file to the run.
    model_path = "./models/lin_reg_new.bin"
    with open(model_path, "wb") as model_file:
        pickle.dump((dv, lr), model_file)
    mlflow.log_artifact(local_path=model_path, artifact_path="models_pickle")

In [15]:
import xgboost as xgb
from hyperopt import hp,fmin,tpe,Trials, STATUS_OK
from hyperopt.pyll import scope

In [16]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [19]:
def objective(params):
    """Train one XGBoost candidate and return its validation RMSE for hyperopt.

    Every call is recorded as its own MLflow run: the run is tagged
    ``model=xgboost_v2``, ``params`` are logged as run parameters and the
    resulting RMSE is logged as the ``rmse`` metric.

    Parameters
    ----------
    params : dict
        XGBoost training parameters, passed unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost_v2")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, "validation")],
            early_stopping_rounds=50
        )

        # With early stopping the booster still contains the rounds trained
        # after the best one. Restrict prediction to the best iteration so the
        # loss reported to hyperopt is that of the model early stopping chose.
        y_pred = booster.predict(
            valid,
            iteration_range=(0, booster.best_iteration + 1),
        )
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}


In [20]:
# Search space handed to hyperopt. The loguniform bounds are exponents, i.e.
# the sampled value lies between exp(low) and exp(high).
search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3),
    # "reg:squarederror" is the current name of the squared-error objective;
    # the former alias "reg:linear" is deprecated in XGBoost.
    "objective": "reg:squarederror",
    "seed": 42,
}

# Run 20 TPE-guided trials; each trial calls objective() once.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials(),
)

In [21]:
# Fixed hyperparameter set used by the training cells that follow.
# "reg:squarederror" replaces the deprecated alias "reg:linear".
best_params = {
    "learning_rate": 0.09014190119550018,
    "max_depth": 81,
    "min_child_weight": 2.0452967381046134,
    "objective": "reg:squarederror",
    "reg_alpha": 0.07820874638976405,
    "reg_lambda": 0.08033992266454794,
    "seed": 42,
}

In [None]:
# Enable MLflow's XGBoost autologging before training.
mlflow.xgboost.autolog()

# Same training setup as the tuning trials, now with the fixed best_params.
training_options = dict(
    params=best_params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50,
)
booster = xgb.train(**training_options)

2025/03/25 11:37:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f51bf619b2dd443887259f966564f674', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


In [None]:
# Turn XGBoost autologging back off; the runs below log their data explicitly.
mlflow.xgboost.autolog(disable=True)

In [18]:
from mlflow import MlflowClient
# Client for querying runs stored in the local SQLite tracking database.
mlflow_tracking_uri = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=mlflow_tracking_uri)

In [19]:
# Fetch the parameters recorded for an earlier run, identified by its run ID.
desired_run = client.get_run("707232543b814e74aca48be6c822f9af")
desired_run_params = desired_run.data.params
print(desired_run_params)

with mlflow.start_run():
    mlflow.set_tag("model", "xgboost3")
    mlflow.log_params(desired_run_params)

    # Retrain with exactly the parameters read back from the tracking store.
    training_options = dict(
        params=desired_run_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )
    booster = xgb.train(**training_options)

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Save the fitted DictVectorizer and attach it to the run as an artifact.
    preprocessor_path = "models/preprocessor.b"
    with open(preprocessor_path, "wb") as preprocessor_file:
        pickle.dump(dv, preprocessor_file)
    mlflow.log_artifact(preprocessor_path, "preprocessor")

    # Log the booster itself through MLflow's XGBoost flavor.
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")


{'learning_rate': '0.46400666326762463', 'max_depth': '70', 'min_child_weight': '3.7768059476285445', 'objective': 'reg:linear', 'reg_alpha': '0.02560203077527551', 'reg_lambda': '0.0027861679673304076', 'seed': '42'}




[0]	validation-rmse:8.74318
[1]	validation-rmse:7.35867
[2]	validation-rmse:6.84527
[3]	validation-rmse:6.65016
[4]	validation-rmse:6.56333
[5]	validation-rmse:6.51747
[6]	validation-rmse:6.49565
[7]	validation-rmse:6.48048
[8]	validation-rmse:6.46901
[9]	validation-rmse:6.46176
[10]	validation-rmse:6.45761
[11]	validation-rmse:6.45307
[12]	validation-rmse:6.44842
[13]	validation-rmse:6.44441
[14]	validation-rmse:6.43924
[15]	validation-rmse:6.43646
[16]	validation-rmse:6.43316
[17]	validation-rmse:6.43058
[18]	validation-rmse:6.42857
[19]	validation-rmse:6.42638
[20]	validation-rmse:6.42000
[21]	validation-rmse:6.41924
[22]	validation-rmse:6.41659
[23]	validation-rmse:6.41574
[24]	validation-rmse:6.41517
[25]	validation-rmse:6.41249
[26]	validation-rmse:6.41139
[27]	validation-rmse:6.41021
[28]	validation-rmse:6.40816
[29]	validation-rmse:6.40669
[30]	validation-rmse:6.40535
[31]	validation-rmse:6.40454
[32]	validation-rmse:6.40363
[33]	validation-rmse:6.40188
[34]	validation-rmse:6.4



In [25]:
with mlflow.start_run():
    # Hyperparameters for this run. "reg:squarederror" replaces the
    # deprecated objective alias "reg:linear".
    best_params = {
        "learning_rate": 0.09014190119550018,
        "max_depth": 81,
        "min_child_weight": 2.0452967381046134,
        "objective": "reg:squarederror",
        "reg_alpha": 0.07820874638976405,
        "reg_lambda": 0.08033992266454794,
        "seed": 42,
    }
    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, "validation")],
        early_stopping_rounds=50
    )

    # Score the trained booster on the validation split and record the RMSE.
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Save the fitted DictVectorizer and attach it to the run as an artifact.
    with open("models/preprocessor.b", "wb") as fout:
        pickle.dump(dv, fout)
    mlflow.log_artifact("models/preprocessor.b", "preprocessor")

    # Log the booster itself through MLflow's XGBoost flavor.
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")




[0]	validation-rmse:11.47766
[1]	validation-rmse:10.82777
[2]	validation-rmse:10.25528
[3]	validation-rmse:9.75138
[4]	validation-rmse:9.31152
[5]	validation-rmse:8.92713
[6]	validation-rmse:8.59196
[7]	validation-rmse:8.30042
[8]	validation-rmse:8.04990
[9]	validation-rmse:7.83213
[10]	validation-rmse:7.64566
[11]	validation-rmse:7.48518
[12]	validation-rmse:7.34754
[13]	validation-rmse:7.22885
[14]	validation-rmse:7.12675
[15]	validation-rmse:7.03905
[16]	validation-rmse:6.96432
[17]	validation-rmse:6.89944
[18]	validation-rmse:6.84370
[19]	validation-rmse:6.79371
[20]	validation-rmse:6.75060
[21]	validation-rmse:6.71443
[22]	validation-rmse:6.68179
[23]	validation-rmse:6.65457
[24]	validation-rmse:6.63091
[25]	validation-rmse:6.60932
[26]	validation-rmse:6.59090
[27]	validation-rmse:6.57441
[28]	validation-rmse:6.55938
[29]	validation-rmse:6.54532
[30]	validation-rmse:6.53304
[31]	validation-rmse:6.52349
[32]	validation-rmse:6.51423
[33]	validation-rmse:6.50551
[34]	validation-rmse:



In [27]:
# URI of the "models_mlflow" artifact logged under an earlier MLflow run.
logged_model = 'runs:/e52d64c57a854c83b29bb398265fb3de/models_mlflow'
# Load the artifact through the generic pyfunc flavor
# (loaded_model is not used further in this cell).
loaded_model = mlflow.pyfunc.load_model(logged_model)
# Load the same artifact through the XGBoost flavor.
loaded_model_xgboost=mlflow.xgboost.load_model(logged_model)
# Predict with the XGBoost-flavor object on `valid`, the validation DMatrix
# built earlier (not a pandas DataFrame).
y_pred=loaded_model_xgboost.predict(valid)

In [29]:
# First five predictions, then the first five true durations.
print(y_pred[:5], y_val[:5], sep="\n")

[14.159118  6.87823  13.207436 25.13605   9.614906]
[17.91666667  6.5        15.25       18.23333333  8.96666667]


HW Solution