In [5]:
import pickle
import pandas as pd

In [33]:
from sklearn.metrics import root_mean_squared_error

In [3]:
file_path = 'models/lin_reg.bin'

# Restore the (dv, lr) pair stored in the pickle file. Per the recorded output of
# this cell, dv is a DictVectorizer and lr is a LinearRegression.
# NOTE: pickle.load() can execute arbitrary code while deserializing, so only
# load files that come from a trusted source.
try:
    with open(file_path, 'rb') as file_in:  # 'rb' = read binary
        # The file holds a 2-tuple, unpacked here into dv and lr.
        dv, lr = pickle.load(file_in)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Make sure the path is correct.")
    # Re-raise: later cells need dv and lr, so continuing would only defer the
    # failure to a less obvious NameError.
    raise
except pickle.UnpicklingError:
    print(f"Error: Could not unpickle the file '{file_path}'. It might be corrupted or not a pickle file.")
    raise
except ImportError as e:
    print(f"Error: A class definition needed for unpickling is missing: {e}")
    print("Ensure all necessary libraries (e.g., scikit-learn) are installed and imported.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    # Only reached when loading succeeded, so dv and lr are guaranteed to exist.
    print("Objects loaded successfully!")
    print("Type of dv_loaded:", type(dv))
    print("Type of lr_loaded:", type(lr))

Objects loaded successfully!
Type of dv_loaded: <class 'sklearn.feature_extraction._dict_vectorizer.DictVectorizer'>
Type of lr_loaded: <class 'sklearn.linear_model._base.LinearRegression'>


In [6]:
def read_dataframe(filename):
    """Load a trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path ending in ``.csv`` or ``.parquet``. CSV timestamps are parsed
        with ``pd.to_datetime``; parquet files are used as read.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration is between 1 and 60 minutes (inclusive),
        with an added ``duration`` column (minutes, float) and with
        ``PULocationID`` / ``DOLocationID`` converted to strings.

    Raises
    ------
    ValueError
        If ``filename`` has neither a ``.csv`` nor a ``.parquet`` extension.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound and the function would
        # fail later with an unhelpful UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for '{filename}': expected a .csv or .parquet file."
        )

    # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the unfiltered one, so the
    # column assignment below does not raise SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df
# Read the two monthly files (2021-01 as training data, 2021-02 as validation data).
df_train = read_dataframe('../data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2021-02.parquet')

# Add the combined pickup/drop-off key to both frames.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

# Extract the regression target as NumPy arrays.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [7]:
# Read both monthly files again; this rebinds df_train / df_val to freshly
# loaded frames (any columns added to the previous frames are not carried over).
df_train, df_val = (
    read_dataframe('../data/green_tripdata_2021-01.parquet'),
    read_dataframe('../data/green_tripdata_2021-02.parquet'),
)

In [None]:
# Combine pickup and drop-off zone into a single categorical key on both frames.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

categorical = ['PU_DO']  # used instead of the separate 'PULocationID' / 'DOLocationID' columns
numerical = ['trip_distance']

# Convert the selected columns to per-row dicts, the input format passed to dv.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# dv is already fitted (it was loaded from the pickled file), so only
# transform() is called here rather than fit_transform().
X_train = dv.transform(train_dicts)
X_val = dv.transform(val_dicts)

# Regression target as NumPy arrays.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [29]:
# Sanity check: referencing an undefined name raises NameError, so evaluating
# this tuple confirms that every object needed below exists in the kernel.
try:
    X_train, y_train, X_val, y_val, dv, lr
except NameError:
    print('objects missing')
else:
    print('all objects loaded successfully')

all objects loaded successfully


In [44]:
# Imported in this cell because `xgb` is otherwise only imported in a later
# cell; on a fresh top-to-bottom run this cell would fail with NameError.
import xgboost as xgb

# Wrap the feature matrices and targets in DMatrix objects, the input type
# passed to xgb.train() and Booster.predict() further down.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

Logging the training run with MLflow

In [42]:
import mlflow
import xgboost as xgb

# Store runs in the local SQLite database file mlflow.db and select the
# experiment that the runs below are recorded under.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc_taxi_experiment_2_4')

In [51]:
import mlflow.xgboost

# Train the XGBoost regressor inside a single MLflow run. Parameters, the
# validation RMSE, the preprocessor and the model are logged manually rather
# than through mlflow.xgboost.autolog().
with mlflow.start_run():

    # Hyperparameters of the selected configuration.
    # "reg:squarederror" is the current name of the squared-error objective;
    # the older alias "reg:linear" is deprecated.
    best_params = {
        "learning_rate": 0.17846516109979893,
        "max_depth": 43,
        "min_child_weight": 1.3599886311054774,
        "objective": "reg:squarederror",
        "reg_alpha": 0.10102327187602764,
        "reg_lambda": 0.33287636923295955,
        "seed": 42,
    }

    mlflow.log_params(best_params)

    # Up to 1000 boosting rounds; training stops early once the validation
    # metric has not improved for 50 consecutive rounds.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )

    # Validation error of the trained booster.
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)

    mlflow.log_metric('rmse', rmse)

    # Persist the vectorizer next to the model so the same preprocessing can be
    # reapplied when the model is loaded later.
    preprocessor_path = 'models/preprocessor.b'
    with open(preprocessor_path, 'wb') as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(preprocessor_path, artifact_path='preprocessor')

    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')



[0]	validation-rmse:10.79100
[1]	validation-rmse:9.70032
[2]	validation-rmse:8.87296
[3]	validation-rmse:8.25867
[4]	validation-rmse:7.80031
[5]	validation-rmse:7.46694
[6]	validation-rmse:7.22031
[7]	validation-rmse:7.03847
[8]	validation-rmse:6.90785
[9]	validation-rmse:6.80955
[10]	validation-rmse:6.73655
[11]	validation-rmse:6.68127
[12]	validation-rmse:6.64016
[13]	validation-rmse:6.60587
[14]	validation-rmse:6.57795
[15]	validation-rmse:6.55628
[16]	validation-rmse:6.54059
[17]	validation-rmse:6.52702
[18]	validation-rmse:6.51484
[19]	validation-rmse:6.50552
[20]	validation-rmse:6.49747
[21]	validation-rmse:6.49167
[22]	validation-rmse:6.48647
[23]	validation-rmse:6.48204
[24]	validation-rmse:6.47747
[25]	validation-rmse:6.47459
[26]	validation-rmse:6.47236
[27]	validation-rmse:6.47108
[28]	validation-rmse:6.46950
[29]	validation-rmse:6.46756
[30]	validation-rmse:6.46562
[31]	validation-rmse:6.46331
[32]	validation-rmse:6.46156
[33]	validation-rmse:6.45992
[34]	validation-rmse:6.



Loading the model back from the logged MLflow artifact:

In [52]:
# URI of the model logged under 'models_mlflow' by run 3eb110884b3a47fa89c621fac277ba3b.
logged_model = 'runs:/3eb110884b3a47fa89c621fac277ba3b/models_mlflow'

# Load the model through MLflow's generic pyfunc interface.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)

# Usage hint: the pyfunc wrapper can predict on a pandas DataFrame, e.g.
#   loaded_model.predict(pd.DataFrame(data))



In [53]:
# Display the pyfunc wrapper's summary (artifact path, flavor and run id).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 3eb110884b3a47fa89c621fac277ba3b

In [None]:
# Load the same logged model through the XGBoost flavor instead of pyfunc.
# logged_model is the run URI: 'runs:/3eb110884b3a47fa89c621fac277ba3b/models_mlflow'
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)



In [55]:
# Display the loaded object; the recorded output shows an xgboost.core.Booster.
xgboost_model

<xgboost.core.Booster at 0x18de275e0>

In [56]:
# Score the validation DMatrix with the reloaded booster.
y_pred = xgboost_model.predict(data=valid)

In [57]:
# Display the predictions; NumPy abbreviates the long array in its repr.
y_pred

array([14.422415 ,  7.131102 , 14.262912 , ..., 13.464924 ,  6.0386367,
        8.0312605], dtype=float32)