# Models

### Import Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import xgboost as xgb
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
import openpyxl
import os
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=UserWarning)
import pgeocode
from geopy.distance import geodesic
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb

from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import root_mean_squared_error

### Read the excel files
Workbooks containing the original concentrations, the facility details, the analyte metadata, and the ZIP codes used to compute distances.

##### Read files

In [4]:
# --- Workbook locations (absolute paths on the author's machine) -------------
file_path = '/Users/leichen/Desktop/WasteWater/data/Trace Organics Full.xlsx'
file_path1 = '/Users/leichen/Desktop/WasteWater/data/Facilities.xlsx'
file_path2 = '/Users/leichen/Desktop/WasteWater/data/Analytes.xlsx'
file_path3 = '/Users/leichen/Desktop/WasteWater/data/ZipCodes.xlsx'

# --- Containers consumed by the pre-processing cell --------------------------
all_data = []
sheets_to_exclude = ['Manhole 1', 'Manhole 2', 'Manhole 3']

# sheet_name=None loads every worksheet into a dict keyed by sheet name;
# the first three rows of each sheet are skipped.
all_sheets = pd.read_excel(file_path, skiprows=3, sheet_name=None)

Facilities = pd.read_excel(file_path1, header=0)
#Facilities['Population']=Facilities['Population']/1000
# Scale DailyFlow by 3.78541 — presumably US gallons -> litres; TODO confirm
# the units stored in Facilities.xlsx.
Facilities['DailyFlow'] = Facilities['DailyFlow'].mul(3.78541)

Analytes = pd.read_excel(file_path2, header=0)
Zips = pd.read_excel(file_path3, header=0)

##### Calculate the distance from the centroid of each facility's ZIP codes to the center of Las Vegas

In [6]:
# Postal-code lookup object from pgeocode; 'us' selects the United States.
nomi = pgeocode.Nominatim('us')
# Parallel accumulators: one facility name and one distance per facility.
facilityList, distanceList = [], []
 
def calculate_centroid(coords):
    """Return the arithmetic-mean point of a group of coordinate pairs.

    Parameters
    ----------
    coords : iterable of 2-tuples
        Pairs whose first element is treated as latitude and whose second
        element is treated as longitude.

    Returns
    -------
    tuple
        ``(mean of first elements, mean of second elements)``. This is a plain
        average of the numbers, not a spherical centroid.

    Raises
    ------
    ValueError
        If ``coords`` is empty (there is nothing to unpack).
    """
    columns = tuple(zip(*coords))
    latitudes, longitudes = columns
    mean_lat = np.mean(latitudes)
    mean_lon = np.mean(longitudes)
    return (mean_lat, mean_lon)

# Reference point used for every distance: (latitude, longitude) in degrees.
las_vegas_coords = (36.1699, -115.1398)

for k in Zips['Facility'].unique():
    zip_codes_group = Zips[Zips['Facility'] == k]
    # Look every ZIP code up exactly once and read both coordinates from the
    # same result (the lookup used to be issued twice per ZIP code).
    zip_group = []
    for z in zip_codes_group['ZipCode']:
        location = nomi.query_postal_code(z)
        zip_group.append((location.latitude, location.longitude))
    centroid = calculate_centroid(zip_group)
    distance_to_las_vegas = geodesic(centroid, las_vegas_coords).kilometers
    facilityList.append(k)
    distanceList.append(distance_to_las_vegas)
    print(f"Centroid of Group {k}:", centroid)
    print(f"Distance between the two centroids: {distance_to_las_vegas:.3f} km")

# Build the lookup table once, after the loop. It is therefore also defined
# (as an empty frame) when Zips contains no facilities.
zip_distances = pd.DataFrame({
    'Facility': facilityList,
    'Distance': distanceList
})

Centroid of Group Facility_1: (36.10901304347825, -115.17888260869564)
Distance between the two centroids: 7.617 km
Centroid of Group Facility_2: (35.94515, -115.14779999999999)
Distance between the two centroids: 24.949 km
Centroid of Group Facility_3: (36.20858947368422, -115.23982105263157)
Distance between the two centroids: 9.968 km
Centroid of Group Facility_4A: (36.047666666666665, -114.94993333333333)
Distance between the two centroids: 21.822 km
Centroid of Group Facility_4B: (36.03556666666666, -115.069)
Distance between the two centroids: 16.212 km
Centroid of Group Facility_5: (36.27775, -115.1738625)
Distance between the two centroids: 12.353 km
Centroid of Group Facility_6: (35.9727, -114.8344)
Distance between the two centroids: 35.152 km
Centroid of Group Facility_4: (36.04161666666666, -115.00946666666668)
Distance between the two centroids: 18.448 km


##### Pre-process & merge the data files

In [8]:
# Start from an empty list so that re-running this cell does not append every
# sheet a second time (all_data is first created in the file-loading cell).
all_data = []
for date, df in all_sheets.items():
    if date in sheets_to_exclude:
        continue
    # 0-based column positions: 1 becomes 'Analyte', 3..10 become the eight
    # facility columns named below. .copy() makes the slice independent so
    # renaming its columns does not touch the original sheet.
    df_relevant = df.iloc[:, [1] + list(range(3, 11))].copy()

    # Rename the columns for ease of reference
    df_relevant.columns = ['Analyte', 'Facility_1', 'Facility_2', 'Facility_3', 'Facility_4', 'Facility_4A', 'Facility_4B', 'Facility_5', 'Facility_6']

    # Wide -> long: one row per (Analyte, Facility) pair
    df_melted = df_relevant.melt(id_vars=['Analyte'], var_name='Facility', value_name='Consumption')

    # The sheet name carries the sampling date
    df_melted['Date'] = date

    all_data.append(df_melted)

# Concatenate all the DataFrames into one
final_df = pd.concat(all_data, ignore_index=True)
# Values such as '<0.5' lose the '<' and are kept as the number that follows;
# anything that still is not numeric becomes NaN.
final_df['Consumption'] = final_df['Consumption'].astype(str)
final_df['Consumption'] = pd.to_numeric(final_df['Consumption'].str.replace('<', '', regex=False), errors='coerce')
final_df['Date'] = pd.to_datetime(final_df['Date'], format='%m_%d_%y')
# Inner join: rows whose Facility is missing from Facilities are dropped.
final_data = pd.merge(final_df, Facilities, on='Facility')
final_data = pd.merge(final_data, Analytes, on='Analyte', how='left')
final_data = pd.merge(final_data, zip_distances, on='Facility', how='left')
#final_data['Consump1']=(final_data['Consumption']*final_data['CF']*final_data['DailyFlow'])/(final_data['Population'])
# Per-capita load: concentration * daily flow / population.
final_data['Consump1'] = (final_data['Consumption'] * final_data['DailyFlow']) / (final_data['Population'])
final_data1 = final_data.copy()
final_data1.index = pd.to_datetime(final_data1['Date'])

##### Functions
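1. create_lag: adds lagged consumption columns within each group (`lag_1` = previous observation, `lag_2` = the observation before that, i.e. one and two months back when sampling is monthly)
2. create_features: adds calendar features (quarter, month, day of year, ISO week) derived from the date index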

In [10]:
def create_lag(data, cov, lag, target='Consump1'):
    """Add ``lag_1`` .. ``lag_<lag>`` columns holding earlier values of ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    cov : str or list of str
        Column name(s) defining the groups; lags never cross a group boundary.
    lag : int
        Number of lag columns to create. Values below 1 create none.
    target : str, default 'Consump1'
        Column whose earlier values are copied into the lag columns.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with the lag columns added or overwritten.

    Notes
    -----
    ``shift`` moves values by row position inside each group, so the rows of
    every group must already be in chronological order for the lags to mean
    "previous observation".
    """
    # Accept a single column name as well as a list of names.
    if isinstance(cov, str):
        cov = [cov]

    # The grouping does not change between lags, so build it once.
    grouped_target = data.groupby(cov)[target]
    for i in range(1, lag + 1):
        data[f'lag_{i}'] = grouped_target.shift(i)

    return data

In [12]:
def create_features(df):
    """Add calendar columns derived from the frame's index.

    The index is converted with ``pd.to_datetime`` and written back, then four
    columns are added: ``quarter``, ``month``, ``dayofyear`` and ``weekofyear``
    (the ISO calendar week). The frame is modified in place and returned.
    """
    stamps = pd.to_datetime(df.index)
    df.index = stamps
    calendar_parts = {
        'quarter': stamps.quarter,
        'month': stamps.month,
        'dayofyear': stamps.dayofyear,
    }
    for column, values in calendar_parts.items():
        df[column] = values
    # isocalendar() returns a frame indexed like df, so this aligns row by row.
    df['weekofyear'] = df.index.isocalendar().week
    return df

##### Prepare the datasets for the model

In [14]:
# A single call with lag=2 creates both lag_1 and lag_2; the earlier separate
# lag=1 call produced a column that was immediately overwritten.
# NOTE(review): create_lag shifts by row position, so this assumes the sheets
# were read in chronological order — confirm against the workbook.
data_input = create_lag(final_data1, ['Facility', 'Analyte'], 2)
data_input = create_features(data_input)
data_input['DetailedCat'] = data_input['DetailedCat'].astype('category')
data_input['Analyte'] = data_input['Analyte'].astype('category')
# Rows without two earlier observations cannot supply both lag features.
data_input = data_input.dropna(subset=['lag_1', 'lag_2'])
# Cast Facility here as well, so cells that use it as a categorical feature do
# not depend on a later cell having been run first.
data_input['Facility'] = data_input['Facility'].astype('category')

# d2 = data_input.sort_values(by=['Analyte', 'Facility'], ascending=[True, True])
# d2.to_excel('/Users/leichen/Desktop/WasteWater/results/2.xlsx', index=False)

##### Create functions
1. split_train_test: split the data into training and test sets at a cut-off date
2. x_y: return X_train, y_train, X_test, y_test
3. xgb_based: build the base XGB model
4. naive_forecast: naive baseline that predicts the previous observation (lag_1)
5. plot_pred_true: plots comparing predicted values vs. true values
6. plot_difference: plots the difference between the naive RMSE and the XGB RMSE
7. xgb_method: general XGB method
8. LightGBM_method: general LightGBM method

In [16]:
def split_train_test(df, percentage):
    """Split a date-indexed frame into train and test parts at a cut-off date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the observation dates (duplicates allowed).
    percentage : float
        Target share of rows for the training part.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` holds every row dated on or before the cut-off date, ``test``
        every row dated after it. Rows keep their original order. Because all
        rows sharing the cut-off date go to ``train``, its share can be larger
        than ``percentage``.

    Notes
    -----
    The cut-off is the date at position ``train_size - 1`` of the *sorted*
    index, so the result does not depend on the row order of ``df``. For an
    index that is already sorted this matches the positional lookup used
    before. If the requested training size is zero (including an empty frame)
    an empty train part and a copy of the full frame are returned instead of
    raising ``IndexError``.
    """
    train_size = min(int(len(df) * percentage), len(df))
    if train_size <= 0:
        # Copies, so callers that add columns to the parts never touch df.
        return df.iloc[:0].copy(), df.copy()
    split_date = df.index.sort_values()[train_size - 1]
    train = df.loc[df.index <= split_date]
    test = df.loc[df.index > split_date]
    return train, test

In [18]:
def x_y(df,pct,features, target):
    """Split ``df`` with ``split_train_test`` and return model inputs.

    Returns the tuple ``(X_train, y_train, X_test, y_test)``, where the X
    parts are the ``features`` columns and the y parts the ``target`` column
    of the respective split.
    """
    train_part, test_part = split_train_test(df, pct)
    return (
        train_part[features],
        train_part[target],
        test_part[features],
        test_part[target],
    )

In [20]:
def xgb_based(df, a, f, features, target, pct=0.8):
    """Fit the baseline XGBoost regressor on ``df`` and score it on the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Date-indexed frame containing ``features`` and ``target``.
    a, f :
        Not used by this function; kept so existing calls keep working.
        NOTE(review): presumably an analyte and a facility selector — confirm
        whether ``df`` was meant to be filtered by them.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    pct : float, default 0.8
        Training share passed to ``split_train_test``.

    Returns
    -------
    (test, score) : (pandas.DataFrame, float)
        A copy of the test split with an added ``xgb_pred`` column, and the
        RMSE between ``target`` and ``xgb_pred``.
    """
    # Derive the splits from df itself; the function used to read X_train,
    # y_train, X_test, y_test and test from whatever the notebook had defined.
    train, test = split_train_test(df, pct)
    test = test.copy()
    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    reg = xgb.XGBRegressor(
        base_score=0.5,
        booster='gbtree',
        n_estimators=1000,
        early_stopping_rounds=50,
        max_depth=3,
        learning_rate=0.01,
        # 'reg:linear' is the deprecated name of this objective.
        objective='reg:squarederror',
    )
    # NOTE: the test split is also the early-stopping set, so the score below
    # is optimistic compared with a truly held-out set.
    reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
    test['xgb_pred'] = reg.predict(X_test)
    score = np.sqrt(mean_squared_error(test[target], test['xgb_pred']))
    return test, score

In [22]:
def naive_forecast(train, test):
    """Return the RMSE of a persistence baseline on ``test``.

    The baseline prediction for each test row is that row's ``lag_1`` value,
    compared against its ``Consump1`` value. ``train`` is accepted but not
    used.
    """
    actual = test['Consump1']
    previous = test['lag_1']
    mse = mean_squared_error(actual, previous)
    rmse_naive = np.sqrt(mse)
    return rmse_naive

In [24]:
def xgb_method(X_train,y_train,X_test,y_test,features):
    """Fit an XGBoost regressor with native categorical support.

    Parameters
    ----------
    X_train, y_train : training features (DataFrame) and target.
    X_test, y_test : evaluation features and target; logged during fitting.
    features : not used; kept so existing calls keep working.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    # `'category' in X_train.dtypes` tested the *column names* (the index of
    # the dtypes Series), not the dtypes, so it was False unless a column was
    # literally called 'category'. Inspect the dtypes instead.
    has_categorical = not X_train.select_dtypes(include='category').columns.empty
    # 'hist' supports categorical splits on CPU; the former 'gpu_hist' branch
    # is deprecated and needs a CUDA build.
    model = xgb.XGBRegressor(
        enable_categorical=True,
        tree_method='hist' if has_categorical else 'auto',
    )
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
    return model

In [26]:
def LightGBM_method(X_train,y_train,X_test,y_test,categorical_features):
    """Train a LightGBM regression booster with early stopping.

    The training and evaluation frames are wrapped in ``lgb.Dataset`` objects
    with ``categorical_features`` declared as categorical. Training stops once
    the validation metric has not improved for 10 rounds; progress is logged
    every 10 rounds. Returns the trained booster.
    """
    # Regression objective, RMSE metric, gradient-boosted trees.
    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': 0.1,
        'verbose': -1,
    }

    training_set = lgb.Dataset(
        X_train, label=y_train, categorical_feature=categorical_features
    )
    validation_set = lgb.Dataset(
        X_test,
        label=y_test,
        reference=training_set,
        categorical_feature=categorical_features,
    )

    training_callbacks = [early_stopping(stopping_rounds=10), log_evaluation(10)]
    booster = lgb.train(
        lgb_params,
        training_set,
        valid_sets=[training_set, validation_set],
        callbacks=training_callbacks,
    )
    return booster

# Models

#### General Model

In [28]:
def methods(method,f_new,pct,categorical_features):
    """Train one gradient-boosting model and compare it with the naive baseline.

    Parameters
    ----------
    method : str
        'XGB' selects ``xgb_method``; any other value selects
        ``LightGBM_method``.
    f_new : iterable of str
        Feature columns added to the fixed base set
        (month, quarter, weekofyear, lag_1, lag_2).
    pct : float
        Training share passed to ``split_train_test``.
    categorical_features : list of str
        Passed to ``LightGBM_method`` only.

    Returns
    -------
    pandas.DataFrame
        One row with the model name, the joined feature list, the training
        share, both RMSE values, which one is smaller, and their difference.
        The 'smaller' column says 'xgb' for either boosting method.

    Works on a copy of the notebook-level ``data_input`` frame.
    """
    target = 'Consump1'
    features = ['month', 'quarter', 'weekofyear', 'lag_1', 'lag_2'] + list(f_new)

    data_input1 = data_input.copy()
    train, test = split_train_test(data_input1, pct)
    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    score_naive = naive_forecast(train, test)

    if method == 'XGB':
        model = xgb_method(X_train, y_train, X_test, y_test, features)
    else:
        model = LightGBM_method(X_train, y_train, X_test, y_test, categorical_features)

    test['xgb_pred'] = model.predict(X_test)
    score_xgb = np.sqrt(mean_squared_error(test['Consump1'], test['xgb_pred']))

    df_rmse = pd.DataFrame({
        'Model': [method],
        'Features': '+'.join(features),
        'TrainingPct': [pct],
        'GB_rmse': [score_xgb],
        'Naive_rmse': [score_naive],
    })
    df_rmse['smaller'] = [
        'xgb' if gb < naive else 'naive'
        for gb, naive in zip(df_rmse['GB_rmse'], df_rmse['Naive_rmse'])
    ]
    df_rmse['Difference'] = df_rmse['Naive_rmse'] - df_rmse['GB_rmse']
    return df_rmse
    

In [30]:
# LightGBM on the base features plus DetailedCat (categorical) and Distance,
# trained on the first 80% of the rows by date.
methods(
    method='LightGBM',
    f_new=['DetailedCat', 'Distance'],
    pct=.8,
    categorical_features=['DetailedCat'],
)

Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 6.43086	valid_1's rmse: 5.53805
[20]	training's rmse: 4.86706	valid_1's rmse: 3.84266
[30]	training's rmse: 4.41376	valid_1's rmse: 3.63784
[40]	training's rmse: 4.21612	valid_1's rmse: 3.63915
[50]	training's rmse: 4.04018	valid_1's rmse: 3.59954
Early stopping, best iteration is:
[48]	training's rmse: 4.07108	valid_1's rmse: 3.56538


Unnamed: 0,Model,Features,TrainingPct,GB_rmse,Naive_rmse,smaller,Difference
0,LightGBM,month+quarter+weekofyear+lag_1+lag_2+DetailedC...,0.8,3.565385,5.159748,xgb,1.594363


## XGB models

##### XGB with DetailedCat and Analyte as categorical features

In [93]:
# Initialize empty lists to store the values
xgb_list = []
naive_list = []
# features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','DetailedCat']
features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','DetailedCat','Analyte']
target = 'Consump1'
pct = 0.8

# data_input = data_input[data_input['Facility']!='Facility_4']

data_input1 = data_input.copy()
train, test = split_train_test(data_input1, pct)
X_train, y_train, X_test, y_test = x_y(data_input1, pct, features, target)
# `'category' in X_train.dtypes` tested the column names, not the dtypes, so
# it never detected the categorical columns. Check the dtypes, and use the
# CPU 'hist' method rather than the deprecated, CUDA-only 'gpu_hist'.
has_categorical = not X_train.select_dtypes(include='category').columns.empty
model = xgb.XGBRegressor(enable_categorical=True, tree_method='hist' if has_categorical else 'auto')
# NOTE: the test split is only logged here; no early stopping is configured.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
test['xgb_pred'] = model.predict(X_test)
score_xgb = np.sqrt(mean_squared_error(test['Consump1'], test['xgb_pred']))
score_naive = naive_forecast(train, test)
naive_list.append(score_naive)
xgb_list.append(score_xgb)
df_rmse = pd.DataFrame({
    'XGB_rmse': xgb_list,
    'Naive_rmse': naive_list
})
df_rmse['smaller'] = df_rmse.apply(lambda row: 'xgb' if row['XGB_rmse'] < row['Naive_rmse'] else 'naive', axis=1)
# Positive difference means the model beats the naive baseline.
df_rmse['Difference'] = df_rmse['Naive_rmse'] - df_rmse['XGB_rmse']
# df_rmse.to_excel('/Users/leichen/Desktop/WasteWater/results/rmse_comparison_Cat.xlsx', index=False)
df_rmse

[0]	validation_0-rmse:9.86033	validation_1-rmse:9.52471
[99]	validation_0-rmse:1.08062	validation_1-rmse:4.47275


Unnamed: 0,XGB_rmse,Naive_rmse,smaller,Difference
0,4.472752,5.159748,xgb,0.686996


##### XGB (native categorical support) with Distance as a numeric feature and DetailedCat and Analyte as categorical features

In [94]:
# Initialize empty lists to store the values
xgb_list = []
naive_list = []
# features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Distance','DetailedCat']
features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Distance','DetailedCat','Analyte']
target = 'Consump1'
pct = 0.8

# data_input = data_input[data_input['Facility']!='Facility_4']

data_input1 = data_input.copy()
train, test = split_train_test(data_input1, pct)
X_train, y_train, X_test, y_test = x_y(data_input1, pct, features, target)
# `'category' in X_train.dtypes` tested the column names, not the dtypes, so
# it never detected the categorical columns. Check the dtypes, and use the
# CPU 'hist' method rather than the deprecated, CUDA-only 'gpu_hist'.
has_categorical = not X_train.select_dtypes(include='category').columns.empty
model = xgb.XGBRegressor(enable_categorical=True, tree_method='hist' if has_categorical else 'auto')
# NOTE: the test split is only logged here; no early stopping is configured.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
test['xgb_pred'] = model.predict(X_test)
score_xgb = np.sqrt(mean_squared_error(test['Consump1'], test['xgb_pred']))
score_naive = naive_forecast(train, test)
naive_list.append(score_naive)
xgb_list.append(score_xgb)
df_rmse = pd.DataFrame({
    'XGB_rmse': xgb_list,
    'Naive_rmse': naive_list
})
df_rmse['smaller'] = df_rmse.apply(lambda row: 'xgb' if row['XGB_rmse'] < row['Naive_rmse'] else 'naive', axis=1)
# Positive difference means the model beats the naive baseline.
df_rmse['Difference'] = df_rmse['Naive_rmse'] - df_rmse['XGB_rmse']
# df_rmse.to_excel('/Users/leichen/Desktop/WasteWater/results/rmse_comparison_Cat.xlsx', index=False)
df_rmse

[0]	validation_0-rmse:9.82153	validation_1-rmse:9.50522
[99]	validation_0-rmse:0.29051	validation_1-rmse:4.05391


Unnamed: 0,XGB_rmse,Naive_rmse,smaller,Difference
0,4.053905,5.159748,xgb,1.105842


##### XGB method With Facility and DetailedCat as categorical features

In [103]:
# Initialize empty lists to store the values
xgb_list = []
naive_list = []
# features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Facility','DetailedCat']
features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Facility','DetailedCat','Analyte']
target = 'Consump1'
pct = 0.8

# data_input = data_input[data_input['Facility']!='Facility_4']
# Facility must be a pandas category for XGBoost's native categorical support.
data_input['Facility'] = data_input['Facility'].astype('category')
data_input1 = data_input.copy()
train, test = split_train_test(data_input1, pct)
X_train, y_train, X_test, y_test = x_y(data_input1, pct, features, target)
# `'category' in X_train.dtypes` tested the column names, not the dtypes, so
# it never detected the categorical columns. Check the dtypes, and use the
# CPU 'hist' method rather than the deprecated, CUDA-only 'gpu_hist'.
has_categorical = not X_train.select_dtypes(include='category').columns.empty
model = xgb.XGBRegressor(enable_categorical=True, tree_method='hist' if has_categorical else 'auto')
# NOTE: the test split is only logged here; no early stopping is configured.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
test['xgb_pred'] = model.predict(X_test)
score_xgb = np.sqrt(mean_squared_error(test['Consump1'], test['xgb_pred']))
score_naive = naive_forecast(train, test)
naive_list.append(score_naive)
xgb_list.append(score_xgb)
df_rmse = pd.DataFrame({
    'XGB_rmse': xgb_list,
    'Naive_rmse': naive_list
})
df_rmse['smaller'] = df_rmse.apply(lambda row: 'xgb' if row['XGB_rmse'] < row['Naive_rmse'] else 'naive', axis=1)
# Positive difference means the model beats the naive baseline.
df_rmse['Difference'] = df_rmse['Naive_rmse'] - df_rmse['XGB_rmse']
# df_rmse.to_excel('/Users/leichen/Desktop/WasteWater/results/rmse_comparison_Cat.xlsx', index=False)
df_rmse

[0]	validation_0-rmse:9.73194	validation_1-rmse:9.55615
[99]	validation_0-rmse:0.24612	validation_1-rmse:3.50550


Unnamed: 0,XGB_rmse,Naive_rmse,smaller,Difference
0,3.5055,5.159748,xgb,1.654248


## LightGBM

##### LightGBM with DetailedCat and Analyte as categorical features

In [102]:
# Earlier variant, kept for reference:
# categorical_features = ['DetailedCat']
# features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','DetailedCat']

# --- Settings -----------------------------------------------------------------
target = 'Consump1'
pct = 0.8
categorical_features = ['DetailedCat','Analyte']
features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','DetailedCat','Analyte']
Analyte_list = []

# --- Split --------------------------------------------------------------------
# data_input['Facility'] = data_input['Facility'].astype('category')
data_input1 = data_input.copy()
train, test = split_train_test(data_input1, pct)
X_train, y_train, X_test, y_test = x_y(data_input1, pct, features, target)

# --- LightGBM datasets with the categorical columns declared -------------------
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
test_data = lgb.Dataset(
    X_test,
    label=y_test,
    reference=train_data,
    categorical_feature=categorical_features,
)

# Regression objective scored by RMSE, gradient-boosted trees.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'verbose': -1,
}

# Stop after 10 rounds without improvement on the validation set; log every 10.
model_lgb = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(10)],
)

# --- Score against the naive baseline -------------------------------------------
test['xgb_pred'] = model_lgb.predict(X_test)
score_xgb = np.sqrt(mean_squared_error(test['Consump1'], test['xgb_pred']))
score_naive = naive_forecast(train, test)
xgb_list = [score_xgb]
naive_list = [score_naive]
df_rmse = pd.DataFrame({
    'XGB_rmse': xgb_list,
    'Naive_rmse': naive_list
})
df_rmse['smaller'] = [
    'xgb' if model_rmse < naive_rmse else 'naive'
    for model_rmse, naive_rmse in zip(df_rmse['XGB_rmse'], df_rmse['Naive_rmse'])
]
df_rmse['Difference'] = df_rmse['Naive_rmse'] - df_rmse['XGB_rmse']
df_rmse

# # Evaluate the model
# rmse = root_mean_squared_error(y_test, y_pred, squared=False)
# print(f"Root Mean Squared Error: {rmse}")

Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 6.51479	valid_1's rmse: 5.5178
[20]	training's rmse: 4.99804	valid_1's rmse: 3.88937
[30]	training's rmse: 4.63214	valid_1's rmse: 3.78603
Early stopping, best iteration is:
[25]	training's rmse: 4.76169	valid_1's rmse: 3.76329


Unnamed: 0,XGB_rmse,Naive_rmse,smaller,Difference
0,3.763289,5.159748,xgb,1.396459


##### LightGBM Facility & DetailedCat as features

In [105]:
categorical_features = ['Facility','DetailedCat']
features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Facility','DetailedCat']
# categorical_features = ['Facility','DetailedCat','Analyte']
# features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Facility','DetailedCat','Analyte']
# Initialize empty lists to store the values
xgb_list = []
naive_list = []
Analyte_list = []


target = 'Consump1'
pct = 0.8

# Facility is used as a categorical feature below, so cast it here instead of
# relying on another cell having done it (a no-op when it is already a
# category).
data_input['Facility'] = data_input['Facility'].astype('category')
data_input1 = data_input.copy()
train, test = split_train_test(data_input1, pct)
X_train, y_train, X_test, y_test = x_y(data_input1, pct, features, target)
# Convert data to LightGBM Dataset format and specify categorical features
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, categorical_feature=categorical_features)

# Define parameters for LightGBM
params = {
    'objective': 'regression',  # Use 'regression' for predicting continuous values
    'metric': 'rmse',           # Use root mean squared error for evaluation
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'verbose': -1,
}

# NOTE: the test split doubles as the early-stopping set, so the RMSE reported
# below is optimistic compared with a truly held-out set.
model_lgb = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(10)]
)

test['xgb_pred'] = model_lgb.predict(X_test)
score_xgb = np.sqrt(mean_squared_error(test['Consump1'], test['xgb_pred']))
score_naive = naive_forecast(train, test)
naive_list.append(score_naive)
xgb_list.append(score_xgb)
df_rmse = pd.DataFrame({
    'XGB_rmse': xgb_list,
    'Naive_rmse': naive_list
})
df_rmse['smaller'] = df_rmse.apply(lambda row: 'xgb' if row['XGB_rmse'] < row['Naive_rmse'] else 'naive', axis=1)
# Positive difference means the model beats the naive baseline.
df_rmse['Difference'] = df_rmse['Naive_rmse'] - df_rmse['XGB_rmse']
df_rmse

# # Evaluate the model
# rmse = root_mean_squared_error(y_test, y_pred, squared=False)
# print(f"Root Mean Squared Error: {rmse}")

Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 6.51479	valid_1's rmse: 5.5178
[20]	training's rmse: 4.87278	valid_1's rmse: 3.81983
[30]	training's rmse: 4.43335	valid_1's rmse: 3.53355
[40]	training's rmse: 4.23491	valid_1's rmse: 3.46769
[50]	training's rmse: 4.10081	valid_1's rmse: 3.45095
[60]	training's rmse: 3.98453	valid_1's rmse: 3.49308
Early stopping, best iteration is:
[53]	training's rmse: 4.07319	valid_1's rmse: 3.42535


Unnamed: 0,XGB_rmse,Naive_rmse,smaller,Difference
0,3.425354,5.159748,xgb,1.734394


##### LightGBM with Distance (numeric) and DetailedCat and Analyte (categorical)

In [101]:
# categorical_features = ['DetailedCat']
categorical_features = ['DetailedCat','Analyte']
# Initialize empty lists to store the values
xgb_list = []
naive_list = []
Analyte_list = []
# features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Distance','DetailedCat']
features = ['month', 'quarter', 'weekofyear','lag_1','lag_2','Distance','DetailedCat','Analyte']
target = 'Consump1'
pct = 0.8

data_input['Facility'] = data_input['Facility'].astype('category')
data_input1=data_input.copy()
train, test = split_train_test(data_input1,pct)
X_train,y_train, X_test, y_test = x_y(data_input1,pct,features,target)
# Convert data to LightGBM Dataset format and specify categorical features
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, categorical_feature=categorical_features)

# Define parameters for LightGBM
params = {
    'objective': 'regression',  # Use 'regression' for predicting continuous values
    'metric': 'rmse',           # Use root mean squared error for evaluation
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'verbose': -1,
}

model_lgb = lgb.train(
params,
train_data,
valid_sets=[train_data, test_data],
callbacks=[early_stopping(stopping_rounds=10), log_evaluation(10)]
)

test['xgb_pred'] = model_lgb.predict(X_test)
score_xgb = np.sqrt(mean_squared_error(test['Consump1'], test['xgb_pred']))
score_naive = naive_forecast(train, test)
naive_list.append(score_naive)
xgb_list.append(score_xgb)
df_rmse = pd.DataFrame({
    'XGB_rmse': xgb_list,
    'Naive_rmse': naive_list
})
df_rmse['smaller'] = df_rmse.apply(lambda row: 'xgb' if row['XGB_rmse'] < row['Naive_rmse'] else 'naive', axis=1)
df_rmse['Difference'] = df_rmse['Naive_rmse']-df_rmse['XGB_rmse']
df_rmse

# # Evaluate the model
# rmse = root_mean_squared_error(y_test, y_pred, squared=False)
# print(f"Root Mean Squared Error: {rmse}")

Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 6.42971	valid_1's rmse: 5.54192
[20]	training's rmse: 4.8333	valid_1's rmse: 3.81498
[30]	training's rmse: 4.36723	valid_1's rmse: 3.54509
[40]	training's rmse: 4.183	valid_1's rmse: 3.57063
Early stopping, best iteration is:
[31]	training's rmse: 4.34326	valid_1's rmse: 3.51961


Unnamed: 0,XGB_rmse,Naive_rmse,smaller,Difference
0,3.519609,5.159748,xgb,1.640139
