In [44]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,root_mean_squared_log_error
from sklearn.model_selection import train_test_split
import numpy as np

import pandas as pd
import os

In [45]:
# Load the cleaned data set.
# The working directory is switched to the project folder first, so the
# relative file names used from here on resolve against it.
project_dir = "D:/Neeraj_Dixit/Projects/Calorie_pred/"
os.chdir(project_dir)
df = pd.read_csv("Cleaned_data.csv")


In [46]:
# Features are the first eight columns (positions 0-7); the target is the last column.
feature_positions = list(range(8))
X = df.iloc[:, feature_positions]
# A list indexer is used so that y stays a one-column DataFrame rather than a Series.
y = df.iloc[:, [-1]]

In [47]:
# To avoid negative predictions, the target could be log-transformed before fitting (sketch below; not currently used):

#y_log = np.log1p(y)  # log(1 + y)
#model.fit(X, y_log)

# Then invert the transformation after prediction
#y_pred_log = model.predict(X_test)
#y_pred = np.expm1(y_pred_log)  # exp(y) - 1

In [48]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [49]:
def evaluate_model(y_test,y_pred):
    """Score predictions against the true values.

    Parameters:
        y_test: true target values.
        y_pred: predicted target values.

    Returns:
        A tuple ``(mae, mse, rmse, r_sq, rmsle)``: mean absolute error,
        mean squared error, its square root, the R2 score and the root
        mean squared log error.
    """
    abs_error = mean_absolute_error(y_test, y_pred)
    sq_error = mean_squared_error(y_test, y_pred)
    return (
        abs_error,
        sq_error,
        np.sqrt(sq_error),  # RMSE derived from the MSE computed above
        r2_score(y_test, y_pred),
        root_mean_squared_log_error(y_test, y_pred),
    )

In [60]:
# Regressors to train and evaluate, keyed by display name.
models = {
    "Xgboost_regressor":XGBRegressor()
}


def _print_metrics(title, mae, mse, rmse, r2, rmsle):
    """Print one block of evaluation metrics under the given heading.

    The arguments after ``title`` are the five values returned by
    evaluate_model, in the same order.
    """
    print(title)
    print("- Mean absolute error: {:.4f}".format(mae))
    print("- Mean squared error: {:.4f}".format(mse))
    print("- Root mean squared error: {:.4f}".format(rmse))
    print("- R2 Score: {:.4f}".format(r2))
    # The value comes from root_mean_squared_log_error, so label it as a root metric.
    print("- Root mean squared log error: {:.4f}".format(rmsle))
    print("----------------------------------------")


# Iterate over (name, estimator) pairs directly; after the loop `model` is
# still bound to the last fitted estimator, which the prediction cell relies on.
for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Raw predictions on both splits
    y_train_prd=model.predict(X_train)
    y_test_prd=model.predict(X_test)
    # Clip predictions to a floor of 0.9 so that no negative value reaches
    # the log-based metric in evaluate_model
    y_train_pred=np.maximum(y_train_prd,0.9)
    y_test_pred=np.maximum(y_test_prd,0.9)
    # Evaluate train and test sets
    train_mae,train_mse,train_rmse,train_r2,train_rmsle=evaluate_model(y_train,y_train_pred)
    test_mae,test_mse,test_rmse,test_r2,test_rmsle=evaluate_model(y_test,y_test_pred)

    # Print the name of the model being reported, not the whole key list
    print(model_name)

    _print_metrics("Model Performance for training Set",
                   train_mae,train_mse,train_rmse,train_r2,train_rmsle)
    _print_metrics("Model Performance for Test Set",
                   test_mae,test_mse,test_rmse,test_r2,test_rmsle)

['Xgboost_regressor']
Model Performance for training Set
- Mean absolute error: 2.3041
- Mean squared error: 12.6350
- Root mean squared error: 3.5546
- R2 Score: 0.9968
- Mean squared log error: 0.0620
----------------------------------------
Model Performance for Test Set
- Mean absolute error: 2.3592
- Mean squared error: 14.6073
- Root mean squared error: 3.8220
- R2 Score: 0.9962
- Mean squared log error: 0.0651
----------------------------------------


In [62]:
# Score the held-out submission data with the fitted model and write the result.
df2 = pd.read_csv("testing_data.csv")
df3 = pd.read_csv("id_file.csv")

# Predict, then apply the same 0.9 floor used during evaluation.
y_test_pred = np.maximum(model.predict(df2), 0.9)

# Put the predictions next to the id columns and save without the index.
test_file = pd.DataFrame(y_test_pred)
final = pd.concat([df3, test_file], axis=1)
final.to_csv("prediction2.csv", index=False)

In [63]:
# Hyperparameter search space for the XGBoost regressor
xgboost_params = dict(
    learning_rate=[0.1, 0.01],
    max_depth=[5, 8, 12, 20, 30],
    n_estimators=[100, 200, 300],
    colsample_bytree=[0.5, 0.8, 1, 0.3, 0.4],
)

In [64]:
# Candidates for the randomized search: (display name, estimator, parameter grid)
randomcv_model = [
    ("XGboost", XGBRegressor(), xgboost_params),
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Best hyperparameters found for each candidate, keyed by candidate name.
model_param = {}

# randomcv_model holds (name, estimator, param_grid) tuples.
# The loop variable is called `estimator` so that it does not rebind the
# fitted `model` created by the training cell.
for name, estimator, params in randomcv_model:
    random = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=params,
        n_iter=100,
        verbose=2,
        n_jobs=-1,
        cv=5,
        # The target is continuous, so a regression scorer is required;
        # 'accuracy' is a classification metric and cannot score a regressor.
        scoring='neg_root_mean_squared_error',
        # Fixed seed so the sampled parameter combinations are repeatable.
        random_state=42,
    )
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

# Display best hyperparameters for each model
for model_name, best_params in model_param.items():
    print(f"-------------------------- Best params for model {model_name} ----------------------------")
    print(best_params)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
