In [2]:
import pandas as pd


In [3]:
df = pd.read_csv(r"E:\DATA SCIENCE\Datasets\Coffee Sales\Coffe_sales.csv")

In [4]:
df.drop_duplicates(inplace=True)

In [5]:
selected_columns = [
    'hour_of_day', 'money', 'Weekday', 'Month_name',
    'Time_of_Day', 'cash_type', 'coffee_name'
]
df_selected = df[selected_columns].copy()


In [6]:
df_selected.head()

Unnamed: 0,hour_of_day,money,Weekday,Month_name,Time_of_Day,cash_type,coffee_name
0,10,38.7,Fri,Mar,Morning,card,Latte
1,12,38.7,Fri,Mar,Afternoon,card,Hot Chocolate
2,12,38.7,Fri,Mar,Afternoon,card,Hot Chocolate
3,13,28.9,Fri,Mar,Afternoon,card,Americano
4,13,38.7,Fri,Mar,Afternoon,card,Latte


In [7]:
# Features and target
x = df_selected.drop("money", axis=1)
y = df_selected["money"]

In [8]:
num_features = ["hour_of_day"]   # only numeric input feature
ohe_features = ["Weekday", "Month_name", "coffee_name"]
ordinal_features = ["Time_of_Day"]

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor



In [10]:
time_categories = [["Morning", "Afternoon", "Evening","Night"]]

# Transformers
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_ohe = Pipeline(steps=[
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

categorical_ord = Pipeline(steps=[
    ("ordinal", OrdinalEncoder(categories=time_categories))
])

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("ohe", categorical_ohe, ohe_features),
        ("ord", categorical_ord, ordinal_features)
    ]
)

In [12]:
# print(preprocessor)

In [13]:

# X_transformed = preprocessor.fit_transform(X)

# # Get feature names after transformation
# feature_names = preprocessor.get_feature_names_out()

# # Convert to DataFrame
# X_transformed_df = pd.DataFrame(X_transformed.toarray() if hasattr(X_transformed, "toarray") else X_transformed,
#                                 columns=feature_names)

# # Add target
# X_transformed_df["money"] = y.values

# # Correlation
# correlation = X_transformed_df.corr()["money"].sort_values(ascending=False)
# print(correlation)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [15]:
rf_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])


In [16]:
# Gradient Boosting
gb_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(n_estimators=100, random_state=42))
])


In [17]:
# Decision Tree
dt_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", DecisionTreeRegressor(random_state=42))
])


In [18]:
# AdaBoost
ab_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", AdaBoostRegressor(n_estimators=100, random_state=42))
])


In [19]:
# XGBoost
xgb_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(n_estimators=100, random_state=42))
])


In [20]:
# LightGBM (faster boosting, needs lightgbm installed)
lgbm_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LGBMRegressor(n_estimators=100, random_state=42))
])


In [21]:
rf_model.fit(x_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('ohe', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Morning', 'Afternoon', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def regression_metrics(y_true, y_pred, X_test=None):
    """
    Prints R², Adjusted R² (if X_test provided), MAE, RMSE, and MAPE.
    """
    # R²
    r2 = r2_score(y_true, y_pred)
    
    # Adjusted R² (needs X_test for feature count)
    adj_r2 = None
    if X_test is not None:
        n = len(y_true)
        p = X_test.shape[1]
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    
    # MAE
    mae = mean_absolute_error(y_true, y_pred)
    
    # RMSE
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    
    # MAPE (handle zeros carefully)
    with np.errstate(divide='ignore', invalid='ignore'):
        mape = np.mean(np.abs((y_true - y_pred) / np.where(y_true == 0, 1, y_true))) * 100
    
    print(f"R² Score:       {r2:.4f}")
    if adj_r2 is not None:
        print(f"Adjusted R²:    {adj_r2:.4f}")
    print(f"MAE:            {mae:.4f}")
    print(f"RMSE:           {rmse:.4f}")
    print(f"MAPE (%):       {mape:.2f}")




In [23]:
# After training and predicting
y_pred = rf_model.predict(x_test)

regression_metrics(y_test, y_pred, x_test)


R² Score:       0.9780
Adjusted R²:    0.9778
MAE:            0.2630
RMSE:           0.7100
MAPE (%):       0.84


In [27]:
models = {
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "AdaBoost": ab_model,
    "XGBoost": xgb_model,
    "LightGBM": lgbm_model
}

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)  
    print(f"{name} R² Score: {r2_score(y_test, y_pred):.4f}")


Decision Tree R² Score: 0.9635
Random Forest R² Score: 0.9780
Gradient Boosting R² Score: 0.9761
AdaBoost R² Score: 0.8361
XGBoost R² Score: 0.9775
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 2837, number of used features: 29
[LightGBM] [Info] Start training from score 31.608903
LightGBM R² Score: 0.9773




In [30]:
xgb_model.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('ohe', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Morning', 'Afternoon', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
xgb_pred = xgb_model.predict(x_train)

regression_metrics(y_test, xgb_pred, x_test)

ValueError: Found input variables with inconsistent numbers of samples: [710, 2837]