In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')

In [3]:
train

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Small,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,Small,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [4]:
cols_to_check = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

print("--- Class Consistency Check ---")
for col in cols_to_check:
    train_classes = set(train[col].unique())
    test_classes = set(test[col].unique())

    train_only = train_classes - test_classes
    test_only = test_classes - train_classes

    if len(train_only) == 0 and len(test_only) == 0:
        print(f" {col}: Perfect Match")
    else:
        print(f" {col} MISMATCH:")
        if train_only: print(f"   - In Train only: {train_only}")
        if test_only:  print(f"   - In Test only:  {test_only}")

--- Class Consistency Check ---
 Item_Fat_Content: Perfect Match
 Item_Type: Perfect Match
 Outlet_Size: Perfect Match
 Outlet_Location_Type: Perfect Match
 Outlet_Type: Perfect Match


In [5]:
train['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [6]:
test['Item_Fat_Content'].unique()

array(['Low Fat', 'reg', 'Regular', 'LF', 'low fat'], dtype=object)

In [7]:
train['source'] = 'train'
test['source'] = 'test'
data = pd.concat([train, test], ignore_index=True)

data['Item_Identifier'].value_counts()

Unnamed: 0_level_0,count
Item_Identifier,Unnamed: 1_level_1
FDX13,10
NCU29,10
FDP21,10
NCT53,10
NCS29,10
...,...
FDM50,7
NCL42,7
FDM10,7
NCW54,7


In [8]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular'
})


data['Item_Category'] = data['Item_Identifier'].apply(lambda x: x[0:2])
data['Item_Category'] = data['Item_Category'].map({
    'FD': 'Food',
    'NC': 'Non-Consumable',
    'DR': 'Drinks'
})


data.loc[data['Item_Category'] == 'Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'


data['Outlet_Years'] = 2013 - data['Outlet_Establishment_Year']



var_mod = ['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Size', 'Item_Category', 'Outlet_Type', 'Outlet_Identifier']
data = pd.get_dummies(data, columns=var_mod)


data.drop(['Item_Type', 'Outlet_Establishment_Year'], axis=1, inplace=True)

train_final = data.loc[data['source'] == 'train'].drop(['source', 'Item_Identifier'], axis=1)
test_final = data.loc[data['source'] == 'test'].drop(['source', 'Item_Outlet_Sales', 'Item_Identifier'], axis=1)

print(f"Final Train Shape: {train_final.shape}")
print(f"Final Test Shape: {test_final.shape}")
print("\nColumns available for Model:")
print(train_final.columns.tolist())

Final Train Shape: (8523, 31)
Final Test Shape: (5681, 30)

Columns available for Model:
['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales', 'Outlet_Years', 'Item_Fat_Content_Low Fat', 'Item_Fat_Content_Non-Edible', 'Item_Fat_Content_Regular', 'Outlet_Location_Type_Tier 1', 'Outlet_Location_Type_Tier 2', 'Outlet_Location_Type_Tier 3', 'Outlet_Size_High', 'Outlet_Size_Medium', 'Outlet_Size_Small', 'Item_Category_Drinks', 'Item_Category_Food', 'Item_Category_Non-Consumable', 'Outlet_Type_Grocery Store', 'Outlet_Type_Supermarket Type1', 'Outlet_Type_Supermarket Type2', 'Outlet_Type_Supermarket Type3', 'Outlet_Identifier_OUT010', 'Outlet_Identifier_OUT013', 'Outlet_Identifier_OUT017', 'Outlet_Identifier_OUT018', 'Outlet_Identifier_OUT019', 'Outlet_Identifier_OUT027', 'Outlet_Identifier_OUT035', 'Outlet_Identifier_OUT045', 'Outlet_Identifier_OUT046', 'Outlet_Identifier_OUT049']


In [9]:
train_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8523 entries, 0 to 8522
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Item_Weight                    8523 non-null   float64
 1   Item_Visibility                8523 non-null   float64
 2   Item_MRP                       8523 non-null   float64
 3   Item_Outlet_Sales              8523 non-null   float64
 4   Outlet_Years                   8523 non-null   int64  
 5   Item_Fat_Content_Low Fat       8523 non-null   bool   
 6   Item_Fat_Content_Non-Edible    8523 non-null   bool   
 7   Item_Fat_Content_Regular       8523 non-null   bool   
 8   Outlet_Location_Type_Tier 1    8523 non-null   bool   
 9   Outlet_Location_Type_Tier 2    8523 non-null   bool   
 10  Outlet_Location_Type_Tier 3    8523 non-null   bool   
 11  Outlet_Size_High               8523 non-null   bool   
 12  Outlet_Size_Medium             8523 non-null   bool  

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

X = train_final.drop('Item_Outlet_Sales', axis=1)
y = train_final['Item_Outlet_Sales']

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []
fold_no = 1

print(f"Starting 5-Fold Cross-Validation...\n")

for train_index, val_index in kf.split(X):
    # Split data
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]


    model = XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=42,
        early_stopping_rounds=200
    )

    # Train
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=0
    )

    # Predict & Evaluate
    preds = model.predict(X_val_fold)
    rmse = np.sqrt(mean_squared_error(y_val_fold, preds))

    print(f"Fold {fold_no}: RMSE = {rmse:.4f}  (Stopped at iteration {model.best_iteration})")
    cv_scores.append(rmse)
    fold_no += 1

mean_score = np.mean(cv_scores)
std_score = np.std(cv_scores)

print(f"\n------------------------------------------------")
print(f"Average RMSE: {mean_score:.4f}")
print(f"Standard Deviation: {std_score:.4f}")
print(f"------------------------------------------------")

Starting 5-Fold Cross-Validation...

Fold 1: RMSE = 1024.8716  (Stopped at iteration 71)
Fold 2: RMSE = 1076.5180  (Stopped at iteration 72)
Fold 3: RMSE = 1067.8386  (Stopped at iteration 83)
Fold 4: RMSE = 1120.4865  (Stopped at iteration 97)
Fold 5: RMSE = 1119.4620  (Stopped at iteration 77)

------------------------------------------------
Average RMSE: 1081.8353
Standard Deviation: 35.7176
------------------------------------------------


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

param_grid = {
    'n_estimators': [80, 100, 250],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5]
}

xgb_base = XGBRegressor(n_jobs=-1, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("Starting Hyperparameter Tuning (this may take a few minutes)...")

random_search.fit(X, y)

best_params = random_search.best_params_
best_score = -random_search.best_score_

print(f"\n Best Validation RMSE: {best_score:.4f}")
print("\nBest Parameters found:")
for param, value in best_params.items():
    print(f"  - {param}: {value}")

Starting Hyperparameter Tuning (this may take a few minutes)...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

 Best Validation RMSE: 1080.1680

Best Parameters found:
  - subsample: 0.8
  - n_estimators: 80
  - min_child_weight: 5
  - max_depth: 4
  - learning_rate: 0.05
  - colsample_bytree: 0.8


In [12]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

final_model = XGBRegressor(
    n_estimators=95,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    enable_categorical=True,
    early_stopping_rounds=50
)

print("Training Final Model on Full Data...")
final_model.fit(X, y,
                eval_set=[(X, y)],
                verbose=10)

train_preds = final_model.predict(X)

# RMSE
mse = mean_squared_error(y, train_preds)
rmse = np.sqrt(mse)

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y, train_preds)

# MAPE (Mean Absolute Percentage Error)
mape = mean_absolute_percentage_error(y, train_preds)

print("\n--- Final Model Training Performance ---")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"MAPE: {mape:.4f} (Avg error is about {mape*100:.2f}%)")

Training Final Model on Full Data...
[0]	validation_0-rmse:1655.87943
[10]	validation_0-rmse:1340.38814
[20]	validation_0-rmse:1177.06047
[30]	validation_0-rmse:1108.68362
[40]	validation_0-rmse:1075.37118
[50]	validation_0-rmse:1058.41690
[60]	validation_0-rmse:1047.84265
[70]	validation_0-rmse:1037.80046
[80]	validation_0-rmse:1028.61672
[90]	validation_0-rmse:1021.98878
[94]	validation_0-rmse:1019.41893

--- Final Model Training Performance ---
RMSE: 1019.4189
MAE:  718.0297
MAPE: 0.5410 (Avg error is about 54.10%)


In [13]:
test_predictions = final_model.predict(test_final)

submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})


submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].apply(lambda x: 0 if x < 0 else x)

submission_filename = 'BigMart_Submission_XGB_Final.csv'
submission.to_csv(submission_filename, index=False)

print(f" Success! Submission file '{submission_filename}' created.")
print("\nFirst 5 rows of submission:")
print(submission.head())

 Success! Submission file 'BigMart_Submission_XGB_Final.csv' created.

First 5 rows of submission:
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1613.369751
1           FDW14            OUT017        1381.852905
2           NCN55            OUT010         652.737793
3           FDQ58            OUT017        2490.801270
4           FDY38            OUT027        6199.764648


In [14]:
submission.to_csv('BigMart_Submission_XGB_Final.csv', index=False)

In [15]:
!pip install catboost



In [16]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

X = train_final.drop('Item_Outlet_Sales', axis=1)
y = train_final['Item_Outlet_Sales']

X_test = test_final[X.columns]

print(f"Training on {X.shape[0]} rows and {X.shape[1]} features.")

# ==========================================
# MODEL 1: XGBoost
# ==========================================
print("\n1. Training XGBoost...")
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42
)
xgb.fit(X, y)
xgb_pred = xgb.predict(X_test)
print("   XGBoost training complete.")

# ==========================================
# MODEL 2: LightGBM
# ==========================================
print("\n2. Training LightGBM...")
lgbm = LGBMRegressor(
    n_estimators=70,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,
    random_state=42,
    verbose=-1
)
lgbm.fit(X, y)
lgbm_pred = lgbm.predict(X_test)
print("   LightGBM training complete.")

# ==========================================
# MODEL 3: CatBoost
# ==========================================
print("\n3. Training CatBoost...")
cat = CatBoostRegressor(
    n_estimators=100,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    verbose=0,
    allow_writing_files=False
)
cat.fit(X, y)
cat_pred = cat.predict(X_test)
print("   CatBoost training complete.")


w_xgb = 0.40
w_lgbm = 0.30
w_cat = 0.30

ensemble_pred = (w_xgb * xgb_pred) + (w_lgbm * lgbm_pred) + (w_cat * cat_pred)

print(f"\nEnsemble Weights: XGB={w_xgb}, LGBM={w_lgbm}, CAT={w_cat}")


submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': ensemble_pred
})


submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].apply(lambda x: 0 if x < 0 else x)

filename = 'BigMart_Ensemble_Submission.csv'
submission.to_csv(filename, index=False)

print(f"\n SUCCESS! File saved as '{filename}'")
print(submission.head())

Training on 8523 rows and 30 features.

1. Training XGBoost...
   XGBoost training complete.

2. Training LightGBM...
   LightGBM training complete.

3. Training CatBoost...
   CatBoost training complete.

Ensemble Weights: XGB=0.4, LGBM=0.3, CAT=0.3

 SUCCESS! File saved as 'BigMart_Ensemble_Submission.csv'
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1657.774469
1           FDW14            OUT017        1391.398777
2           NCN55            OUT010         625.671026
3           FDQ58            OUT017        2492.616669
4           FDY38            OUT027        6050.936049


In [17]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

X = train_final.drop('Item_Outlet_Sales', axis=1)
y = train_final['Item_Outlet_Sales']
X_test = test_final[X.columns]

print("Initializing CatBoost...")
cat = CatBoostRegressor(
    n_estimators=100,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    verbose=10,
    allow_writing_files=False
)

print("Training started...")
cat.fit(X, y)

cat_preds = cat.predict(X_test)

submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': cat_preds
})

submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].apply(lambda x: 0 if x < 0 else x)

filename = 'BigMart_Submission_CatBoost_100.csv'
submission.to_csv(filename, index=False)

print(f"\n SUCCESS! File saved as '{filename}'")
print(submission.head())

Initializing CatBoost...
Training started...
0:	learn: 1658.8454058	total: 2.79ms	remaining: 276ms
10:	learn: 1333.7458622	total: 29ms	remaining: 235ms
20:	learn: 1182.3328058	total: 55.7ms	remaining: 209ms
30:	learn: 1118.1411566	total: 107ms	remaining: 238ms
40:	learn: 1089.5770700	total: 185ms	remaining: 267ms
50:	learn: 1076.6538427	total: 267ms	remaining: 256ms
60:	learn: 1070.1675901	total: 331ms	remaining: 212ms
70:	learn: 1064.9044143	total: 429ms	remaining: 175ms
80:	learn: 1061.1166718	total: 520ms	remaining: 122ms
90:	learn: 1057.7393547	total: 633ms	remaining: 62.6ms
99:	learn: 1055.2245035	total: 706ms	remaining: 0us

 SUCCESS! File saved as 'BigMart_Submission_CatBoost_100.csv'
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1707.339984
1           FDW14            OUT017        1428.118488
2           NCN55            OUT010         644.111440
3           FDQ58            OUT017        2596.391336
4           FDY38       

In [18]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor

X = train_final.drop('Item_Outlet_Sales', axis=1)
y = train_final['Item_Outlet_Sales']

X_test = test_final[X.columns]

print("Initializing LightGBM...")
lgbm = LGBMRegressor(
    n_estimators=70,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,
    random_state=42,
    verbose=-1
)

print("Training started...")
lgbm.fit(X, y)

lgbm_preds = lgbm.predict(X_test)

submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': lgbm_preds
})

submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].apply(lambda x: 0 if x < 0 else x)

filename = 'BigMart_Submission_LGBM_100.csv'
submission.to_csv(filename, index=False)

print(f"\n SUCCESS! File saved as '{filename}'")
print(submission.head())

Initializing LightGBM...
Training started...

 SUCCESS! File saved as 'BigMart_Submission_LGBM_100.csv'
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1680.445845
1           FDW14            OUT017        1366.424677
2           NCN55            OUT010         571.233731
3           FDQ58            OUT017        2387.313193
4           FDY38            OUT027        5996.026843


In [19]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

final_preds_series = pd.Series(index=test_final.index, dtype=float)

size_segments = ['Outlet_Size_High', 'Outlet_Size_Medium', 'Outlet_Size_Small']

for segment_col in size_segments:
    print(f"\n--- Processing Segment: {segment_col} ---")

    train_mask = train_final[segment_col] == 1
    X_segment_train = train_final.loc[train_mask].drop('Item_Outlet_Sales', axis=1)
    y_segment_train = train_final.loc[train_mask, 'Item_Outlet_Sales']

    test_mask = test_final[segment_col] == 1
    X_segment_test = test_final.loc[test_mask]

    if len(X_segment_train) == 0:
        print(f"Skipping {segment_col}: No training data found.")
        continue

    print(f"Training on {len(X_segment_train)} rows...")

    model = XGBRegressor(
        n_estimators=50,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=42
    )

    model.fit(X_segment_train, y_segment_train)

    if len(X_segment_test) > 0:
        preds = model.predict(X_segment_test)

        final_preds_series.loc[test_mask] = preds
        print(f"Predicted for {len(X_segment_test)} test rows.")

if final_preds_series.isna().sum() > 0:
    print(f"\n Found {final_preds_series.isna().sum()} rows with no Outlet_Size. Training fallback model...")

    global_model = XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-1, random_state=42)
    X_full = train_final.drop('Item_Outlet_Sales', axis=1)
    y_full = train_final['Item_Outlet_Sales']
    global_model.fit(X_full, y_full)

    missing_mask = final_preds_series.isna()
    X_missing = test_final.loc[missing_mask]
    fallback_preds = global_model.predict(X_missing)
    final_preds_series.loc[missing_mask] = fallback_preds

submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': final_preds_series.values
})

submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].apply(lambda x: 0 if x < 0 else x)

filename = 'BigMart_Submission_Segmented_Size.csv'
submission.to_csv(filename, index=False)

print(f"\n SUCCESS! Segmented submission saved as '{filename}'")
print(submission.head())


--- Processing Segment: Outlet_Size_High ---
Training on 932 rows...
Predicted for 621 test rows.

--- Processing Segment: Outlet_Size_Medium ---
Training on 2793 rows...
Predicted for 1862 test rows.

--- Processing Segment: Outlet_Size_Small ---
Training on 4798 rows...
Predicted for 3198 test rows.

 SUCCESS! Segmented submission saved as 'BigMart_Submission_Segmented_Size.csv'
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1697.109131
1           FDW14            OUT017        1526.368408
2           NCN55            OUT010         681.101685
3           FDQ58            OUT017        2412.045166
4           FDY38            OUT027        5978.749512


In [20]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

final_preds_series = pd.Series(index=test_final.index, dtype=float)


type_segments = [
    'Outlet_Type_Grocery Store',
    'Outlet_Type_Supermarket Type1',
    'Outlet_Type_Supermarket Type2',
    'Outlet_Type_Supermarket Type3'
]

for segment_col in type_segments:
    print(f"\n--- Processing Segment: {segment_col} ---")

    train_mask = train_final[segment_col] == 1
    X_segment_train = train_final.loc[train_mask].drop('Item_Outlet_Sales', axis=1)
    y_segment_train = train_final.loc[train_mask, 'Item_Outlet_Sales']

    test_mask = test_final[segment_col] == 1
    X_segment_test = test_final.loc[test_mask]

    if len(X_segment_train) == 0:
        continue

    print(f"Training on {len(X_segment_train)} rows (Test rows: {len(X_segment_test)})...")

    model = XGBRegressor(
        n_estimators=55,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=42
    )

    model.fit(X_segment_train, y_segment_train)

    if len(X_segment_test) > 0:
        preds = model.predict(X_segment_test)
        final_preds_series.loc[test_mask] = preds

if final_preds_series.isna().sum() > 0:
    print(f"\n Filling {final_preds_series.isna().sum()} missing rows with Global Average...")
    final_preds_series.fillna(train_final['Item_Outlet_Sales'].mean(), inplace=True)

submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': final_preds_series.values
})

submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].apply(lambda x: 0 if x < 0 else x)

filename = 'BigMart_Submission_Segmented_OutletType.csv'
submission.to_csv(filename, index=False)

print(f"\n SUCCESS! Segmented submission saved as '{filename}'")
print(submission.head())


--- Processing Segment: Outlet_Type_Grocery Store ---
Training on 1083 rows (Test rows: 722)...

--- Processing Segment: Outlet_Type_Supermarket Type1 ---
Training on 5577 rows (Test rows: 3717)...

--- Processing Segment: Outlet_Type_Supermarket Type2 ---
Training on 928 rows (Test rows: 618)...

--- Processing Segment: Outlet_Type_Supermarket Type3 ---
Training on 935 rows (Test rows: 624)...

 SUCCESS! Segmented submission saved as 'BigMart_Submission_Segmented_OutletType.csv'
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1675.234253
1           FDW14            OUT017        1433.177490
2           NCN55            OUT010         638.495056
3           FDQ58            OUT017        2371.996094
4           FDY38            OUT027        6198.966797
