<a href="https://colab.research.google.com/github/mmilannaik/BigOCheatSheet/blob/master/Big_Mart_ABB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, GroupKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# EDA

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                4705 non-null   float64
 2   Item_Fat_Content           5681 non-null   object 
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                4075 non-null   object 
 9   Outlet_Location_Type       5681 non-null   object 
 10  Outlet_Type                5681 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB


In [5]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## Data Wrangling

In [6]:
train['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [7]:
test['Item_Fat_Content'].unique()

array(['Low Fat', 'reg', 'Regular', 'LF', 'low fat'], dtype=object)

In [8]:
train['Item_Visibility'].unique()

array([0.0160473 , 0.01927822, 0.01676007, ..., 0.03518627, 0.14522065,
       0.04487828])

In [9]:
len(train['Item_Identifier'].unique())

1559

In [10]:
train[train['Item_Weight'].isna()].sample(4)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
7425,FDB26,,Regular,0.031116,Canned,55.264,OUT027,1985,Medium,Tier 3,Supermarket Type3,1544.656
2687,NCL53,,Low Fat,0.0,Health and Hygiene,175.4028,OUT027,1985,Medium,Tier 3,Supermarket Type3,6729.9064
7747,FDE56,,Regular,0.158425,Fruits and Vegetables,62.4194,OUT027,1985,Medium,Tier 3,Supermarket Type3,2291.0178
6398,NCW18,,Low Fat,0.059038,Household,237.7248,OUT027,1985,Medium,Tier 3,Supermarket Type3,2607.2728


In [11]:
train[train['Item_Visibility'].isna()]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales


In [12]:
train['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [13]:
train['Outlet_Type'].unique()

array(['Supermarket Type1', 'Supermarket Type2', 'Grocery Store',
       'Supermarket Type3'], dtype=object)

In [14]:
train.groupby('Outlet_Type')['Outlet_Size'].agg(lambda x:x.mode())

Unnamed: 0_level_0,Outlet_Size
Outlet_Type,Unnamed: 1_level_1
Grocery Store,Small
Supermarket Type1,Small
Supermarket Type2,Medium
Supermarket Type3,Medium


In [15]:
train['Outlet_Size'].value_counts().idxmax()

'Medium'

In [16]:
test.groupby('Outlet_Type')['Outlet_Size'].agg(lambda x:x.mode())

Unnamed: 0_level_0,Outlet_Size
Outlet_Type,Unnamed: 1_level_1
Grocery Store,Small
Supermarket Type1,Small
Supermarket Type2,Medium
Supermarket Type3,Medium


In [17]:
train['Outlet_Establishment_Year'].unique()

array([1999, 2009, 1998, 1987, 1985, 2002, 2007, 1997, 2004])

In [18]:
test['Outlet_Establishment_Year'].unique()

array([1999, 2007, 1998, 1985, 1997, 2009, 2002, 1987, 2004])

In [19]:
test['Item_Identifier'].str[:2].unique()

array(['FD', 'NC', 'DR'], dtype=object)

In [20]:
len(train['Item_MRP'].unique())

5938

In [21]:
len(test['Item_MRP'].unique())

4402

In [22]:
train['Item']

KeyError: 'Item'

In [23]:

# Combine (key allows easy split back)
if test is not None:
    combined = pd.concat({'train': train, 'test': test}, axis=0, sort=False).reset_index(level=0).rename(columns={'level_0':'_split'})
else:
    combined = train.copy()
    combined['_split'] = 'train'

combined.head(3)

Unnamed: 0,_split,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,train,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,train,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,train,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27


## Utils

In [24]:
def normalize_fat(x):
  mapping = {'LF':'Low Fat','low fat':'Low Fat','Low Fat':'Low Fat','reg':'Regular',
            'Regular':'Regular'}
  return x.replace(mapping)

def clean_data(df):

  if 'Item_Visibility' in df.columns:
    vis = df['Item_Visibility'].replace(0,np.nan)
    med_per_item = vis.groupby(df['Item_Identifier']).transform('median')
    df['Item_Visibility'] = np.where(df['Item_Visibility']==0,med_per_item,df['Item_Visibility'])
    # For remaining nans
    df['Item_Visibility'] = df['Item_Visibility'].fillna(df['Item_Visibility'].median())

  if 'Item_Weight' in df.columns:
    mean_per_item = df.groupby('Item_Identifier')['Item_Weight'].transform('mean')
    df['Item_Weight'] = df['Item_Weight'].fillna(mean_per_item)
    df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].median())

  if 'Outlet_Size' in df.columns and 'Outlet_Type' in df.columns:
    mode_map = df.groupby('Outlet_Type')['Outlet_Size'].apply(lambda s:s.value_counts().idxmax())
    # Fallback
    global_mode = df['Outlet_Size'].value_counts().idxmax()
    df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Type'].map(mode_map)).fillna(global_mode)

  # Outlet_Age
  if 'Outlet_Establishment_Year' in df.columns:
      df['Outlet_Age'] = 2013 - df['Outlet_Establishment_Year']

  # Item_Category
  if 'Item_Identifier' in df.columns:
      df['Item_Category'] = df['Item_Identifier'].astype(str).str[:2]

  # Visibility Mean Ratio per item
  if {'Item_Visibility','Item_Identifier'}.issubset(df.columns):
    item_mean = df.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
    df['Visibility_MeanRatio'] = df['Item_Visibility']/ item_mean
    df['Visibility_MeanRatio'] = df['Visibility_MeanRatio'].replace([np.inf,-np.inf],np.nan).fillna(1.0)

  if 'Item_MRP' in df.columns:
    try:
      df['MRP_bin'] = pd.qcut(df['Item_MRP'],4,labels = ['Low','Medium','High','Very High'])
    except:
      df['MRP_bin'] = 'Medium'

  # Promotion proxy
  if 'Item_Visibility' in df.columns:
    thr = df['Item_Visibility'].quantile(0.9)
    df['High_Visibility_Flag'] = (df['Item_Visibility']>thr).astype(int)

  #  Interaction : MRP X Outlet_Type
  if {'Item_MRP','Outlet_Type'}.issubset(df.columns):
    outlet_type_codes = df['Outlet_Type'].astype('category').cat.codes
    df['MRP_OutletType'] = df['Item_MRP'] * outlet_type_codes

  # Outlet clustering : encoding three columns
  if {'Outlet_Size','Outlet_Type','Outlet_Location_Type'}.issubset(df.columns):
    tmp = df[['Outlet_Size','Outlet_Type','Outlet_Location_Type']].astype(str).copy()
    for c in tmp.columns:
      le = LabelEncoder()
      tmp[c] = le.fit_transform(tmp[c])
    kmeans = KMeans(n_clusters = 4,random_state = 42,n_init=10)
  return df

clean_df = clean_data(combined)



In [25]:
clean_df.shape

(14204, 19)

In [26]:

if 'Item_Outlet_Sales' not in clean_df.columns:
    raise KeyError("Target 'Item_Outlet_Sales' must be in train.")

train_fe = clean_df[clean_df['_split']=='train'].drop(columns=['_split']).copy()
test_fe  = clean_df[clean_df['_split']=='test'].drop(columns=['_split']).copy() if 'test' in clean_df['_split'].values else None

print(train_fe.shape, getattr(test_fe, 'shape', None))

(8523, 18) (5681, 18)


In [27]:

def oof_target_encode(train_df, test_df, col, target, n_splits=5, alpha=10, random_state=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    te_train = pd.Series(index=train_df.index, dtype=float)
    global_mean = train_df[target].mean()

    for tr_idx, va_idx in kf.split(train_df):
        tr, va = train_df.iloc[tr_idx], train_df.iloc[va_idx]
        stats = tr.groupby(col)[target].agg(['mean','count'])
        stats['te'] = (stats['count']*stats['mean'] + alpha*global_mean) / (stats['count'] + alpha)
        mapping = stats['te']
        te_train.iloc[va_idx] = train_df.iloc[va_idx][col].map(mapping).fillna(global_mean)

    full_stats = train_df.groupby(col)[target].agg(['mean','count'])
    full_stats['te'] = (full_stats['count']*full_stats['mean'] + alpha*global_mean) / (full_stats['count'] + alpha)
    full_mapping = full_stats['te']

    te_test = None
    if test_df is not None:
        te_test = test_df[col].map(full_mapping).fillna(global_mean)

    return te_train.astype(float), (te_test.astype(float) if te_test is not None else None)

target_col = 'Item_Outlet_Sales'
te_cols = ['Outlet_Type','Item_Type','Outlet_Identifier','Item_Category']
for c in te_cols:
    if c in train_fe.columns:
        tr_te, te_te = oof_target_encode(train_fe, test_fe, c, target_col, n_splits=5, alpha=10)
        train_fe[c+'_TE'] = tr_te
        if test_fe is not None:
            test_fe[c+'_TE'] = te_te

print("OOF TE added:", [c+'_TE' for c in te_cols if c in train_fe.columns])

OOF TE added: ['Outlet_Type_TE', 'Item_Type_TE', 'Outlet_Identifier_TE', 'Item_Category_TE']


In [28]:

TARGET = 'Item_Outlet_Sales'
y = np.log1p(train_fe[TARGET].values)

X = train_fe.drop(columns=[TARGET])
X_test = test_fe.copy() if test_fe is not None else None

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric:", len(num_cols), "| Categorical:", len(cat_cols))

numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

groups = train_fe['Outlet_Identifier'].values if 'Outlet_Identifier' in train_fe.columns else None


Numeric: 12 | Categorical: 9


In [30]:

def cv_rmse(model, X, y, preprocess, groups=None, n_splits=5, random_state=42):
    if groups is not None:
        kf = GroupKFold(n_splits=n_splits)
        splits = kf.split(X, y, groups)
    else:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        splits = kf.split(X, y)
    scores = []
    for tr, va in splits:
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y[tr], y[va]
        pipe = Pipeline([('prep', preprocess), ('model', model)])
        pipe.fit(X_tr, y_tr)
        y_hat = pipe.predict(X_va)
        rmse = mean_squared_error(y_va, y_hat)
        scores.append(rmse)
    return float(np.mean(scores)), float(np.std(scores))

models = {
    "LinearRegression": LinearRegression(),
    "RidgeCV": RidgeCV(alphas=np.logspace(-3,3,20)),
    "RandomForest": RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(n_estimators=800, learning_rate=0.05, max_depth=6,
                             subsample=0.8, colsample_bytree=0.8, random_state=42, tree_method="hist"),
    "LightGBM": LGBMRegressor(n_estimators=1000, learning_rate=0.05, subsample=0.8,
                               colsample_bytree=0.8, random_state=42, n_jobs=-1)
}

results = {}
for name, model in models.items():
    mean_rmse, std_rmse = cv_rmse(model, X, y, preprocess, groups=groups, n_splits=5)
    results[name] = (mean_rmse, std_rmse)
    print(f"{name:15s} | CV RMSE (log-target): {mean_rmse:.4f} ± {std_rmse:.4f}")

LinearRegression | CV RMSE (log-target): 0.4671 ± 0.1297
RidgeCV         | CV RMSE (log-target): 0.3641 ± 0.0782
RandomForest    | CV RMSE (log-target): 0.4025 ± 0.1195
XGBoost         | CV RMSE (log-target): 0.3433 ± 0.0615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 7060, number of used features: 57
[LightGBM] [Info] Start training from score 7.331234
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1520
[LightGBM] [Info] Number of data points in the train set: 7036, number of used features: 57
[LightGBM] [Info] 

In [32]:
best_name = min(results, key=lambda k: results[k][0])
print("Best model:", best_name, "->", results[best_name])

best_model = models[best_name]
final_pipe = Pipeline([('prep', preprocess), ('model', best_model)])
final_pipe.fit(X, y)

# Back-transform to original scale for a sense of fit quality (not generalization!)
train_pred_log = final_pipe.predict(X)
train_pred = np.expm1(train_pred_log)
true = train_fe[TARGET].values
train_rmse = mean_squared_error(true, train_pred)
print(f"Train RMSE (original scale): {train_rmse:.2f}")

if X_test is not None:
    test_pred_log = final_pipe.predict(X_test)
    test_pred = np.expm1(test_pred_log)
    sub_cols = [c for c in ['Item_Identifier','Outlet_Identifier'] if c in X_test.columns]
    submission = pd.DataFrame({**{c: X_test[c].values for c in sub_cols}, TARGET: test_pred})
    submission.to_csv('submission.csv', index=False)
    print("Saved submission.csv")
else:
    print("Test not provided; skipping submission.")

Best model: XGBoost -> (0.34329832807839594, 0.06148430956171311)
Train RMSE (original scale): 921154.31
Saved submission.csv


# Tuning

In [33]:
# --- Hyperparameter tuning for XGBoost with GroupKFold ---

from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings

# Safety checks (assumes you already have X, y, preprocess, groups, X_test, test, TARGET)
assert 'preprocess' in globals(), "preprocess (ColumnTransformer) not found"
assert 'X' in globals() and 'y' in globals(), "X/y not found"
assert 'groups' in globals(), "groups not found (use Outlet_Identifier)"
assert 'TARGET' in globals(), "TARGET not found"

In [34]:
# Try to use neg_root_mean_squared_error; fallback if unsupported
scorer = 'neg_root_mean_squared_error'
try:
    from sklearn.metrics import get_scorer
    _ = get_scorer(scorer)
except Exception:
    warnings.warn("neg_root_mean_squared_error not available; using neg_mean_squared_error")
    scorer = 'neg_mean_squared_error'

xgb_base = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    tree_method='hist'  # fast + stable
)

pipe = Pipeline([
    ('prep', preprocess),
    ('model', xgb_base)
])

param_dist = {
    'model__n_estimators':      [600, 800, 1000, 1200, 1500],
    'model__learning_rate':     [0.02, 0.03, 0.05, 0.07, 0.1],
    'model__max_depth':         [3, 4, 5, 6, 7, 8],
    'model__min_child_weight':  [1, 2, 3, 5, 7, 10],
    'model__subsample':         [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree':  [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__reg_alpha':         [0.0, 1e-3, 1e-2, 1e-1, 1.0],
    'model__reg_lambda':        [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    'model__gamma':             [0.0, 0.1, 0.3, 0.5]
}

cv = GroupKFold(n_splits=5)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=50,                 # increase to 100+ if you want to search deeper
    scoring=scorer,
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42,
    refit=True                 # refit on full data using the best params
)

search.fit(X, y, **({'groups': groups} if groups is not None else {}))

best_pipe = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_

# Convert score to RMSE (positive) for readability
if scorer == 'neg_root_mean_squared_error':
    cv_rmse = -best_score
else:
    cv_rmse = np.sqrt(-best_score)

print("\n=== XGBoost best params ===")
for k, v in best_params.items():
    print(f"{k}: {v}")
print(f"CV RMSE (log-target): {cv_rmse:.4f}")

# Fit on all training data (already done by refit=True, but we can be explicit)
best_pipe.fit(X, y)

# Train fit quality on original scale (informational only)
train_pred_log = best_pipe.predict(X)
train_pred = np.expm1(train_pred_log)
true = np.expm1(y)  # if y was log1p already; else use train[TARGET].values
mse_train = mean_squared_error(true, train_pred)
train_rmse = np.sqrt(mse_train)
print(f"Train RMSE (original scale): {train_rmse:.2f}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits

=== XGBoost best params ===
model__subsample: 0.9
model__reg_lambda: 10.0
model__reg_alpha: 0.0
model__n_estimators: 800
model__min_child_weight: 3
model__max_depth: 6
model__learning_rate: 0.1
model__gamma: 0.5
model__colsample_bytree: 0.6
CV RMSE (log-target): 0.5493
Train RMSE (original scale): 1010.45


In [35]:
# Predict test and save submission if you have test
if 'X_test' in globals() and X_test is not None and 'test' in globals():
    test_pred_log = best_pipe.predict(X_test)
    test_pred = np.expm1(test_pred_log)

    id_cols = [c for c in ['Item_Identifier','Outlet_Identifier'] if c in X_test.columns]
    submission = __import__('pandas').DataFrame({**{c: X_test[c].values for c in id_cols},
                                                 TARGET: test_pred})
    submission.to_csv('submission_updated.csv', index=False)
    print("Saved submission.csv")

Saved submission.csv


# Optuna

In [36]:
!pip -q install optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m358.4/400.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/247.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.4/247.4 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [40]:
# ---- Optuna hyperparameter tuning for XGBoost (GroupKFold, log-RMSE) ----
# Assumes X, y, preprocess, groups, X_test, test, TARGET are already defined.

import optuna
from optuna.samplers import TPESampler
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

def cv_rmse_logtarget(model, X, y, preprocess, groups, n_splits=5):
    kf = GroupKFold(n_splits=n_splits)
    rmses = []
    for tr_idx, va_idx in kf.split(X, y, groups):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        pipe = Pipeline([('prep', preprocess), ('model', model)])
        pipe.fit(X_tr, y_tr)
        y_hat = pipe.predict(X_va)
        rmse = np.sqrt(mean_squared_error(y_va, y_hat))
        rmses.append(rmse)
    return float(np.mean(rmses)), float(np.std(rmses))

def objective(trial: optuna.Trial):
    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'random_state': 42,
        'n_jobs': -1,

        # SEARCH SPACE (lower bounds > 0 where log=True)
        'n_estimators':     trial.suggest_int('n_estimators', 600, 1800),
        'learning_rate':    trial.suggest_float('learning_rate', 0.02, 0.12, log=True),
        'max_depth':        trial.suggest_int('max_depth', 4, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 8),
        'subsample':        trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma':            trial.suggest_float('gamma', 0.0, 0.6),           # not log
        'reg_alpha':        trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),  # FIXED (>0)
        'reg_lambda':       trial.suggest_float('reg_lambda', 1e-3, 50.0, log=True) # >0
    }

    model = XGBRegressor(**params)
    mean_rmse, std_rmse = cv_rmse_logtarget(model, X, y, preprocess, groups, n_splits=5)
    trial.set_user_attr('cv_std', std_rmse)
    return mean_rmse   # Optuna minimizes

study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42),
                            study_name='xgb_bigmart_logrmse_fixed')
n_trials = 200  # increase if you can
print(f"Running Optuna with {n_trials} trials…")
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

print("\n=== Best trial ===")
print(f"Value (CV RMSE log-target): {study.best_value:.4f}")
for k, v in study.best_params.items():
    print(f"{k}: {v}")
print(f"CV RMSE std (best trial): {study.best_trial.user_attrs.get('cv_std'):.4f}")




[I 2025-08-31 14:38:15,519] A new study created in memory with name: xgb_bigmart_logrmse_fixed


Running Optuna with 200 trials…


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-08-31 14:38:44,295] Trial 0 finished with value: 0.6191216133504669 and parameters: {'n_estimators': 1049, 'learning_rate': 0.10985745201142037, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'gamma': 0.03485016730091967, 'reg_alpha': 0.08499808989182997, 'reg_lambda': 0.6677615511747083}. Best is trial 0 with value: 0.6191216133504669.
[I 2025-08-31 14:39:47,716] Trial 1 finished with value: 0.5909728168100898 and parameters: {'n_estimators': 1450, 'learning_rate': 0.020751421204777677, 'max_depth': 9, 'min_child_weight': 7, 'subsample': 0.6849356442713105, 'colsample_bytree': 0.6727299868828402, 'gamma': 0.11004270591206029, 'reg_alpha': 2.716051144654844e-06, 'reg_lambda': 0.2922905212920093}. Best is trial 1 with value: 0.5909728168100898.
[I 2025-08-31 14:40:12,187] Trial 2 finished with value: 0.5806021603182601 and parameters: {'n_estimators': 1118, 'learning_rate': 0.03370158413473581, 'max_depth': 7, 'min_

In [41]:
# ---- Train best pipeline on full data & export submission ----
best_model = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    **study.best_params
)

best_pipe = Pipeline([('prep', preprocess), ('model', best_model)])
best_pipe.fit(X, y)

# Train fit quality (original scale)
train_pred_log = best_pipe.predict(X)
train_pred = np.expm1(train_pred_log)
true = np.expm1(y)  # because y = log1p(target)
train_rmse = np.sqrt(mean_squared_error(true, train_pred))
print(f"\nTrain RMSE (original scale): {train_rmse:.2f}")

# Predict test and save
if 'X_test' in globals() and X_test is not None and 'test' in globals():
    test_pred_log = best_pipe.predict(X_test)
    test_pred = np.expm1(test_pred_log)
    id_cols = [c for c in ['Item_Identifier','Outlet_Identifier'] if c in X_test.columns]
    submission = __import__('pandas').DataFrame({**{c: X_test[c].values for c in id_cols},
                                                 TARGET: test_pred})
    submission.to_csv('submission_optuna_xgb.csv', index=False)
    print("Saved submission_optuna_xgb_2.csv")


Train RMSE (original scale): 1067.81
Saved submission_optuna_xgb_2.csv


# Ensemble

In [42]:
# ==== Blended Ensemble: XGBoost + LightGBM (GroupKFold OOF) ====
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# --- Safety checks: we expect these to exist from your previous steps
assert 'X' in globals() and 'y' in globals(), "Need X, y"
assert 'preprocess' in globals(), "Need preprocess (ColumnTransformer)"
assert 'groups' in globals(), "Need groups (e.g., Outlet_Identifier)"
assert 'TARGET' in globals(), "Need TARGET"
X_test_exists = 'X_test' in globals() and X_test is not None and 'test' in globals()

# --- Use your Optuna-best XGB params (from your message)
xgb_params = dict(
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    n_estimators=835,
    learning_rate=0.09607968068574532,
    max_depth=5,
    min_child_weight=8,
    subsample=0.9894332087880474,
    colsample_bytree=0.6087803462102492,
    gamma=0.597589098655773,
    reg_alpha=0.6901641485168402,
    reg_lambda=0.0016653679648540084,
)

# --- A strong, general LGBM baseline (you can tune later)
lgb_params = dict(
    n_estimators=1600,
    learning_rate=0.05,
    num_leaves=63,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.9,
    colsample_bytree=0.7,
    reg_alpha=0.3,
    reg_lambda=0.3,
    random_state=42,
    n_jobs=-1
)

pipe_xgb = Pipeline([('prep', preprocess), ('model', XGBRegressor(**xgb_params))])
pipe_lgb = Pipeline([('prep', preprocess), ('model', LGBMRegressor(**lgb_params))])

kf = GroupKFold(n_splits=5)
oof_xgb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))

print("Training OOF models for blending…")
for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y, groups), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    # Fit each model on train fold
    px = Pipeline([('prep', preprocess), ('model', XGBRegressor(**xgb_params))])
    pl = Pipeline([('prep', preprocess), ('model', LGBMRegressor(**lgb_params))])

    px.fit(X_tr, y_tr)
    pl.fit(X_tr, y_tr)

    oof_xgb[va_idx] = px.predict(X_va)
    oof_lgb[va_idx] = pl.predict(X_va)

    rmse_x = np.sqrt(mean_squared_error(y[va_idx], oof_xgb[va_idx]))
    rmse_l = np.sqrt(mean_squared_error(y[va_idx], oof_lgb[va_idx]))
    print(f"  Fold {fold}: XGB RMSE={rmse_x:.4f}  |  LGB RMSE={rmse_l:.4f}")

# Try blend weights and pick the best OOF RMSE (on log-target)
weights = [0.5, 0.55, 0.6, 0.65, 0.7]
best_w, best_rmse = None, 1e9
for w in weights:
    blend = w * oof_xgb + (1 - w) * oof_lgb
    rmse = np.sqrt(mean_squared_error(y, blend))
    print(f"Blend w={w:.2f}  ->  OOF RMSE (log-target): {rmse:.4f}")
    if rmse < best_rmse:
        best_rmse, best_w = rmse, w

print(f"\nBest blend weight: w_xgb={best_w:.2f}, w_lgb={(1-best_w):.2f} | OOF RMSE (log): {best_rmse:.4f}")

# ---- Refit both models on ALL training data
print("\nRefitting both models on all training data…")
final_xgb = Pipeline([('prep', preprocess), ('model', XGBRegressor(**xgb_params))])
final_lgb = Pipeline([('prep', preprocess), ('model', LGBMRegressor(**lgb_params))])

final_xgb.fit(X, y)
final_lgb.fit(X, y)

# Train fit (informational) — original scale
pred_train_log = best_w * final_xgb.predict(X) + (1 - best_w) * final_lgb.predict(X)
pred_train = np.expm1(pred_train_log)
true_train = np.expm1(y)
train_rmse = np.sqrt(mean_squared_error(true_train, pred_train))
print(f"Train RMSE (original scale, blended): {train_rmse:.2f}")

# ---- Predict test & save submission
if X_test_exists:
    pred_test_log = best_w * final_xgb.predict(X_test) + (1 - best_w) * final_lgb.predict(X_test)
    pred_test = np.expm1(pred_test_log)

    id_cols = [c for c in ['Item_Identifier', 'Outlet_Identifier'] if c in X_test.columns]
    submission = pd.DataFrame({**{c: X_test[c].values for c in id_cols},
                               TARGET: pred_test})
    submission.to_csv('submission_ensemble.csv', index=False)
    print("Saved submission_ensemble.csv")
else:
    print("No X_test/test detected — skipped submission write.")


Training OOF models for blending…
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 7060, number of used features: 57
[LightGBM] [Info] Start training from score 7.331234
  Fold 1: XGB RMSE=0.5585  |  LGB RMSE=0.6570
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1520
[LightGBM] [Info] Number of data points in the train set: 7036, number of used features: 57
[LightGBM] [Info] Start training from score 7.412878
  Fold 2: XGB RMSE=0.5954  |  LGB RMSE=0.6445
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000675 seconds.
You can set `force_row_w