In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import warnings
import os

# Silence all warnings so the training log below stays readable.
# NOTE(review): this filter hides every warning category, including
# deprecation warnings raised by the libraries used in this notebook.
warnings.filterwarnings(action='ignore')

# --- 1. Data Loading & Initial Prep ---
print("Loading data...")
BASE_PATH = '/kaggle/input/playground-series-s5e6/'
# Both competition CSVs are read from the same input directory, train first.
train_df, test_df = (
    pd.read_csv(os.path.join(BASE_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)
print("Data loaded.")

# --- 2. Stacking Architecture Setup ---
# The raw train/test frames are kept as they are here; target encoding is
# computed inside the CV loop so each fold only sees its own training rows.

# Replace the string target with integer class ids (kept reversible through
# `target_encoder` so predictions can be mapped back to names later).
target_encoder = LabelEncoder()
train_df['Fertilizer Name'] = target_encoder.fit_transform(train_df['Fertilizer Name'])
N_CLASSES = target_encoder.classes_.size

# One probability column per class for every base model.
oof_shape = (len(train_df), N_CLASSES)    # Level 1 training data (OOF predictions)
test_shape = (len(test_df), N_CLASSES)    # Level 1 test data (fold-averaged predictions)

oof_lgbm, oof_xgb, oof_cat = (np.zeros(oof_shape) for _ in range(3))
test_lgbm, test_xgb, test_cat = (np.zeros(test_shape) for _ in range(3))


# --- 3. Level 0 Models with Target Encoding in CV Loop ---
print("\n--- Starting Level 0 Model Training ---")
N_SPLITS = 5 # Using 5 folds for robustness
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Categorical columns that are replaced by fold-wise target statistics.
te_cols = ['Soil Type', 'Crop Type']

# Hyper-parameters do not depend on the fold, so they are defined once.
# 'bagging_freq' is set because LightGBM only applies 'bagging_fraction'
# when the bagging frequency is non-zero; without it the 0.8 row subsample
# was silently ignored.
LGBM_PARAMS = {'objective': 'multiclass', 'metric': 'multi_logloss', 'n_estimators': 2000, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'num_leaves': 31, 'verbose': -1, 'seed': 42}
XGB_PARAMS = {'objective': 'multi:softprob', 'eval_metric': 'mlogloss', 'n_estimators': 2000, 'learning_rate': 0.01, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42, 'tree_method': 'hist', 'early_stopping_rounds': 100}
CAT_PARAMS = {'objective': 'MultiClass', 'eval_metric': 'MultiClass', 'iterations': 2000, 'learning_rate': 0.01, 'depth': 6, 'random_seed': 42, 'verbose': 0, 'early_stopping_rounds': 100}


def create_features(df):
    """Return a copy of ``df`` with the engineered numeric features appended.

    Added columns (in this order): ``N_P_Ratio``, ``N_K_Ratio``, ``P_K_Ratio``,
    ``Total_Nutrients`` and ``VPD``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Nitrogen', 'Phosphorous', 'Potassium', 'Temparature'
        and 'Humidity' (column names spelled exactly as used in this notebook).

    Returns
    -------
    pandas.DataFrame
        New frame holding the original columns plus the five features above.
    """
    nitrogen, phosphorous, potassium = df['Nitrogen'], df['Phosphorous'], df['Potassium']
    temperature = df['Temparature']
    # Saturation vapour pressure from a Tetens-style formula, then the actual
    # vapour pressure scaled by relative humidity.
    # NOTE(review): assumes temperature in degrees C and humidity in percent -- confirm.
    es = 0.6108 * np.exp((17.27 * temperature) / (temperature + 237.3))
    ea = (df['Humidity'] / 100) * es
    return df.assign(
        # The 1e-6 offset guards the ratios against division by zero.
        N_P_Ratio=nitrogen / (phosphorous + 1e-6),
        N_K_Ratio=nitrogen / (potassium + 1e-6),
        P_K_Ratio=phosphorous / (potassium + 1e-6),
        Total_Nutrients=nitrogen + phosphorous + potassium,
        VPD=es - ea,
    )


# Using our best 3 models for the base layer
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['Fertilizer Name'])):
    print(f"===== Fold {fold+1} =====")

    # --- Create data for this fold ---
    # Explicit copies: columns are added below, and writing to an .iloc slice
    # of train_df would be a write to a possible view of the original frame.
    train_fold_df = train_df.iloc[train_idx].copy()
    val_fold_df = train_df.iloc[val_idx].copy()
    test_fold_df = test_df.copy() # Use a copy for each fold's processing

    # --- Target Encoding ---
    # Every statistic, including the fallback for unseen categories, comes
    # from the training part of the fold only, so no validation target leaks
    # into the features.
    # NOTE(review): the target is the integer id produced by LabelEncoder, so
    # this mean is a statistic over class ids rather than a class probability;
    # per-class frequencies may be a more meaningful encoding -- confirm intent.
    fold_target_mean = train_fold_df['Fertilizer Name'].mean()
    for col in te_cols:
        # Mapping from category to the mean encoded target
        target_mean = train_fold_df.groupby(col)['Fertilizer Name'].mean()
        train_fold_df[col + '_te'] = train_fold_df[col].map(target_mean)
        # Assign the filled Series back instead of calling
        # fillna(inplace=True) on a column selection: that chained form does
        # not reliably write through to the frame and never does under
        # pandas Copy-on-Write.
        val_fold_df[col + '_te'] = val_fold_df[col].map(target_mean).fillna(fold_target_mean)
        test_fold_df[col + '_te'] = test_fold_df[col].map(target_mean).fillna(fold_target_mean)

    # --- Feature Engineering ---
    train_fold_df = create_features(train_fold_df)
    val_fold_df = create_features(val_fold_df)
    test_fold_df = create_features(test_fold_df)

    # --- Define Features & Final Prep ---
    # Raw categoricals are dropped because their encoded '_te' versions are used.
    features = [col for col in train_fold_df.columns if col not in ['id', 'Fertilizer Name', 'Soil Type', 'Crop Type']]
    X_train, y_train = train_fold_df[features], train_fold_df['Fertilizer Name']
    X_val, y_val = val_fold_df[features], val_fold_df['Fertilizer Name']
    X_test = test_fold_df[features]

    # --- Train Models ---
    # NOTE(review): the validation fold drives early stopping and also receives
    # the OOF predictions, so OOF quality is slightly optimistic.
    # Test predictions are averaged over folds by adding 1/N_SPLITS of each.
    print("Training LGBM...")
    lgbm = lgb.LGBMClassifier(**LGBM_PARAMS).fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
    oof_lgbm[val_idx] = lgbm.predict_proba(X_val)
    test_lgbm += lgbm.predict_proba(X_test) / N_SPLITS

    print("Training XGB...")
    xgb_m = xgb.XGBClassifier(**XGB_PARAMS).fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    oof_xgb[val_idx] = xgb_m.predict_proba(X_val)
    test_xgb += xgb_m.predict_proba(X_test) / N_SPLITS

    print("Training CatBoost...")
    cat = ctb.CatBoostClassifier(**CAT_PARAMS).fit(X_train, y_train, eval_set=[(X_val, y_val)])
    oof_cat[val_idx] = cat.predict_proba(X_val)
    test_cat += cat.predict_proba(X_test) / N_SPLITS

# --- 4. Level 1 Meta-Model ---
print("\n--- Training Level 1 Meta-Model ---")
# Level 1 inputs: the three base models' class probabilities placed side by
# side (LGBM, XGB, CatBoost), in the same column order for train and test.
meta_train_features = np.concatenate([oof_lgbm, oof_xgb, oof_cat], axis=1)
meta_test_features = np.concatenate([test_lgbm, test_xgb, test_cat], axis=1)

# An L2-regularised logistic regression acts as the stacker.
meta_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
).fit(meta_train_features, train_df['Fertilizer Name'])
print("Meta-Model training complete.")

# --- 5. Final Prediction ---
print("\n--- Generating Final Predictions from Stacked Ensemble ---")
final_preds_proba = meta_model.predict_proba(meta_test_features)

# argsort is ascending, so reverse each row and keep the three most likely classes.
ranked_classes = np.argsort(final_preds_proba, axis=1)
top_3_indices = ranked_classes[:, ::-1][:, :3]
# Map class ids back to fertilizer names, preserving the (rows, 3) layout.
top_3_labels = target_encoder.inverse_transform(top_3_indices.ravel()).reshape(top_3_indices.shape)
# One space-separated string of three names per test row.
predictions = [' '.join(row_labels) for row_labels in top_3_labels]

submission_df = pd.DataFrame({'id': test_df['id'], 'Fertilizer Name': predictions})
submission_df.to_csv('submission_stacking.csv', index=False)
print("\nSubmission file 'submission_stacking.csv' created.")
print(submission_df.head())

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Loading data...
Data loaded.

--- Starting Level 0 Model Training ---
===== Fold 1 =====
Training LGBM...
Training XGB...
Training CatBoost...
===== Fold 2 =====
Training LGBM...
Training XGB...
Training CatBoost...
===== Fold 3 =====
Training LGBM...
Training XGB...
Training CatBoost...
===== Fold 4 =====
Training LGBM...
Training XGB...
Training CatBoost...
===== Fold 5 =====
Training LGBM...
Training XGB...
Training CatBoost...

--- Training Level 1 Meta-Model ---
Meta-Model training complete.

--- Generating Final Predictions from Stacked Ensemble ---

Submission file 'submission_stacking.csv' created.
       id             Fertilizer Name
0  750000          DAP 14-35-14 28-28
1  750001     17-17-17 20-20 10-26-26
2  750002     20-20 10-26-26 14-35-14
3  750003  14-35-14 17-17-17 10-26-26
4  750004     20-20 10-26-26 17-17-17
