In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import warnings
import os

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

# --- 1. Data Loading (Kaggle Environment) ---

# Input locations inside the Kaggle competition mount.
BASE_PATH = '/kaggle/input/playground-series-s5e6/'
TRAIN_PATH = os.path.join(BASE_PATH, 'train.csv')
TEST_PATH = os.path.join(BASE_PATH, 'test.csv')
SUBMISSION_PATH = os.path.join(BASE_PATH, 'sample_submission.csv')

print("Loading data...")
try:
    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
    sample_submission_df = pd.read_csv(SUBMISSION_PATH)
except FileNotFoundError:
    # Not running on Kaggle (or the dataset is not attached): this is a
    # recoverable situation, so report it as a warning rather than an error
    # and read the files from the current working directory instead.
    print(f"Warning: CSV files not found in '{BASE_PATH}'; falling back to the current directory.")
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    # Bind sample_submission_df on this path too, so the name exists no matter
    # which branch ran. The file is optional locally; None marks "not available".
    sample_submission_df = (
        pd.read_csv('sample_submission.csv')
        if os.path.exists('sample_submission.csv')
        else None
    )


print("Data loaded successfully.")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Pull the label column and both id columns out before the feature frames
# are stripped down to predictors only.
y = train_df['Fertilizer Name']
train_ids, test_ids = train_df['id'], test_df['id']

# Keep only predictor columns: the train frame loses its id and label,
# the test frame loses its id.
train_df = train_df.drop(['id', 'Fertilizer Name'], axis=1)
test_df = test_df.drop('id', axis=1)


# --- 2. Advanced Feature Engineering ---
print("\n--- Starting Advanced Feature Engineering ---")

# Stack train on top of test (fresh 0..n-1 index) so every derived column is
# computed the same way for both; the frames are split apart again later.
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Short aliases for the raw columns the derived features read.
# NOTE: 'Temparature' is the column name as it appears in the data - do not "correct" it.
nitrogen = combined_df['Nitrogen']
phosphorous = combined_df['Phosphorous']
potassium = combined_df['Potassium']
temperature = combined_df['Temparature']
humidity = combined_df['Humidity']
moisture = combined_df['Moisture']

# Tiny offset added to each denominator so a zero nutrient value cannot divide by zero.
ratio_eps = 1e-6

# Saturation vapour pressure from temperature, then the humidity-scaled share
# of it; their difference is the vapour pressure deficit (VPD).
# NOTE(review): constants presumably assume temperature in deg C and humidity in percent - confirm.
saturation_vp = 0.6108 * np.exp((17.27 * temperature) / (temperature + 237.3))
actual_vp = (humidity / 100) * saturation_vp

combined_df = combined_df.assign(
    # a) Nutrient ratios and total
    N_P_Ratio=nitrogen / (phosphorous + ratio_eps),
    N_K_Ratio=nitrogen / (potassium + ratio_eps),
    P_K_Ratio=phosphorous / (potassium + ratio_eps),
    Total_Nutrients=nitrogen + phosphorous + potassium,
    # b) Climate interaction
    VPD=saturation_vp - actual_vp,
    # c) Soil-climate interactions
    Moisture_Temp_Interaction=moisture * temperature,
    Humidity_Moisture_Interaction=humidity * moisture,
)

print("New features created successfully.")


# --- 3. Preprocessing (Handling Categorical Features) ---
print("\n--- Starting Preprocessing ---")

# String-valued columns to expand; note the names contain spaces.
categorical_features = ['Soil Type', 'Crop Type']
print(f"Applying one-hot encoding to: {categorical_features}")

# Replace each categorical column with indicator columns, dropping the first
# level of every category.
combined_df = pd.get_dummies(combined_df, columns=categorical_features, drop_first=True)

# The combined frame was built as train rows followed by test rows, so the
# train row count is the split point.
n_train_rows = len(train_df)
X_processed, X_test_processed = combined_df.iloc[:n_train_rows], combined_df.iloc[n_train_rows:]

print("New training data shape after feature engineering:", X_processed.shape)


# Map the 'Fertilizer Name' labels to integer codes; the fitted encoder is
# kept so predictions can be mapped back to names.
target_encoder = LabelEncoder()
target_encoder.fit(y)
y_encoded = target_encoder.transform(y)
print("Preprocessing complete.")


# --- 4. Model Training (LightGBM) ---
print("\n--- Starting Model Training ---")

# Hyperparameters gathered in one place so they are easy to read and tweak.
lgbm_params = dict(
    objective='multiclass',
    random_state=42,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
)
lgbm = lgb.LGBMClassifier(**lgbm_params)

# Fit on every training row; no hold-out split is made here.
lgbm.fit(X_processed, y_encoded)
print("Model training complete.")


# --- 5. Prediction and Submission File Generation ---
print("\n--- Generating Predictions ---")

# Per-class probabilities for every test row.
test_probabilities = lgbm.predict_proba(X_test_processed)

# argsort ranks ascending, so the three most probable classes are the last
# three columns; reversing them puts the most probable first (for MAP@3).
ascending_rank = np.argsort(test_probabilities, axis=1)
top_3_preds_indices = ascending_rank[:, -3:][:, ::-1]

# Decode the class indices back to fertilizer names. The encoder works on a
# 1-D array, so flatten first and restore the (rows, 3) shape afterwards.
decoded_flat = target_encoder.inverse_transform(top_3_preds_indices.ravel())
top_3_preds_labels = decoded_flat.reshape(top_3_preds_indices.shape)

# One space-separated string of three names per test row.
predictions_str = list(map(' '.join, top_3_preds_labels))

# Pair each test id with its prediction string.
submission_df = pd.DataFrame({'id': test_ids, 'Fertilizer Name': predictions_str})

# Written relative to the current working directory.
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully in /kaggle/working/.")
print("Submission file head:")
print(submission_df.head())
print("\nScript finished.")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Loading data...
Data loaded successfully.
Train shape: (750000, 10)
Test shape: (250000, 9)

--- Starting Advanced Feature Engineering ---
New features created successfully.

--- Starting Preprocessing ---
Applying one-hot encoding to: ['Soil Type', 'Crop Type']
New training data shape after feature engineering: (750000, 27)
Preprocessing complete.

--- Starting Model Training ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1823
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 27
[LightGBM] [Info] Start training from score -1.884866
[LightGBM] [Info] Start training from score -1.880057
[LightGBM] [Info] Start training from score -1.897538
[LightGBM] [Info] Start training from score -1.911544
[LightGBM] [Info] Start training from score -1