# In [None]:  (notebook cell marker left over from export; kept as a comment so the file parses)
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------
# Global warning control
# ------------------------------------------
import warnings

# Keyword arguments for each "ignore" filter, registered in this order:
#   1. every FutureWarning, whichever module emits it
#   2. RuntimeWarning coming from modules whose name matches "pandas"
#   3. any warning coming from modules whose name matches "seaborn"
# NOTE: filter 1 is global, so FutureWarnings from every library are hidden.
_IGNORED_WARNINGS = (
    {"category": FutureWarning},
    {"category": RuntimeWarning, "module": "pandas"},
    {"module": "seaborn"},
)
for _filter_kwargs in _IGNORED_WARNINGS:
    warnings.filterwarnings("ignore", **_filter_kwargs)

# Silence numpy's handling of invalid floating-point operations.
np.seterr(invalid="ignore")

# Load the training and test tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/final-everything/train.csv")
test = pd.read_csv("/kaggle/input/final-everything/test.csv")

# Per-column missing-value counts. A bare expression is only displayed when
# it is the last statement of a notebook cell; in a script it is silently
# discarded, so the summaries are printed explicitly.
print("Missing values per column (train):")
print(train.isnull().sum())

print("Missing values per column (test):")
print(test.isnull().sum())

# Rows with a missing target label cannot be used for training.
train = train.dropna(subset=['fruit_name'])

# Keep the test identifiers for the submission files, then remove the
# identifier column so it is not fed to the model.
test_id = test['id']
test = test.drop(columns=['id'])

# Feature matrix and target. 'id' is removed from the training features as
# well (when the column exists) so that the columns the pipeline is fitted on
# match the columns available in `test` at prediction time.
X = train.drop(columns=['fruit_name']).drop(columns=['id'], errors='ignore')
y = train['fruit_name']

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Column groups, selected by dtype from the full feature matrix.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

# VISUALISATION
# STEP 1: HISTPLOT
# ==========================================
# One histogram (with a KDE overlay) per numeric feature; missing values are
# removed before plotting.
print("Step 1: Histplots")
for col in numeric_features:
    hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
    observed = X[col].dropna()
    sns.histplot(observed, kde=True, color='royalblue', ax=hist_ax)
    plt.show()

# ==========================================
# STEP 2: TARGET COUNTS
# ==========================================
# Show how many rows each class has, as a table and as a bar chart.
print("\nStep 2: Target Counts")
class_counts = y.value_counts()
print(class_counts)
sns.countplot(x=y)
plt.show()

# ==========================================
# STEP 3: BOXPLOT (Before)
# ==========================================
# One horizontal boxplot per numeric feature, drawn from the raw feature
# matrix before any outlier handling.
print("\nStep 3: Boxplots (Before)")
for col in numeric_features:
    box_fig, box_ax = plt.subplots(figsize=(8, 2))
    sns.boxplot(x=X[col], color='tomato', ax=box_ax)
    plt.show()
    
# ==========================================
# STEP 4: OUTLIER & INF HANDLING (ROBUST VERSION)
# ==========================================
print("\nStep 4: Handling Outliers & Converting Infinity to NaN")

# Work on explicit copies of the split frames so the column assignments below
# modify these objects directly and do not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# The same cleaning is applied to the hold-out split AND to the final `test`
# frame. Otherwise the model would be trained on cleaned data but asked to
# score raw data, and any +/-inf in `test` would reach the imputer unchanged.
frames_to_clean = (X_train, X_test, test)

# 1) Replace +/-inf with NaN so the imputers can treat them as missing.
for frame in frames_to_clean:
    # Restrict to the numeric columns this frame actually contains.
    cols = numeric_features.intersection(frame.columns)
    if len(cols) > 0:
        frame[cols] = frame[cols].replace([np.inf, -np.inf], np.nan)

# 2) Compute the IQR on the TRAIN split only, so no information leaks from
#    the hold-out or test data into the clipping bounds.
Q1 = X_train[numeric_features].quantile(0.25)
Q3 = X_train[numeric_features].quantile(0.75)
IQR = Q3 - Q1

# 3) Keep only columns with a usable spread (IQR > 0 and not NaN); clipping a
#    column with zero IQR would collapse it to a single value.
valid_cols = IQR[(IQR > 0) & (~IQR.isna())].index

# 4) Clip the valid columns to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
lower = Q1[valid_cols] - 1.5 * IQR[valid_cols]
upper = Q3[valid_cols] + 1.5 * IQR[valid_cols]

for frame in frames_to_clean:
    cols = valid_cols.intersection(frame.columns)
    if len(cols) > 0:
        frame[cols] = frame[cols].clip(lower[cols], upper[cols], axis=1)


# ==========================================
# STEP 5: RE-CHECK TARGET COUNTS
# ==========================================
# Number of distinct classes followed by the per-class row counts.
n_classes = y.nunique()
print(f"Total Unique Classes: {n_classes}")
print("-" * 30)
print(y.value_counts())

# ==========================================
# STEP 6: RE-CHECK BOXPLOTS (AFTER CLEANING)
# ==========================================
# Same boxplots as before, now drawn from the training split after the
# outlier step.
print("\nStep 6: Final Visual Checks")

for col in numeric_features:
    clean_fig, clean_ax = plt.subplots(figsize=(8, 2))
    sns.boxplot(x=X_train[col], color='limegreen', ax=clean_ax)
    clean_ax.set_title(f"Boxplot of {col}")
    plt.show()
    
# ==========================================
# STEP 7: NORMAL PAIRPLOT
# ==========================================
print("\nStep 7: Generating Normal Pairplot")

# Plot at most 500 training rows, drawn with a fixed seed.
pairplot_rows = min(500, len(X_train))
train_with_target = pd.concat([X_train, y_train], axis=1)
plot_df = train_with_target.sample(pairplot_rows, random_state=42)

# Colour the points by class label.
sns.pairplot(plot_df, hue='fruit_name')
plt.show()

# ==========================================
# STEP 8: HEATMAP (Fixed)
# ==========================================
print("\nStep 8: Corrected Heatmap")
plt.figure(figsize=(10, 8))

# `X` is the raw feature matrix and may still contain +/-inf. A single
# infinite value makes every correlation involving that column NaN, which
# leaves blank rows/columns in the heatmap, so infinities are treated as
# missing before the correlation is computed.
corr_matrix = X.replace([np.inf, -np.inf], np.nan).corr(numeric_only=True)

sns.heatmap(corr_matrix, annot=True, cmap='RdYlBu', center=0, square=True)
plt.show()

from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters of the gradient-boosting classifier, gathered in one place.
gb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 4,
    'subsample': 0.9,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gb_params)

# Route numeric and categorical columns through their own sub-pipelines.
preprocessing = ColumnTransformer([
    ('num', numerical_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Full estimator: preprocessing followed by the classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessing),
    ('model', model),
])

from sklearn.preprocessing import LabelEncoder

# Learn the label <-> integer mapping from the training labels only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Fit preprocessing and classifier on the training split.
pipeline.fit(X_train, y_train_enc)

# Hold-out evaluation.
# Hard class predictions (encoded labels).
y_pred = pipeline.predict(X_test)
# Per-class probabilities, one column per entry of pipeline.classes_.
y_pred_proba = pipeline.predict_proba(X_test)

# Accuracy
acc = accuracy_score(y_test_enc, y_pred)

# Log loss. `labels` is passed explicitly so the probability columns are
# matched to the classes the model was trained on. Without it, log_loss infers
# the label set from y_test_enc and raises when a rare class has no rows in
# the hold-out split.
ll = log_loss(y_test_enc, y_pred_proba, labels=pipeline.classes_)

# Precision, recall and F1, averaged with per-class support as weights.
prec = precision_score(y_test_enc, y_pred, average='weighted')
rec  = recall_score(y_test_enc, y_pred, average='weighted')
f1   = f1_score(y_test_enc, y_pred, average='weighted')

print("Accuracy :", acc)
print("Log Loss :", ll)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)


# Score the competition test set: encoded class predictions and the full
# per-class probability matrix.
final_preds = pipeline.predict(test)
final_probs = pipeline.predict_proba(test)

# ==========================================
# STEP 10: FINAL DATA PREPARATION
# ==========================================
print("\nStep 10: Preparing Final Submission DataFrames...")

# Original class names, in the encoder's order.
class_names = le.classes_
# Map the encoded predictions back to those names.
decoded_labels = le.inverse_transform(final_preds)
# Largest class probability in each row, used as a confidence score.
highest_probs = final_probs.max(axis=1)
# ==========================================
# SUBMISSION 1: ID + PREDICTED CLASS
# ==========================================
submission1_df = pd.DataFrame({'id': test_id, 'fruit_name': decoded_labels})

# ==========================================
# SUBMISSION 2: ID + ALL CLASS PROBABILITIES (log-loss style layout)
# ==========================================
# One "Status_<class>" column per class, in the encoder's class order,
# with the id column placed first.
prob_col_names = [f"Status_{cls}" for cls in class_names]
submission2_df = pd.DataFrame(final_probs, columns=prob_col_names)
submission2_df.insert(0, 'id', test_id)

# ==========================================
# SUBMISSION 3: ID + PREDICTED CLASS + CONFIDENCE
# ==========================================
submission3_columns = {
    'id': test_id,
    'Predicted_Class': decoded_labels,
    'Confidence_Score': highest_probs,
}
submission3_df = pd.DataFrame(submission3_columns)


# ==========================================
# STEP 11: EXPORT FILES
#   submission1.csv - id + predicted class
#   submission2.csv - id + one probability column per class
#   submission3.csv - id + predicted class + confidence score
# ==========================================
submission_outputs = (
    (submission1_df, "submission1.csv"),
    (submission2_df, "submission2.csv"),
    (submission3_df, "submission3.csv"),
)
for submission_frame, csv_name in submission_outputs:
    # The row index is not part of the submission format.
    submission_frame.to_csv(csv_name, index=False)

print("All submissions are generated")