In [234]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score,log_loss
from xgboost import XGBClassifier


In [235]:
train=pd.read_csv("/kaggle/input/mle-ese-mock/train (5).csv")
test=pd.read_csv("/kaggle/input/mle-ese-mock/test (4).csv")

In [236]:
train.isnull().sum()

id                   0
fruit_type           0
weight_g             0
diameter_cm          0
color_grade          0
firmness_score       0
brix_level           0
acidity_ph           0
ripeness_index       0
bruise_area_pct      0
moisture_pct         0
shelf_life_days      0
harvest_age_days     0
storage_type         0
origin_region        0
quality_grade       12
dtype: int64

In [237]:
test.isnull().sum()

id                  0
fruit_type          0
weight_g            0
diameter_cm         0
color_grade         0
firmness_score      0
brix_level          0
acidity_ph          0
ripeness_index      0
bruise_area_pct     0
moisture_pct        0
shelf_life_days     0
harvest_age_days    0
storage_type        0
origin_region       0
dtype: int64

In [238]:
test_id=test['id']
test=test.drop(columns=['id'])

In [239]:
train=train.dropna(subset=['quality_grade'])

In [240]:
train.isnull().sum()

id                  0
fruit_type          0
weight_g            0
diameter_cm         0
color_grade         0
firmness_score      0
brix_level          0
acidity_ph          0
ripeness_index      0
bruise_area_pct     0
moisture_pct        0
shelf_life_days     0
harvest_age_days    0
storage_type        0
origin_region       0
quality_grade       0
dtype: int64

In [241]:
X=train.drop(columns=['id','quality_grade'])
y=train['quality_grade']

In [242]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [243]:
numeric_features=X.select_dtypes(include=['int64','float64']).columns
categorical_features=X.select_dtypes(include=['object']).columns

In [244]:
numerical_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])
categorical_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

In [245]:
preprocessing=ColumnTransformer(transformers=[
    ('num',numerical_pipeline,numeric_features),
    ('cat',categorical_pipeline,categorical_features)
])

In [246]:
from sklearn.ensemble import HistGradientBoostingClassifier

model = HistGradientBoostingClassifier(
    loss="log_loss",        # IMPORTANT
    max_iter=100,
    learning_rate=0.05,

    max_depth=6,
    min_samples_leaf=30,

    l2_regularization=0.1,
    max_bins=255,

    random_state=42
)


In [247]:
pipeline=Pipeline(steps=[
    ('preprocessing',preprocessing),
    ('model',model)
])

In [248]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)  # fit on train
y_test_enc = le.transform(y_test)        # transform test

In [249]:
pipeline.fit(X_train,y_train_enc)

In [250]:
y_proba=pipeline.predict_proba(X_test)

In [251]:
loss = log_loss(y_test_enc, y_proba)
print("Log Loss:", loss)

Log Loss: 1.3519547371663836


In [252]:
y_final=pipeline.predict_proba(test)

In [253]:
class_names = le.classes_  # use label encoder mapping
submission = pd.DataFrame(y_final, columns=[f"Status_{cls}" for cls in class_names])
submission.insert(0, 'id', test_id)
submission.to_csv("submission4.csv", index=False)
print("\n✅ Submission file created successfully!")
print(submission.head())


✅ Submission file created successfully!
   id  Status_Q10_waste  Status_Q1_premium_fresh  Status_Q2_fresh  \
0   0          0.000337                 0.002329         0.138444   
1   1          0.830871                 0.000243         0.000268   
2   2          0.000492                 0.535073         0.377961   
3   3          0.001269                 0.019338         0.011465   
4   4          0.024323                 0.000438         0.000444   

   Status_Q3_export_grade  Status_Q4_dessert  Status_Q5_juice_high  \
0                0.280532           0.506172              0.063635   
1                0.000237           0.000266              0.000341   
2                0.078272           0.004475              0.001569   
3                0.693081           0.119576              0.147792   
4                0.000647           0.000689              0.003692   

   Status_Q6_juice_low  Status_Q7_processing  Status_Q8_local_sale  \
0             0.007293              0.000587         