In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
import warnings

# Install a blanket "ignore" filter so no Python warnings are shown from here on.
# NOTE(review): this hides every warning category, including any raised by the
# libraries imported above — consider narrowing the filter if problems need debugging.
warnings.filterwarnings('ignore')


In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived geometry, weight and log columns.

    The input frame must provide the columns ``Length``, ``Diameter``,
    ``Height``, ``Weight``, ``Shucked Weight``, ``Viscera Weight`` and
    ``Shell Weight``; a missing column raises ``KeyError``. The input itself
    is never modified.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (required by the estimator API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns appended.

        New columns are added in a fixed order: the geometric products, the
        weight aggregates, the ratio features, and finally one ``log_<name>``
        column for each raw/aggregate weight and size measure.
        """
        out = X.copy()
        eps = 1e-8  # added to every denominator to avoid division by zero

        # Raw measurements, pulled once and reused below.
        length, diameter, height = out['Length'], out['Diameter'], out['Height']
        shucked = out['Shucked Weight']
        viscera = out['Viscera Weight']
        shell = out['Shell Weight']
        whole = out['Weight']

        # --- geometry -------------------------------------------------------
        volume = length * diameter * height
        out['Volume'] = volume
        out['Surface_Area'] = length * diameter
        out['L2_Dim'] = np.sqrt(length**2 + diameter**2 + height**2)

        # --- weight aggregates ----------------------------------------------
        components = shucked + viscera + shell
        out['Total_Component_Weight'] = components
        # Whole weight minus the three listed components.
        out['Water_Weight'] = whole - components

        # --- ratios ---------------------------------------------------------
        whole_safe = whole + eps
        out['Meat_Ratio'] = shucked / whole_safe
        out['Shell_Ratio'] = shell / whole_safe
        out['Viscera_Ratio'] = viscera / whole_safe
        out['Density'] = whole / (volume + eps)
        out['Shell_to_Body'] = shell / (shucked + viscera + eps)
        out['Flatness'] = height / (length + eps)

        # --- log1p transforms -----------------------------------------------
        log_sources = (
            'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight',
            'Volume', 'Surface_Area', 'Total_Component_Weight', 'Water_Weight',
        )
        for name in log_sources:
            values = out[name]
            if name == 'Water_Weight':
                # This column is a difference and can go negative, so it is
                # floored at zero before taking the log.
                values = values.clip(lower=0)
            out[f'log_{name}'] = np.log1p(values)

        return out

In [None]:

# Read the train and test CSVs (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/phitron-contest/train.csv",
        "/kaggle/input/phitron-contest/test.csv",
    )
)

# Name of the label column, and the columns excluded from the feature matrices.
target, drop_cols = 'Age', ['id']

# Training label / features, and the test features with the same columns removed.
y = df_train[target]
X = df_train.drop(columns=[target, *drop_cols])
X_test = df_test.drop(columns=drop_cols)

In [None]:

# Hyper-parameters passed straight through to CatBoostRegressor.
cat_params = dict(
    iterations=800,
    learning_rate=0.02,
    depth=7,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=42,
    l2_leaf_reg=10,
    verbose=0,
)

# 'Sex' is one-hot encoded (categories unseen during fit are ignored);
# every numeric-dtype column is scaled with RobustScaler.
# NOTE(review): presumably 'Sex' is a non-numeric column, so it is not also
# picked up by the numeric selector — confirm against the data.
preprocessor = ColumnTransformer(
    [
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['Sex'],
        ),
        (
            'num',
            RobustScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    verbose_feature_names_out=False,
)

# Full chain: feature engineering -> encoding/scaling -> gradient boosting.
model_pipeline = Pipeline(
    steps=[
        ('engineer', FeatureEngineer()),
        ('preprocess', preprocessor),
        ('model', CatBoostRegressor(**cat_params)),
    ]
)

In [None]:
# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kf.get_n_splits()

# Out-of-fold predictions for the training rows and the fold-averaged test predictions.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # The same pipeline object is re-fitted on each fold's training split.
    model_pipeline.fit(X_train, y_train)

    val_pred = model_pipeline.predict(X_val)
    oof_preds[val_idx] = val_pred

    # Each fold contributes an equal 1/n_folds share to the test prediction.
    test_preds += model_pipeline.predict(X_test) / n_folds

    score = mean_absolute_error(y_val, oof_preds[val_idx])
    print(f"Fold {fold+1} MAE: {score:.5f}")

# MAE over all out-of-fold predictions combined.
overall_mae = mean_absolute_error(y, oof_preds)
print(f"Overall MAE: {overall_mae:.5f}")

In [None]:
# Round the averaged test predictions to the nearest integer.
test_preds_rounded = test_preds.round().astype(int)

# Two-column frame: the test ids alongside the rounded predictions.
submission = pd.DataFrame({'id': df_test['id']})
submission['Age'] = test_preds_rounded

# Written without the index column.
submission.to_csv("submission.csv", index=False)