In [None]:
!unzip /content/playground-series-s5e5.zip

Archive:  /content/playground-series-s5e5.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import KBinsDiscretizer
import warnings
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Training frame: remove the row identifier, then discard rows that became exact duplicates.
df = (
    pd.read_csv("/content/train.csv")
    .drop(columns='id')
    .drop_duplicates()
)

# Test frame: only the identifier column is removed.
df_test = pd.read_csv('/content/test.csv').drop(columns='id')

In [None]:
# Columns that take part in the pairwise-product features. 'BMI' and 'Intensity'
# are not raw inputs: feature_engineering() creates them before building the products.
numeric_cols = ['Age','Height','Weight','Duration','Heart_Rate','Body_Temp','BMI','Intensity']
def feature_engineering(df : pd.DataFrame, numeric_cols: list) -> pd.DataFrame:
    """Add derived ratio columns and pairwise interaction columns to ``df``.

    Two derived columns are always created first:

    * ``BMI``       = ``Weight / (Height / 100) ** 2``
      (presumably Height is in centimetres -- TODO confirm against the data card)
    * ``Intensity`` = ``Heart_Rate / Duration``

    Then, for every unordered pair of names in ``numeric_cols`` (each name paired
    with the names that follow it in the list), a product column named
    ``{first}_x_{second}`` is added.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least ``Weight``, ``Height``, ``Heart_Rate`` and
        ``Duration``, plus every raw column named in ``numeric_cols``.
        The frame is modified in place.
    numeric_cols : list
        Column names to combine pairwise. May include ``'BMI'`` and
        ``'Intensity'``; may be empty, in which case only the two derived
        columns are added.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    # Derived columns are computed once, up front: they do not depend on the loop
    # and must exist even when numeric_cols is empty.
    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2
    df['Intensity'] = df['Heart_Rate'] / df['Duration']

    # Pairwise products: each feature multiplied by every feature listed after it.
    for i, feature_1 in enumerate(numeric_cols):
        for feature_2 in numeric_cols[i + 1:]:
            df[f'{feature_1}_x_{feature_2}'] = df[feature_1] * df[feature_2]

    return df

In [None]:
# Learn the Sex -> integer mapping from the training frame only,
# then apply that single fitted mapping to both frames.
label_enc = LabelEncoder().fit(df['Sex'])
df['Sex'] = label_enc.transform(df['Sex'])
df_test['Sex'] = label_enc.transform(df_test['Sex'])

In [None]:
# Build the engineered columns for the training and test frames with the same column list.
train, test = (feature_engineering(frame, numeric_cols) for frame in (df, df_test))

# Sex is stored with the pandas 'category' dtype in both frames.
for engineered in (train, test):
    engineered["Sex"] = engineered["Sex"].astype("category")

In [None]:
# Target is modelled on the log1p scale; predictions are mapped back with expm1 at blend time.
y = np.log1p(train["Calories"])

# Feature matrix: every column of the engineered training frame except the target.
X = train.drop(columns=['Calories'])

In [None]:
# ---- Cross-validation setup ----
FOLDS = 40
KF = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
cat_features = ['Sex']

# Out-of-fold predictions (one slot per training row) and running SUMS of the
# per-fold test predictions (they are averaged in a later cell).
n_train_rows, n_test_rows = len(train), len(test)
oof_cat, pred_cat = np.zeros(n_train_rows), np.zeros(n_test_rows)
oof_xgb, pred_xgb = np.zeros(n_train_rows), np.zeros(n_test_rows)

# ---- Model definitions (the same estimator objects are re-fitted on every fold) ----
cat_params = dict(
    iterations=3500,
    learning_rate=0.02,
    depth=12,
    loss_function='RMSE',
    l2_leaf_reg=3,
    random_seed=42,
    eval_metric='RMSE',
    early_stopping_rounds=200,
    verbose=1000,
    task_type='GPU',
)
cat_model = CatBoostRegressor(**cat_params)

xgb_params = dict(
    max_depth=10,
    colsample_bytree=0.75,
    subsample=0.9,
    n_estimators=2000,
    learning_rate=0.01,
    gamma=0.01,
    max_delta_step=2,
    early_stopping_rounds=100,
    eval_metric="rmse",
    enable_categorical=True,
    device='cuda',
)
xgb_model = XGBRegressor(**xgb_params)

# ---- Fold loop ----
for i, (fit_rows, holdout_rows) in enumerate(KF.split(X, y)):
    banner = '#' * 15
    print(banner, i+1, banner)

    X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
    y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

    # The held-out fold is passed as the eval set of both models (used with their
    # early-stopping settings) and is also the fold scored below.
    cat_model.fit(
        X_fit, y_fit,
        eval_set=[(X_holdout, y_holdout)],
        cat_features=cat_features,
        use_best_model=True,
        verbose=0,
    )
    xgb_model.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=0)

    # CatBoost first, then XGBoost: store the fold's OOF predictions and add the
    # test predictions onto the running sum (in place on the shared arrays).
    for fitted, oof_store, test_sum in (
        (cat_model, oof_cat, pred_cat),
        (xgb_model, oof_xgb, pred_xgb),
    ):
        oof_store[holdout_rows] = fitted.predict(X_holdout)
        test_sum += fitted.predict(test)

    # Fold RMSE on the log1p scale of the target.
    cat_rmse = mean_squared_error(y_holdout, oof_cat[holdout_rows]) ** 0.5
    xgb_rmse = mean_squared_error(y_holdout, oof_xgb[holdout_rows]) ** 0.5

    print(f'FOLD {i+1} CATBOOST_RMSE = {cat_rmse:.4f} <=> XGB_RMSE = {xgb_rmse:.4f}')

############### 1 ###############
FOLD 1 CATBOOST_RMSE = 0.0608 <=> XGB_RMSE = 0.0626
############### 2 ###############
FOLD 2 CATBOOST_RMSE = 0.0570 <=> XGB_RMSE = 0.0568
############### 3 ###############
FOLD 3 CATBOOST_RMSE = 0.0582 <=> XGB_RMSE = 0.0577
############### 4 ###############
FOLD 4 CATBOOST_RMSE = 0.0626 <=> XGB_RMSE = 0.0619
############### 5 ###############
FOLD 5 CATBOOST_RMSE = 0.0589 <=> XGB_RMSE = 0.0581
############### 6 ###############
FOLD 6 CATBOOST_RMSE = 0.0598 <=> XGB_RMSE = 0.0604
############### 7 ###############
FOLD 7 CATBOOST_RMSE = 0.0594 <=> XGB_RMSE = 0.0594
############### 8 ###############
FOLD 8 CATBOOST_RMSE = 0.0642 <=> XGB_RMSE = 0.0618
############### 9 ###############
FOLD 9 CATBOOST_RMSE = 0.0589 <=> XGB_RMSE = 0.0595
############### 10 ###############
FOLD 10 CATBOOST_RMSE = 0.0596 <=> XGB_RMSE = 0.0594
############### 11 ###############
FOLD 11 CATBOOST_RMSE = 0.0628 <=> XGB_RMSE = 0.0637
############### 12 ###############
FOLD 12 CATBOOS

In [None]:
# Turn the per-fold sums of test predictions into means.
# The division happens in place, so running this cell again divides a second time.
for fold_sum in (pred_cat, pred_xgb):
    np.divide(fold_sum, FOLDS, out=fold_sum)

In [None]:
# Map each model's predictions back from the log1p scale, then blend them
# with weights 0.55 (CatBoost) and 0.45 (XGBoost).
cat_part = 0.55 * np.expm1(pred_cat)
xgb_part = 0.45 * np.expm1(pred_xgb)
# NOTE(review): the bounds 1 and 314 are presumably the observed target range -- confirm.
y_preds = np.clip(cat_part + xgb_part, a_min=1, a_max=314)

# Save submission
submission = pd.read_csv('/content/sample_submission.csv')
submission["Calories"] = y_preds  # relies on sample_submission rows being in test.csv order
submission.to_csv("submission.csv", index=False)
print('submission saved')
submission.head()

submission saved


Unnamed: 0,id,Calories
0,750000,27.697466
1,750001,107.686374
2,750002,87.661617
3,750003,125.24723
4,750004,75.981221
