In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error


# Load the competition files: training rows, test rows and the submission template.
train = pd.read_csv('/kaggle/input/playground-series-s3e20/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e20/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s3e20/sample_submission.csv')

# Target is the 'emission' column; the features are every remaining column
# except the row identifier.
y = train['emission']
X = train.drop(['ID_LAT_LON_YEAR_WEEK', 'emission'], axis=1)


# Keyword arguments forwarded unchanged to CatBoostRegressor for every fold.
cat_params = dict(
    n_estimators=4200,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=5.5,
    subsample=0.7,
    colsample_bylevel=0.8,
    min_data_in_leaf=53,
    random_state=42,  # fixed seed
)


# Accumulator for the test-set predictions, averaged over all fold models.
predictions = np.zeros(test.shape[0])

# Each listed year is held out once as the validation fold while the model
# is trained on the rows of all other years.
years = [2019, 2020, 2021]

# The test feature matrix is identical for every fold, so build it once
# instead of re-dropping the ID column on each iteration.
test_features = test.drop(columns='ID_LAT_LON_YEAR_WEEK')

for year in years:
    # Compute the hold-out mask once and reuse it for both features and target.
    is_val = X['year'] == year

    X_train, y_train = X[~is_val], y[~is_val]
    X_val, y_val = X[is_val], y[is_val]

    # A fresh model per fold, always built from the same hyper-parameters.
    model = CatBoostRegressor(**cat_params)
    model.fit(X_train, y_train, verbose=False)

    # Report the RMSE on the held-out year.
    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f'Year {year} - Validation RMSE: {rmse}')

    # Each fold contributes an equal share to the final test prediction.
    year_preds = model.predict(test_features)
    predictions += year_preds / len(years)


# Put the fold-averaged predictions into the submission template's 'emission'
# column and write the result to disk without the index.
sample_submission = sample_submission.assign(emission=predictions)
sample_submission.to_csv('model_1_submission.csv', index=False)