In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans

# Read the competition CSVs: training rows, test rows and the submission template
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e20/train.csv',
        '/kaggle/input/playground-series-s3e20/test.csv',
        '/kaggle/input/playground-series-s3e20/sample_submission.csv',
    )
)

# Define the number of clusters for KMeans
n_clusters = 7

# KMeans clustering of the distinct (latitude, longitude) locations.
# random_state and n_init are pinned so the cluster labels are reproducible
# from run to run; without them every execution may produce a different
# 'kmeans_group' feature.
km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# One row per location; only the coordinate columns are used for fitting.
km_train = train.groupby(['latitude', 'longitude'], as_index=False)['emission'].mean()
km.fit(km_train[['latitude', 'longitude']])
km_train['kmeans_group'] = km.predict(km_train[['latitude', 'longitude']])
# Attach the cluster label of each location to every row of train and test.
train = train.merge(km_train[['latitude', 'longitude', 'kmeans_group']], on=['latitude', 'longitude'], how='left')
test = test.merge(km_train[['latitude', 'longitude', 'kmeans_group']], on=['latitude', 'longitude'], how='left')

# A test location that never occurs in train finds no partner in the left
# merge and is left as NaN, which also turns the column into floats. Assign
# such rows to their nearest centroid and restore the integer label dtype.
unmatched = test['kmeans_group'].isna()
if unmatched.any():
    test.loc[unmatched, 'kmeans_group'] = km.predict(test.loc[unmatched, ['latitude', 'longitude']])
test['kmeans_group'] = test['kmeans_group'].astype(km_train['kmeans_group'].dtype)

# Per-row count of missing values, added as a feature to both frames
train['n_nans'] = train.isnull().sum(axis='columns')
test['n_nans'] = test.isnull().sum(axis='columns')

# Ratio of 2020 emissions
# Weekly mean emission in 2020 divided by the weekly mean of all other years.
dt1 = train[train.year == 2020].groupby(['week_no'], as_index=False)['emission'].mean()
dt2 = train[train.year != 2020].groupby(['week_no'], as_index=False)['emission'].mean()
# Index both sides by week_no before dividing: the division then aligns on the
# week itself rather than on row position, and the map() below looks the ratio
# up by week value. Positional alignment is only correct when the weeks are
# exactly 0..N-1 and present in both groups.
dt = dt1.set_index('week_no')['emission'] / dt2.set_index('week_no')['emission']
# Initialise as float so the fractional ratios written below fit the column dtype.
train['ratio_2020'] = 1.0
is_2020 = train['year'] == 2020
train.loc[is_2020, 'ratio_2020'] = train.loc[is_2020, 'week_no'].map(dt)
# Test rows keep the neutral ratio, stored with the same dtype as in train.
test['ratio_2020'] = 1.0

# Separate the target column from the model inputs; the row identifier is
# dropped from the inputs as well
y = train['emission']
X = train.drop(['ID_LAT_LON_YEAR_WEEK', 'emission'], axis=1)

# Keyword arguments passed to every CatBoostRegressor instance
cat_params = dict(
    n_estimators=4200,
    learning_rate=0.13640770518541556,
    depth=6,
    l2_leaf_reg=5.5,
    subsample=0.6827434356780697,
    colsample_bylevel=0.7796045453742051,
    min_data_in_leaf=53,
    random_state=42,
)

# Accumulator for the test-set predictions; each fold adds its share below
predictions = np.zeros(test.shape[0])

# Test features are the same for every fold, so build them once. Selecting by
# X.columns drops the row identifier and guarantees the columns reach the model
# in exactly the order it was trained on.
test_features = test[X.columns]

# We are going to perform a "leave one year out" validation
years = [2019, 2020, 2021]

for year in years:
    # Rows of the held-out year form the validation set, the rest the training set
    is_val = X['year'] == year
    X_train, y_train = X[~is_val], y[~is_val]
    X_val, y_val = X[is_val], y[is_val]

    # Instantiate and train the model; the cluster label is treated as categorical
    model = CatBoostRegressor(**cat_params, cat_features=['kmeans_group'])
    model.fit(X_train, y_train, verbose=False)

    # Evaluate the model on the held-out year
    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f'Year {year} - Validation RMSE: {rmse}')

    # Add this fold's test predictions, weighted so the folds average out
    year_preds = model.predict(test_features)
    predictions += year_preds / len(years)

# Put the fold-averaged predictions into the template and save it without the index
submission_file = 'model_2_submission.csv'
sample_submission['emission'] = predictions
sample_submission.to_csv(path_or_buf=submission_file, index=False)
