In [None]:
from prophet import Prophet
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error

from catboost import CatBoostRegressor, Pool
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor
import warnings


# Raise the pandas display limits so long and wide frames are shown in full.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 250)

In [None]:
# Load every competition table from the shared Kaggle input directory.
_input_dir = '/kaggle/input/gdz-elektrik-datathon-2024'

holidays = pd.read_csv(f'{_input_dir}/holidays.csv')
sub = pd.read_csv(f'{_input_dir}/sample_submission.csv')
test = pd.read_csv(f'{_input_dir}/test.csv')
train = pd.read_csv(f'{_input_dir}/train.csv')
weather = pd.read_csv(f'{_input_dir}/weather.csv')

In [None]:
# Preview the first rows of the raw weather table.
weather.head()

In [None]:
# Columns that receive lagged copies. Only 'bildirimli_sum' is active; the daily
# weather means (temperature, cloud cover, radiation, humidity, wind direction,
# wind speed, precipitation probability, apparent temperature) are disabled.
cols_to_shift = ['bildirimli_sum']

# Columns used for rolling-window statistics.
cols_to_roll = [
    'bildirimli_sum',
    't_2m-C_mean',
    'relative_humidity_2m-p_mean',
    'prob_precip_1h-p_mean',
    't_apparent-C_mean',
]

cols_to_cluster = ['bildirimli_sum']

# Lag offsets in rows: 5, 10, ..., 50.
lags = np.arange(5, 51, 5)

# Rolling window sizes in rows: 6, 12, ..., 42.
rolls = np.arange(6, 48, 6)

In [None]:
# Stack train and test into one frame with a fresh 0..n-1 index, so the merges
# and feature steps below run once over both; the frame is split again later by date.
data = pd.concat([train, test], ignore_index=True)

In [None]:
# Translate the Turkish date-part headers so pandas can assemble a datetime from them.
holidays = holidays.rename(columns={'Yıl': 'year', 'Ay': 'month', 'Gün': 'day'})

# Build the 'tarih' join key from the three parts, then drop the parts.
holidays = (
    holidays
    .assign(tarih=pd.to_datetime(holidays[['year', 'month', 'day']]))
    .drop(columns=['year', 'month', 'day'])
)

holidays.head()

In [None]:
# Parse the date column so it can be joined against the holiday calendar.
data['tarih'] = pd.to_datetime(data['tarih'])

# Left join keeps every row of `data`; dates with no holiday get a placeholder label.
# NOTE(review): if `holidays` holds more than one row per date this join duplicates data rows — confirm.
merged_df = data.merge(holidays, how='left', on='tarih')
merged_df = merged_df.assign(
    **{'Tatil Adı': merged_df['Tatil Adı'].fillna('Not a Holiday')}
)

merged_df.head()

In [None]:
data['tarih'] = pd.to_datetime(data['tarih'])

# Split the weather timestamp into a calendar date and an hour, lower-case the
# location name, and rename both to the keys used by the outage data.
_weather_stamp = pd.to_datetime(weather['date'])
weather = (
    weather
    .assign(
        date_only=_weather_stamp.dt.date,
        hour_only=_weather_stamp.dt.hour,
        name=weather['name'].str.lower(),
    )
    .drop(columns=['date'])
    .rename(columns={'date_only': 'date', 'name': 'ilce'})
)

In [None]:
# Summarise each weather variable per (date, district) with five statistics.
_weather_vars = [
    't_2m:C',
    'effective_cloud_cover:p',
    'global_rad:W',
    'relative_humidity_2m:p',
    'wind_dir_10m:d',
    'wind_speed_10m:ms',
    'prob_precip_1h:p',
    't_apparent:C',
]
_daily_stats = ['mean', 'std', 'min', 'max', 'sum']

daily_weather = (
    weather
    .groupby(['date', 'ilce'])
    .agg({var: _daily_stats for var in _weather_vars})
    .reset_index()
)

# Flatten the two-level header, e.g. ('t_2m:C', 'mean') -> 't_2m:C_mean'.
# The group keys have an empty second level, so they become 'date_' and 'ilce_'.
daily_weather.columns = ['_'.join(levels).strip() for levels in daily_weather.columns.values]

# The grouped dates are plain date objects; convert them back to datetimes to match 'tarih'.
daily_weather['date_'] = pd.to_datetime(daily_weather['date_'])

# Attach the daily weather to the holiday-enriched frame and drop the duplicated join keys.
final_merged_df = (
    merged_df
    .merge(daily_weather, how='left', left_on=['tarih', 'ilce'], right_on=['date_', 'ilce_'])
    .drop(columns=['date_', 'ilce_'])
)

final_merged_df.head()

In [None]:
def time_features(df, time_col='tarih'):
    """Return a copy of ``df`` with calendar features derived from ``time_col``.

    Added columns, in order: ``year``, ``month``, ``quarter``, ``dayofweek``,
    ``dayofyear``, ``dayofmonth``, ``weekofyear`` (ISO week), ``season``
    (1 for Dec-Feb through 4 for Sep-Nov), ``is_weekend`` and the
    ``is_month/quarter/year_start/end`` flags as 0/1 integers, sine/cosine
    encodings of day-of-year and month, and the string key ``year_season``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    time_col : str
        Name of a datetime64 column in ``df``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the columns above appended.
    """
    # Fixed periods for the cyclic encodings. Dividing by the maximum observed
    # in the frame would encode the same date differently depending on which
    # other rows are present (e.g. a frame holding only January).
    months_per_year = 12
    max_days_per_year = 366

    df_ = df.copy()
    stamps = df_[time_col].dt

    df_["year"] = stamps.year
    df_["month"] = stamps.month
    df_["quarter"] = stamps.quarter
    df_["dayofweek"] = stamps.dayofweek
    df_["dayofyear"] = stamps.dayofyear
    df_["dayofmonth"] = stamps.day
    df_["weekofyear"] = stamps.isocalendar().week
    df_["season"] = df_["month"] % 12 // 3 + 1

    # Binary calendar flags (dayofweek 5 and 6 are Saturday and Sunday).
    df_["is_weekend"] = (df_["dayofweek"] > 4).astype(int)
    df_['is_month_start'] = stamps.is_month_start.astype(int)
    df_['is_month_end'] = stamps.is_month_end.astype(int)
    df_['is_quarter_start'] = stamps.is_quarter_start.astype(int)
    df_['is_quarter_end'] = stamps.is_quarter_end.astype(int)
    df_['is_year_start'] = stamps.is_year_start.astype(int)
    df_['is_year_end'] = stamps.is_year_end.astype(int)

    # Cyclic encodings.
    day_angle = 2 * np.pi * df_["dayofyear"] / max_days_per_year
    month_angle = 2 * np.pi * df_["month"] / months_per_year
    df_["dayofyear_sin"] = np.sin(day_angle)
    df_["dayofyear_cos"] = np.cos(day_angle)
    df_["month_sin"] = np.sin(month_angle)
    df_["month_cos"] = np.cos(month_angle)

    df_['year_season'] = df_['year'].astype(str) + "_" + df_['season'].astype(str)

    return df_

def lag_features(
    dataframe: pd.DataFrame, cols: list, lags: list, fillna: bool = False
) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with row-shifted copies of ``cols``.

    For every column and lag a new column ``"{col}_{lag}_lag"`` holds the
    column shifted down by ``lag`` rows. The shift is positional over the whole
    frame; no grouping is applied.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame; it is not modified.
    cols : list
        Names of the columns to lag.
    lags : list
        Row offsets passed to ``Series.shift``.
    fillna : bool
        When True, gaps in each new column are filled backward first and then
        forward.

    Returns
    -------
    pd.DataFrame
        Copy of ``dataframe`` with the lag columns appended.
    """
    df_ = dataframe.copy()
    for col in cols:
        for lag in lags:
            shifted = df_[col].shift(lag)
            if fillna:
                # .bfill()/.ffill() replace fillna(method=...), which is deprecated.
                shifted = shifted.bfill().ffill()
            df_[f"{col}_{lag}_lag"] = shifted
    return df_


def roll_features(
    dataframe: pd.DataFrame, cols: list, windows: list, fillna: bool = False
) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with rolling mean/std/min/max columns.

    For every column and window size ``w`` four columns named
    ``"rolling_{w}_{stat}_{col}"`` are added, where ``stat`` is ``mean``,
    ``std``, ``min`` or ``max``. Windows end at the current row (it is included)
    and use ``min_periods=1``, so only ``std`` starts with a missing value.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame; it is not modified.
    cols : list
        Names of the columns to summarise.
    windows : list
        Window sizes in rows.
    fillna : bool
        When True, gaps in each new column are filled backward first and then
        forward.

    Returns
    -------
    pd.DataFrame
        Copy of ``dataframe`` with the rolling columns appended.
    """
    stats = ("mean", "std", "min", "max")

    df_ = dataframe.copy()
    for col in cols:
        for w in windows:
            window = df_[col].rolling(w, min_periods=1)
            for stat in stats:
                values = getattr(window, stat)()
                if fillna:
                    # .bfill()/.ffill() replace fillna(method=...), which is deprecated.
                    values = values.bfill().ffill()
                df_[f"rolling_{w}_{stat}_{col}"] = values

    return df_


def ewma(dataframe: pd.DataFrame, cols: list, lags: list, fillna: bool = True) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with exponentially weighted means of lagged columns.

    For every column and lag a new column ``"{col}_{lag}_lag_ewm"`` holds the
    exponentially weighted mean (``alpha=0.95``) of the column shifted down by
    ``lag`` rows.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame; it is not modified.
    cols : list
        Names of the columns to smooth.
    lags : list
        Row offsets applied with ``Series.shift`` before smoothing.
    fillna : bool
        When True, gaps in each new column are filled backward first and then
        forward.

    Returns
    -------
    pd.DataFrame
        Copy of ``dataframe`` with the smoothed columns appended.
    """
    df_ = dataframe.copy()
    for col in cols:
        for lag in lags:
            smoothed = df_[col].shift(lag).ewm(alpha=0.95).mean()
            if fillna:
                # Fill the smoothed series itself. Reading the plain
                # "{col}_{lag}_lag" column here would discard the EWMA, or raise
                # KeyError when lag_features has not been run.
                smoothed = smoothed.bfill().ffill()
            df_[f"{col}_{lag}_lag_ewm"] = smoothed
    return df_

def feature_engineering(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with yearly outage statistics and weather features.

    Expects the columns ``ilce``, ``year``, ``bildirimli_sum`` and the daily
    weather ``*_mean`` columns. Every ``:`` in a column name is replaced by
    ``-`` before the weather features are computed.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        Copy with per-(ilce, year) statistics of ``bildirimli_sum``, threshold
        flags, pairwise weather products and wind-direction components appended.
    """
    df_ = dataframe.copy()

    # Per-district, per-year statistics broadcast back onto every row.
    yearly_outages = df_.groupby(['ilce', 'year'])['bildirimli_sum']
    yearly_stats = {
        'bildirimli_sum_mean_yearly': 'mean',
        'bildirimli_sum_std_yearly': 'std',
        'bildirimli_sum_var_yearly': 'var',
        'bildirimli_sum_median_yearly': 'median',
        'bildirimli_sum_min_yearly': 'min',
        'bildirimli_sum_max_yearly': 'max',
        'bildirimli_sum_skew_yearly': 'skew',
        'bildirimli_sum_25th_yearly': lambda x: x.quantile(0.25),
        'bildirimli_sum_75th_yearly': lambda x: x.quantile(0.75),
        'bildirimli_sum_range': lambda x: x.max() - x.min(),
    }
    for stat_col, reducer in yearly_stats.items():
        df_[stat_col] = yearly_outages.transform(reducer)

    # 't_2m:C_mean' -> 't_2m-C_mean', and likewise for the other weather columns.
    df_.columns = df_.columns.str.replace(':', '-')

    wind_speed = df_['wind_speed_10m-ms_mean']
    temperature = df_['t_2m-C_mean']
    precip_prob = df_['prob_precip_1h-p_mean']

    # Threshold flags.
    df_['high_wind_flag'] = (wind_speed > 6).astype(int)
    df_['extreme_high_temp_flag'] = (temperature > 35).astype(int)
    df_['extreme_low_temp_flag'] = (temperature < 0).astype(int)

    # Pairwise interaction terms.
    df_['temp_humid'] = temperature * df_['relative_humidity_2m-p_mean']
    df_['wind_precip'] = wind_speed * precip_prob
    df_['cloud_precip'] = df_['effective_cloud_cover-p_mean'] * precip_prob

    # Wind direction in degrees mapped onto the unit circle, then scaled by speed.
    wind_angle = 2 * np.pi * df_['wind_dir_10m-d_mean'] / 360
    df_['wind_dir_sin'] = np.sin(wind_angle)
    df_['wind_dir_cos'] = np.cos(wind_angle)
    df_["WindSpeed_sin_component"] = wind_speed * df_["wind_dir_sin"]
    df_["WindSpeed_cos_component"] = wind_speed * df_["wind_dir_cos"]

    return df_

In [None]:
def convert_dtypes(df):
    """Return a copy of ``df`` in which every object-dtype column is categorical.

    Columns of any other dtype are left as they are; ``df`` itself is not modified.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    return df.astype({name: 'category' for name in object_columns})

In [None]:
# Run the feature steps in order. time_features must be assigned back to
# final_merged_df: feature_engineering groups by the 'year' column it creates.
# (The original assigned the result to a misspelled name, discarding it.)
final_merged_df = time_features(final_merged_df)

final_merged_df = feature_engineering(final_merged_df)

# NOTE(review): lag_features shifts by row position over the whole frame, not
# per district — confirm the row order makes a k-row lag meaningful.
final_merged_df = lag_features(final_merged_df, cols_to_shift, lags)

# Disabled experiments:
# final_merged_df = ewma(final_merged_df, cols_to_roll, lags)
# final_merged_df = roll_features(final_merged_df, cols_to_roll, rolls)
# final_merged_df = expand_features(final_merged_df, cols_to_shift)

final_merged_df = convert_dtypes(final_merged_df)

In [None]:
# Names of all category-dtype columns, as a plain list.
categorical_cols = list(final_merged_df.select_dtypes(include=['category']).columns)

In [None]:
# Rows dated before 2024-02-01 form the modelling set; rows after 2024-01-31 form the test set.
_row_dates = final_merged_df['tarih']
_modelling = final_merged_df.loc[_row_dates < '2024-02-01']
test = final_merged_df.loc[_row_dates > '2024-01-31']

# Within the modelling set, everything after 2024-01-04 is held out for validation.
val = _modelling.loc[_modelling['tarih'] > '2024-01-04']
train = _modelling.loc[_modelling['tarih'] < '2024-01-05']
train.tail()

In [None]:
def _make_cb_params(seed, depth=None):
    """Build one CatBoost parameter dict.

    All configurations share the same settings and differ only in
    ``random_state`` and, optionally, ``depth``. When ``depth`` is None the key
    is left out of the dict.
    """
    params = {
        'iterations': 10000,
        'eval_metric': 'MAPE',
        'loss_function': 'MAPE',
        'early_stopping_rounds': 1000,
        'use_best_model': True,
        'random_state': seed,
        'task_type': "CPU",
        'verbose': 1000,
    }
    if depth is not None:
        params['depth'] = depth
    return params


# An earlier tuned configuration is recorded here for reference but not applied:
# learning_rate=0.29017630492476376, depth=4, l2_leaf_reg=7.2678866855697075,
# bagging_temperature=0.6339522226899619, border_count=108, min_data_in_leaf=68.

# Configurations with an explicit tree depth.
cb_params_42 = _make_cb_params(42, depth=5)
cb_params_0 = _make_cb_params(0, depth=4)
cb_params_1 = _make_cb_params(1, depth=8)
cb_params_986 = _make_cb_params(986, depth=7)

# Configurations that do not set a depth.
cb_params_1073 = _make_cb_params(1073)
cb_params_527 = _make_cb_params(527)
cb_params_333 = _make_cb_params(333)
cb_params_2 = _make_cb_params(2)

In [None]:
# Pairwise correlations between all numeric training columns.
numerical_cols = list(train.select_dtypes(include=['number']).columns)

full_correlation_matrix = train.loc[:, numerical_cols].corr()

full_correlation_matrix

In [None]:

# One CatBoost configuration per entry. A separate model is fitted for every
# (configuration, district) pair and the per-configuration test predictions
# are blended with equal weights at the end.
param_sets = {
    'cb0': cb_params_0,
    'cb1': cb_params_1,
    'cb42': cb_params_42,
    'cb986': cb_params_986,
    'cb1073': cb_params_1073,
    'cb527': cb_params_527,
    'cb333': cb_params_333,
    'cb2': cb_params_2,
}

all_test_results = {}
all_preds = []

for param_name, params in param_sets.items():
    all_test_dates = []
    val_scores = []
    # Never filled in this cell; kept because it is stored and printed below.
    district_list = []

    for ilce in train['ilce'].unique():
        df_train = train[train['ilce'] == ilce]
        df_val = val[val['ilce'] == ilce]
        df_test = test[test['ilce'] == ilce]

        # Copy so that adding the prediction column does not write into a slice of `test`.
        test_dates = df_test[['tarih']].copy()
        print(f'Training with parameters: {param_name}')
        print('train:', df_train.shape)
        print('val:', df_val.shape)
        print('test:', df_test.shape)
        print('ilce:', ilce)

        # NOTE(review): the feature frames still contain the 'tarih' datetime column — confirm that is intended.
        X_train = df_train.drop(columns=['bildirimsiz_sum'])
        X_val = df_val.drop(columns=['bildirimsiz_sum'])
        X_test = df_test.drop(columns=['bildirimsiz_sum'])

        y_train = df_train['bildirimsiz_sum']
        y_val = df_val['bildirimsiz_sum']

        # CatBoost receives the categorical columns as positional indices.
        cat_features = [X_train.columns.get_loc(col) for col in categorical_cols]

        train_pool = Pool(X_train, y_train, cat_features=cat_features)
        validate_pool = Pool(X_val, y_val, cat_features=cat_features)

        model = CatBoostRegressor(**params)
        model.fit(train_pool, eval_set=validate_pool, use_best_model=True)

        val_pred = model.predict(X_val)
        test_pred = model.predict(X_test)

        test_dates['y_pred'] = test_pred

        # Both sides are shifted by 1 before scoring, presumably so zero
        # targets do not dominate the percentage error.
        val_mape = mean_absolute_percentage_error(y_val + 1, val_pred + 1)

        print(f'VALIDATION SCORE: {val_mape}')
        print(test_dates.head())

        all_test_dates.append(test_dates)
        val_scores.append(val_mape)

        print("-" * 80)

    # Sorting by index restores the row order of the combined frame, so the
    # predictions are in the same order as the test rows. (A preceding sort by
    # 'tarih' was dropped: this sort overrode it.)
    test_all_dates = pd.concat(all_test_dates).sort_index()
    preds = test_all_dates["y_pred"].values

    all_test_results[param_name] = {
        'test_dates': test_all_dates,
        'val_scores': val_scores,
        'avg_val_score': np.mean(val_scores),
        'avg_forecasts': np.mean(preds),
        'district_list': district_list
    }

    all_preds.append(preds)

    print(f"CATBOOST Forecasts AVG: {np.mean(preds)}")
    print(f"CATBOOST VALIDATION SCORE AVG:{np.mean(val_scores)}")
    print(f"CATBOOST TAHMIN EDILEMEYEN ILCE LISTESI:{district_list}")

# Equal-weight blend. The weight is derived from the number of fitted
# configurations so it stays a true average if param_sets changes
# (1/8 = 0.125 for the eight sets above).
weights = 1.0 / len(all_preds)
final_preds = np.sum([weights * pred for pred in all_preds], axis=0)

print(f"Final Predictions AVG: {np.mean(final_preds)}")

# Summary per parameter set.
for param_name, results in all_test_results.items():
    print(f"Results for {param_name}:")
    print(f"Average Validation Score: {results['avg_val_score']}")
    print(f"Average Forecasts: {results['avg_forecasts']}")
    print(f"Test Dates Head:\n{results['test_dates'].head()}")
    print("-" * 80)


In [None]:
# Put the blended predictions into the submission's target column (positional assignment).
sub = sub.assign(bildirimsiz_sum=final_preds)
sub.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv', index=False)
