In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
import pickle
from numpy import median
import lightgbm as lgb
import os
import enefit
from sklearn.ensemble import GradientBoostingRegressor
import glob

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def load_data(data_dir='/kaggle/input/my-data'):
    """Read the three auxiliary lookup tables used by the pipeline.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``calendar.csv``, ``gps_point.csv`` and
        ``code_list.csv``. Defaults to the dataset location the notebook
        was written against.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(calendar, gps_point, code_list)``, in that order.
    """
    def _read(file_name):
        # keep_default_na=False: empty cells and strings such as "NA" are kept
        # as text instead of being converted to NaN.
        return pd.read_csv(os.path.join(data_dir, file_name), keep_default_na=False)

    calendar = _read('calendar.csv')
    gps_point = _read('gps_point.csv')
    code_list = _read('code_list.csv')
    return calendar, gps_point, code_list

In [4]:
def preprocess_forecast(forecast, gps_point):
    """Keep forecast rows located at a county's main GPS point and tag them with the county code.

    Parameters
    ----------
    forecast : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns. It is NOT modified.
    gps_point : pandas.DataFrame
        Must contain ``lat/long`` (the string ``"<latitude>,<longitude>"``),
        ``main_gps_county`` and ``county_code`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected forecast rows plus a ``lat/long`` key
        column and an int64 ``county_code`` column.
    """
    # Build the join key as a standalone Series so the caller's frame does not
    # silently gain a 'lat/long' column.
    key = forecast['latitude'].astype(str) + ',' + forecast['longitude'].astype(str)

    main_points = gps_point.loc[gps_point['main_gps_county'] == 1, 'lat/long'].tolist()
    is_main = key.isin(main_points)

    # .assign() returns a new frame; key[is_main] shares the filtered index.
    selected = forecast.loc[is_main].assign(**{'lat/long': key[is_main]})

    selected = selected.merge(gps_point[['lat/long', 'county_code']], on='lat/long', how='left')
    selected['county_code'] = selected['county_code'].astype('int64')
    return selected

In [5]:
def convert_dates(df):
    """Convert every date-like column of ``df`` to pandas datetimes, in place.

    A column is treated as date-like when its lower-cased name starts with
    ``date`` or contains ``_date`` (e.g. ``date``, ``forecast_date``,
    ``prediction_datetime``).

    The conversion goes straight through ``pd.to_datetime``. The previous
    implementation went via integer nanoseconds, POSIX seconds and
    ``datetime.fromtimestamp``, which shifted values by the machine's local
    UTC offset, raised on missing values and on string columns, and lost
    precision in the float division.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for column in df.columns:
        lowered = column.lower()
        if lowered.startswith('date') or '_date' in lowered:
            # errors='coerce': unparseable entries become NaT instead of raising.
            # Integer columns are interpreted as nanoseconds since the epoch.
            df[column] = pd.to_datetime(df[column], errors='coerce')
    return df

In [6]:
def add_columns_to_test(test):  
    test['datetime_date'] = test['prediction_datetime'].dt.date
    test['datetime_date'] = pd.to_datetime(test['datetime_date'])
    test['hour'] = test['prediction_datetime'].dt.time
    test['hour'] = pd.to_datetime(test['hour'], format='%H:%M:%S').dt.hour
    test['hour'] = test['hour'].astype('int64')
    test['year'] = test['prediction_datetime'].dt.year
    test['month'] = test['prediction_datetime'].dt.month    
    test['week_num'] = test['prediction_datetime'].dt.isocalendar().week
    test['week_num'] = test['week_num'].astype('int64')
    test['day'] = test['prediction_datetime'].dt.day
    return test

In [7]:
def add_columns_to_gas(gas):
    """Add ``mwh_mean``, the row-wise mean of the lowest and highest gas price.

    The frame is modified in place and also returned.
    """
    price_columns = ['lowest_price_per_mwh', 'highest_price_per_mwh']
    price_bounds = gas[price_columns]
    # Row-wise mean; a missing bound is skipped rather than propagated.
    gas['mwh_mean'] = price_bounds.mean(axis=1)
    return gas

In [8]:
def add_days_to_dates(df, date_column, days):
    """Shift ``df[date_column]`` forward by ``days`` days, in place.

    Nothing is returned; the caller's frame is updated directly.
    """
    offset = pd.Timedelta(days=days)
    shifted = df[date_column] + offset
    df[date_column] = shifted

In [9]:
def generate_code(row):
    """Combine ``county``, ``product_type`` and ``is_business`` into one numeric code.

    ``row`` is any mapping-like object (e.g. a DataFrame row) exposing those
    three keys. The result is ``county * 7 + product_type * 2 + is_business``.
    """
    # NOTE(review): this formula is not injective in general, e.g.
    # (county=0, product_type=3, is_business=1) and (county=1, 0, 0) both give 7.
    # Left unchanged because the per-code model file names depend on it.
    county_part = row['county'] * 7
    product_part = row['product_type'] * 2
    return county_part + product_part + row['is_business']

In [10]:
def handle_outliers(elec, threshold=6):
    """Replace outlying electricity prices with the previous valid price, in place.

    Outliers are detected with a MAD-based modified z-score:
    ``0.6745 * (price - median) / MAD``, where MAD is the median absolute
    deviation from the median. Prices whose absolute score exceeds
    ``threshold`` are set to NaN, then every NaN in ``euros_per_mwh``
    (including ones that were already present) is forward-filled.

    Parameters
    ----------
    elec : pandas.DataFrame
        Must contain ``euros_per_mwh``. Modified in place; a ``z_score``
        column is added.
    threshold : float, optional
        Absolute modified z-score above which a price counts as an outlier.

    Returns
    -------
    None

    Notes
    -----
    A leading outlier/NaN has no previous value to copy and therefore stays NaN.
    """
    prices = elec['euros_per_mwh']
    median_val = prices.median()

    # Series.median() skips NaN. The previous np.median() returned NaN as soon
    # as one price was missing, which silently disabled outlier detection.
    mad_val = (prices - median_val).abs().median()

    if pd.notna(mad_val) and mad_val > 0:
        elec['z_score'] = 0.6745 * ((prices - median_val) / mad_val)
    else:
        # MAD is zero (more than half the prices are identical) or undefined
        # (no valid prices): the score cannot be computed, so flag nothing
        # instead of dividing by zero.
        elec['z_score'] = 0.0

    outliers = elec['z_score'].abs() > threshold
    elec.loc[outliers, 'euros_per_mwh'] = np.nan

    # .ffill() replaces the deprecated fillna(method='ffill').
    elec['euros_per_mwh'] = elec['euros_per_mwh'].ffill()

In [11]:
def merge_dataframes(test, elec, gas, client, forecast, calendar):
    """Left-join every auxiliary table onto ``test`` and return the combined frame.

    Joins, in order:
      1. ``elec``     on ``prediction_datetime`` == ``forecast_date``
      2. ``gas``      on ``datetime_date`` == ``forecast_date``
      3. ``client``   on (``datetime_date``, ``code``) == (``date``, ``code``)
      4. ``forecast`` on (``prediction_datetime``, ``county``) ==
                         (``forecast_datetime``, ``county_code``)
      5. ``calendar`` on ``datetime_date`` == ``date``

    All joins are ``how='left'``, so no row of ``test`` is dropped. None of
    the inputs is modified.
    """
    # Only these columns are taken from the client and calendar tables.
    client_subset = client[['date', 'code', 'eic_count', 'installed_capacity']]
    calendar_subset = calendar[['date', 'working_day']]

    with_elec = test.merge(
        elec,
        how='left',
        left_on='prediction_datetime',
        right_on='forecast_date',
        suffixes=('', '_elec'),
    )

    with_gas = with_elec.merge(
        gas,
        how='left',
        left_on='datetime_date',
        right_on='forecast_date',
        suffixes=('', '_gas'),
    )

    with_client = with_gas.merge(
        client_subset,
        how='left',
        left_on=['datetime_date', 'code'],
        right_on=['date', 'code'],
        suffixes=('_test_solar_', '_client'),
    )

    with_weather = with_client.merge(
        forecast,
        how='left',
        left_on=['prediction_datetime', 'county'],
        right_on=['forecast_datetime', 'county_code'],
        suffixes=('_forecast_weather_', '_forecast_weather2_'),
    )

    # Default suffixes apply here: the two 'date' columns become date_x / date_y.
    return with_weather.merge(
        calendar_subset,
        how='left',
        left_on='datetime_date',
        right_on='date',
    )

In [12]:
def add_model_column(test, code_list):
    """Record, per row, which pickled model file should produce the prediction.

    * ``is_consumption == 0``  -> ``'catboost_model.pkl'``
    * otherwise, if ``code`` appears in ``code_list['code']``
                               -> ``'model_<code>.pkl'``
    * otherwise the ``model`` value is left untouched (NaN for a new column).

    The assignment is vectorized, replacing a per-row ``iterrows`` loop. The
    ``model`` column is now always created, even when no row matches either
    rule; previously it could be missing entirely.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``is_consumption`` and ``code``. Modified in place.
    code_list : pandas.DataFrame
        Must contain a ``code`` column listing the codes that have a
        dedicated model.

    Returns
    -------
    pandas.DataFrame
        The same ``test`` object.
    """
    if 'model' not in test.columns:
        # object dtype so string file names can be written without dtype upcasting.
        test['model'] = pd.Series(np.nan, index=test.index, dtype=object)

    uses_shared_model = test['is_consumption'] == 0
    has_own_model = ~uses_shared_model & test['code'].isin(code_list['code'])

    test.loc[uses_shared_model, 'model'] = 'catboost_model.pkl'

    # Cast through int64 so a float-typed code column yields 'model_5.pkl',
    # not 'model_5.0.pkl'.
    own_codes = test.loc[has_own_model, 'code'].astype('int64').astype(str)
    test.loc[has_own_model, 'model'] = 'model_' + own_codes + '.pkl'

    return test

In [13]:
def preprocess_test(test):
    """Select the columns needed downstream and cast them to fixed dtypes.

    The result is a new frame: ``.astype`` returns a copy, so the input is
    left untouched. The previous version assigned columns one by one on a
    slice of the input, which is the chained-assignment pattern pandas warns
    about.

    Parameters
    ----------
    test : pandas.DataFrame
        Merged frame containing at least every column in ``cols_to_keep``.

    Returns
    -------
    pandas.DataFrame
        Only the ``cols_to_keep`` columns, in that order. ``eic_count`` is
        float; the identifier and calendar columns are int64; price and
        weather columns keep their incoming dtype.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # Columns to keep
    cols_to_keep = ['row_id', 'prediction_datetime', 'hour', 'week_num', 'working_day',
                    'county', 'is_business', 'product_type', 'is_consumption', 'code',
                    'prediction_unit_id', 'euros_per_mwh', 'mwh_mean', 'eic_count',
                    'installed_capacity', 'temperature', 'dewpoint', 'snowfall',
                    'cloudcover_total', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_high',
                    'surface_solar_radiation_downwards', 'year', 'month', 'day', 'model']

    int_columns = ['hour', 'row_id', 'week_num', 'working_day', 'county', 'is_business',
                   'product_type', 'is_consumption', 'code', 'prediction_unit_id',
                   'year', 'month', 'day']

    target_dtypes = {'eic_count': float}
    target_dtypes.update({name: 'int64' for name in int_columns})

    return test.loc[:, cols_to_keep].astype(target_dtypes)

In [14]:
def make_predictions(test,
                     catboost_model_path='/kaggle/input/catboost-model/catboost_model.pkl',
                     models_dir='/kaggle/input/models/'):
    """Predict one target per row of ``test`` with the model named in its ``model`` column.

    Rows whose ``model`` is ``'catboost_model.pkl'`` are scored by the shared
    model, using every column except the identifier/bookkeeping ones. All
    other rows are scored by the per-code model file named in ``model``,
    using only ``temperature``, ``working_day`` and ``hour``.

    Each per-code model is unpickled once and cached for the duration of the
    call; previously the file was re-read for every single row.

    Parameters
    ----------
    test : pandas.DataFrame
        Output of the preprocessing step; must contain ``row_id``, ``model``
        and the feature columns.
    catboost_model_path : str, optional
        Pickle file of the shared model.
    models_dir : str, optional
        Directory containing the per-code model pickles.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` and ``target``, one row per input row. The columns
        are present even when ``test`` is empty.
    """
    # SECURITY: pickle.load executes arbitrary code contained in the file.
    # Only point these paths at model files you produced yourself.
    with open(catboost_model_path, 'rb') as model_file:
        catboost_model = pickle.load(model_file)

    # Columns that are identifiers/bookkeeping, not features of the shared model.
    non_feature_columns = ['row_id', 'prediction_datetime', 'model',
                           'is_consumption', 'is_business', 'prediction_unit_id']

    per_code_models = {}  # model file name -> unpickled model
    predictions = []

    for _, row in test.iterrows():
        if row['model'] == 'catboost_model.pkl':
            features = row.drop(non_feature_columns)
            # predict() expects a 2-D array: one row, all remaining columns.
            prediction = catboost_model.predict(features.values.reshape(1, -1))[0]
        else:
            model_file_name = row['model']
            if model_file_name not in per_code_models:
                model_file_path = os.path.join(models_dir, model_file_name)
                with open(model_file_path, 'rb') as model_file:
                    per_code_models[model_file_name] = pickle.load(model_file)

            features = np.array([row['temperature'], row['working_day'], row['hour']]).reshape(1, -1)
            prediction = per_code_models[model_file_name].predict(features)[0]

        predictions.append({'row_id': int(row['row_id']), 'target': prediction})

    # Explicit columns keep the output schema stable for an empty input.
    return pd.DataFrame(predictions, columns=['row_id', 'target'])

In [15]:
env = enefit.make_env()
iter_test = env.iter_test()

# The lookup tables do not change between batches: read and parse them once
# instead of re-reading three CSV files on every iteration.
calendar, gps_point, code_list = load_data()
calendar['date'] = pd.to_datetime(calendar['date'], errors='coerce')

# revealed_targets and historical_weather are delivered by the API but are not
# used by this pipeline.
for (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    # Keep only the forecast rows at each county's main GPS point
    forecast = preprocess_forecast(forecast_weather, gps_point)

    # Convert date-like columns to datetimes
    client = convert_dates(client)
    forecast = convert_dates(forecast)
    gas_prices = convert_dates(gas_prices)
    electricity_prices = convert_dates(electricity_prices)
    test = convert_dates(test)

    # Add date/time feature columns to test
    add_columns_to_test(test)

    # Add mean mwh to gas
    add_columns_to_gas(gas_prices)

    # Add code column to test and client
    client['code'] = client.apply(generate_code, axis=1)
    test['code'] = test.apply(generate_code, axis=1)

    # Shift the dates of client, gas and electricity data forward so they
    # line up with the join keys used in merge_dataframes
    add_days_to_dates(client, 'date', 2)
    add_days_to_dates(gas_prices, 'forecast_date', 1)
    add_days_to_dates(electricity_prices, 'forecast_date', 1)

    # Resolve outliers in electricity_prices (in place)
    handle_outliers(electricity_prices)

    # Merge dataframes
    test = merge_dataframes(test, electricity_prices, gas_prices, client, forecast, calendar)

    # Add the 'model' column to the test dataframe
    test = add_model_column(test, code_list)

    # Additional preprocessing of test
    test = preprocess_test(test)

    # Make predictions
    sample_prediction = make_predictions(test)

    # Submit predictions
    env.predict(sample_prediction)

print("All processing completed.")

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
All processing completed.
