In [91]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import pyarrow.parquet as pq 
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [22]:
def get_file_path(filename, starting_directory='.'):
    paths = []
    for root, directories, files in os.walk(starting_directory):
        for file in files:
            if file == filename:
                paths.append(os.path.join(root, file))
                return paths
    raise FileNotFoundError(f"Could not find file {filename} in directory {starting_directory}")
get_file_path('X_train_observed.parquet')

['./A/X_train_observed.parquet']

# Data Extraction & Exploration

In [None]:
# Load the metadata of the parquet files
metadata_a = pq.read_metadata('A/X_train_observed.parquet')
metadata_b = pq.read_metadata('B/X_train_observed.parquet')
metadata_c = pq.read_metadata('C/X_train_observed.parquet')

get_file_path

# Get the schema of the parquet files
schema_a = metadata_a.schema
schema_b = metadata_b.schema
schema_c = metadata_c.schema

print("Schema for file A:")
print(schema_a)
print("\nSchema for file B:")
print(schema_b)
print("\nSchema for file C:")
print(schema_c)

In [46]:
# Load the data
df_A = pq.read_table('A/X_train_observed.parquet').to_pandas()
df_B = pq.read_table('B/X_train_observed.parquet').to_pandas()
df_C = pq.read_table('C/X_train_observed.parquet').to_pandas()
df_B_targets = pq.read_table('B/train_targets.parquet').to_pandas()
df_B_targets = df_B_targets.rename(columns={'time': 'date_forecast'})

# Inspect the data
#print(df_A.info())
print(df_B_targets.info())
#print(df_C.info())
column_names = df_B.columns.tolist()
print(column_names)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32848 entries, 0 to 32847
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date_forecast   32848 non-null  datetime64[us]
 1   pv_measurement  32844 non-null  float64       
dtypes: datetime64[us](1), float64(1)
memory usage: 513.4 KB
None
['date_forecast', 'absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx', 'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J', 'effective_cloud_cover:p', 'elevation:m', 'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx', 'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_p

# DATA PREPROCESSING

In [88]:
# TODO:
# - Create method for joining target dataset with features dataset 
# - Create method for creating horizon dataframes
# - identify which features to use for lag features 

# Class for general feature processing
class FeatureProcessingClass():
    def __init__(self):

        ###--- DATA SPECIFIC CLASS VARIABLES---###

        # all features 
        self.features = ['date_forecast', 
                          'absolute_humidity_2m:gm3', 
                          'air_density_2m:kgm3', 
                          'ceiling_height_agl:m', 
                          'clear_sky_energy_1h:J', 
                          'clear_sky_rad:W', 
                          'cloud_base_agl:m', 
                          'dew_or_rime:idx', 
                          'dew_point_2m:K', 
                          'diffuse_rad:W', 
                          'diffuse_rad_1h:J', 
                          'direct_rad:W', 
                          'direct_rad_1h:J', 
                          'effective_cloud_cover:p', 
                          'elevation:m', 
                          'fresh_snow_12h:cm', 
                          'fresh_snow_1h:cm', 
                          'fresh_snow_24h:cm', 
                          'fresh_snow_3h:cm', 
                          'fresh_snow_6h:cm', 
                          'is_day:idx', 
                          'is_in_shadow:idx', 
                          'msl_pressure:hPa', 
                          'precip_5min:mm', 
                          'precip_type_5min:idx', 
                          'pressure_100m:hPa', 
                          'pressure_50m:hPa', 
                          'prob_rime:p', 
                          'rain_water:kgm2', 
                          'relative_humidity_1000hPa:p', 
                          'sfc_pressure:hPa', 
                          'snow_density:kgm3', 
                          'snow_depth:cm', 
                          'snow_drift:idx', 
                          'snow_melt_10min:mm', 
                          'snow_water:kgm2', 
                          'sun_azimuth:d', 
                          'sun_elevation:d', 
                          'super_cooled_liquid_water:kgm2', 
                          't_1000hPa:K', 
                          'total_cloud_cover:p', 
                          'visibility:m', 
                          'wind_speed_10m:ms', 
                          'wind_speed_u_10m:ms', 
                          'wind_speed_v_10m:ms', 
                          'wind_speed_w_1000hPa:ms']

        # Categorical columns (specify for XGBoost)
        self.categorical_features = ['dew_or_rime:idx',
                                    'is_day:idx', 
                                    'is_in_shadow:idx', 
                                    'precip_type_5min:idx', 
                                    'snow_drift:idx']
        
        # fetaures not suited for lag features 
        self.non_lag_features = ['date_forecast',  'ceiling_height_agl:m', 'elevation:m'] + self.categorical_features

        # features to be replicated with lagged values
        self.lag_features = [feature for feature in self.features if feature not in self.non_lag_features]
        self.lag_values = [1, 12, 24]
        
        # join features
        self.date_forecast = 'date_forecast'
        
        # time features
        self.time_features = ['year', 'quarter', 'month', 'week', 'hour', 'day_of_year', 'day', 'weekday', 'is_weekend']
        
        # target features
        self.target_features = 'pv_measurement'
        
        # Columns of latitude & longitude
        self.lat_lon_columns = ['latitude', 'longitude'] # will not be used for now


    ###--- METHODS FOR JOINING DATASETS ---###


    # method for modifying column names that datasets to be joined have in common
    def add_suffix_to_column_names(self, df: pd.DataFrame, suffix: str, columns_no_change: list[str]): # Needed when we combine with other datasets than the target dataset
        '''
        Change column names by given suffix, keep columns_no_change, and return back the data

        PARAMS:
        - df: data
        - suffix: suffixes to add to column names
        - columns_no_change: list of column names who should not be changed
        '''
        df.columns = [col + suffix 
                      if col not in columns_no_change
                      else col
                      for col in df.columns]
        return df
    

    # generalized method for cropping dataset rows so one or more datasets to be joined match in time
    def crop_datasets(self, df_list: list[pd.DataFrame]): # mabye not needed since we are using inner join in join_datasets function
        '''
        Crop datasets so they match in time

        PARAMS:
        - df_list: list of dataframes to be cropped
        RETURNS:
        - df_cropped: list of cropped dataframes with matching time intervals
        '''
        # Find the first and last date in all datasets 
        min_dates = [df['date_forecast'].min() for df in df_list] 
        max_dates = [df['date_forecast'].max() for df in df_list] 
        interval = [max(min_dates), min(max_dates)]
        # Crop the datasets
        df_cropped = [df[(df['date_forecast'] >= interval[0]) & (df['date_forecast'] <= interval[1])] for df in df_list]
        return df_cropped
        
    
    # method for joining target dataset with features dataset
    def join_datasets(self, df_list: list[pd.DataFrame], on='date_forecast'):
        '''
        Join dataset in list on 'date_forecast' column

        PARAMS:
        - df: data
        - target_df: target data
        RETURNS:
        - df: joined data
        '''
        # Check if all datasets have the same 'date_forecast' column
        if not all([on in df.columns for df in df_list]):
            raise ValueError('Not all datasets have'+ on +'column') # changed from f-string to string
        
        # Join datasets
        df = df_list[0]
        for i in range(1, len(df_list)):
            df = df.merge(df_list[i], on=on, how='left') # changed from inner to left join
        return df
    

    ###--- METHODS FOR MODIFYING FEATURES ---###


    def impute_missing_values(self, df: pd.DataFrame, time_column='date_forecast'):
        '''
        Impute missing values with mean of the two closest non-NaN values

        PARAMS:
        - df: pandas data
        '''
        # Create a KNNImputer
        imputer = KNNImputer(n_neighbors=2)

        # Save the 'date_forecast' column and remove it from the DataFrame
        date_forecast = df[time_column]
        df = df.drop(columns=[time_column])

        # Fit and transform the DataFrame
        df_imputed = imputer.fit_transform(df)
        df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

        # Add the 'date_forecast' column back to the DataFrame
        df_imputed[time_column] = date_forecast

        return df_imputed
    

    ###--- METHODS FOR CREATING NEW FEATURES ---###

    
    def create_time_features(self, df: pd.DataFrame):
        '''
        Create data features based on datetime column

        PARAMS:
        - df: data
        RETURNS:
        - df: data with time features
        '''

        # Check if 'date_forecast' column exists
        if 'date_forecast' not in df.columns.tolist():
            raise ValueError("DataFrame does not have 'date_forecast' column")

        # Try to convert 'date_forecast' to datetime
        try:
            df['date_forecast'] = pd.to_datetime(df['date_forecast'])
        except Exception as e:
            raise ValueError("Cannot convert 'date_forecast' to datetime: " + str(e))

        # time period features
        df['date'] = df['date_forecast'].dt.normalize() # mabye not needed
        df['year'] = df['date_forecast'].dt.year
        df['quarter'] = df['date_forecast'].dt.quarter
        df['month'] = df['date_forecast'].dt.month
        df['week'] = df['date_forecast'].dt.isocalendar().week
        df['hour'] = df['date_forecast'].dt.hour

        # day features
        df['day_of_year'] = df['date_forecast'].dt.day_of_year # mabye not needed
        df['day'] = df['date_forecast'].dt.day # mabye not needed
        df['weekday'] = df['date_forecast'].dt.weekday

        # boolean features
        df['is_weekend'] = df['weekday'] > 5

        return df
    

    def create_lag_features(self, df: pd.DataFrame, lag_features: list[str], lag_values: list[int]):
        '''
        Create lag features for given columns

        PARAMS:
        - df: data
        - lag_features: list of columns to create lag features for
        - lag_values: list of lag values
        RETURNS:
        - df: data with lagged features
        '''
        # checking that not any lag features is categorical
        if any(feature in lag_features for feature in self.categorical_features):
            catagorical_feature_list = [feature for feature in lag_features if feature in self.categorical_features]
            raise ValueError(f"Cannot create lag features for categorical features {catagorical_feature_list}")

        for feature in lag_features:
            for lag in lag_values:
                df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
        return df
    

    ###--- METHODS FOR DATA CONVERSION/MANIPULATION ---###


    def convert_to_categorical(self, df: pd.DataFrame, categorical_features: list[str]):
        '''
        Convert columns to categorical dtype

        PARAMS:
        - df: data
        - catagorical_features: list of columns to convert to categorical
        '''
        return  df[categorical_features].apply(lambda x: x.astype('category'))
    

    def convert_to_numerical(self, df: pd.DataFrame, numerical_features: list[str]):
        '''
        Convert columns to numerical dtype
        
        PARAMS:
        - df: data
        - numerical_features: list of columns to convert to numerical
        '''
        return  df[numerical_features].apply(lambda x: x.astype('float'))
    

    ###--- METHODS FOR CREATING SHIFTED DATAFRAME FOR EACH FORECASTED HOUR ---###


    def create_horizon_dataframes(self, df: pd.DataFrame, horizon = 24):
        '''
        Create a shifted dataframe for each forecasted hour

        PARAMS:
        - df: data
        - horizon: number of timesteps forecasted
        '''
        # dictionary to store dataframes - Key: forecasted hour, Value: shifted dataframe
        dataframes = {}
        for h in range(1, horizon + 1):
            df_h = df.copy()
            df_h[f'{self.target_features}_horizon_{h}'] = df_h[self.target_features].shift(-h)
            for feature in self.time_features:
                df_h[feature] = df_h[feature].shift(-h)
            df_h = df_h.dropna()
            dataframes[h] = df_h.drop(columns=[self.target_features])
    
        return dataframes
        

    ###--- CALL METHOD FOR INITIALISING CLASS AS FUNCTION ---###


    def __call__(self, data: list[pd.DataFrame]):
        '''
        Processing of features from all datasets 
        PARAMS: 
        - data: list of processed dataframes containg all datasets that are to be joined and preprocessed
        RETURNS:
        - df: List of 24 dataframes with features for each forecasted hour
        '''
        # joining datasets
        data = self.join_datasets(data)
        # Impute missing values
        data = self.impute_missing_values(data)
        # Create features
        data = self.create_lag_features(data, self.lag_features, self.lag_values)
        data = self.create_time_features(data)
        # Change columns to categorical for XGBoost
        data[self.categorical_features] = data[self.categorical_features].astype('category')
        # Create horizon dataframes
        data = self.create_horizon_dataframes(data)
        return data

# MODEL IMPLEMENTATION

In [None]:
class xgboost_forecasting():
    def __init__(self):
        self.model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01, max_depth=10, subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42) 

    def fitmod(self, data: list[pd.DataFrame]):
        pass 

    def __call__(self, data: list[pd.DataFrame]):
        pass

# MODEL FITTING

In [90]:
FeatureProcessor = FeatureProcessingClass()

data = FeatureProcessor([df_B, df_B_targets])

  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df[feature + '_lag_' + str(lag)] = df[feature].shift(lag)
  df['date'] = df['date_forecast'].dt.no