## Script for gen files

In [3]:
import os
import pandas as pd
import re
import json
from functools import reduce


# Directory where raw CSV files are stored
directory = "../jupyter_notebook/data_samples"

# Parsing date strings, ignoring any timezone information and converting them to datetime objects
date_parser = lambda x: pd.to_datetime(x[:22])

# List to hold all the dataframes
dataframes = []

# Dictionary to hold all the dataframes
dict_of_dfs = {}

#Types of energy that we considered renewable
energy_type_codes_to_filter = ['B01', 'B09', 'B10', 'B11', 'B12', 'B13', 'B15','B16', 'B18', 'B19']


# Iterate through all files in the directory
for filename in os.listdir(directory):
    
    # Verifies that files we are reading are the ones of the list
    match = re.match(r'gen_[A-Z]{2}_(\w{3})\.csv', filename)

    if match:
        
        # Extract the name of the energy_type
        energy_type = match.group(1)

        #If the energy that we are searching matches with the file it continues
        if energy_type in energy_type_codes_to_filter:
            
            # Read the CSV file
            df = pd.read_csv(os.path.join(directory, filename), converters={'EndTime': date_parser})

            # Extract country and energy type from filename
            _, country, energy_type = filename.split('_')
            energy_type = energy_type.replace('.csv', '') # Remove the file extension

            # Add country and energy type as new columns
            df['CountryCode'] = country
            df['EnergyTypeCode'] = energy_type
            
            #Drop duplicates (duplicates have NaN values in columns)
            df.dropna(inplace=True)

            #EndTime to datetime datatype
            df['EndTime'] = pd.to_datetime(df['EndTime'])

            # Difference between periods.
            df['TimeDifference'] = df['EndTime'].diff().dt.total_seconds() / 60

            # For each datasets consider the smallest value as the period
            sampling_period = int(df.loc[(df['TimeDifference'] > 0) & (df['TimeDifference'] <= 60), 'TimeDifference'].min())

            # Set "EndTime" as index
            df.set_index('EndTime', inplace=True)
            
            #Verifier for 15,30,60 min periods (can be delete)
            sampling_period=str(sampling_period)
            df = df.resample(f'{sampling_period}T').asfreq()
            
            # We make groups for each hour, so we can detect where are missing values
            df['DateHour'] = df.index.floor('H')
            
            # Reset_index
            df.reset_index('EndTime', inplace=True)

            #Group by DateHour and sum to see if the quantity is positive
            grouped_df = df.groupby('DateHour')['quantity'].sum().reset_index()
            
            grouped_df = grouped_df.rename(columns={'quantity': 'HourlySum'})

            df = pd.merge(df, grouped_df, how='left', left_on='DateHour', right_on='DateHour')
            
            # We stay with values that have a group (HoyrlySum!=0) and that are NaN values (quantity NaN)
            df=df[~((df['quantity'].isnull())&(df['HourlySum']==0))]

            df.interpolate(method='linear', limit_direction='both', inplace=True)

            df.set_index('EndTime',inplace=True)

            numeric_cols = df.select_dtypes(include=['number'])
            categorical_cols = df.select_dtypes(exclude=['number', 'datetime64[ns]', 'bool'])

            # Resample the numeric columns and sum
            resampled_df_num = numeric_cols.resample('H').sum()

            # Resample the categorical columns.
            # Here, we take the first value. Adjust the method if needed (e.g., 'last', or a custom function to get the mode)
            resampled_df_cat= categorical_cols.resample('H').last()

            resampled_df = pd.concat([resampled_df_num, resampled_df_cat], axis=1)

            resampled_df.dropna(inplace=True)

            # Append the dataframe to the list
            #dataframes.append(resampled_df)

            dict_of_dfs[f'{country}_{energy_type}']=resampled_df

            # Concatenate all dataframes (if needed)
            #final_df = pd.concat (dataframes)

# Concatenate all dataframes (if needed)
#final_df = pd.concat(dataframes)

# From the EDA (no info given)
dataframes_to_drop = ['SP_B10', 'SE_B13']

for dataframe_name in dataframes_to_drop:
    dict_of_dfs.pop(dataframe_name, None)

#We group by countries.
dic_gen = {}
for name, df in dict_of_dfs.items():
    gen_key = name.split('_')[0]
    if gen_key not in dic_gen:
        dic_gen[gen_key] = {}  # Inicializa un diccionario vacío para esta clave si no existe
    df.rename(columns={'quantity':f'quantity_{name}'},inplace=True)
    dic_gen[gen_key][name] = df[[f'quantity_{name}']]

dict_of_dfs_gen={}

for name,df in dic_gen.items():
    dataframes_inner = list(dic_gen[name].values())
    result_inner = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='inner'), dataframes_inner)
    
    result_inner['quantity_sum'] = result_inner.filter(like='quantity').sum(axis=1)
    
    dict_of_dfs_gen[name]=result_inner[['quantity_sum']]
                


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_inner['quantity_sum'] = result_inner.filter(like='quantity').sum(axis=1)


## Script for load files

In [5]:

# Directory where raw CSV files are stored
directory = "../jupyter_notebook/data_samples"
# Parsing date strings, ignoring any timezone information and converting them to datetime objects
date_parser = lambda x: pd.to_datetime(x[:22])
# List to hold all the dataframes
dict_of_dfs_load = {}

# Iterate through all files in the directory
for filename in os.listdir(directory):

    if re.match(r'load_[A-Z]{2}+\.csv', filename):
        
        # Read the CSV file
        df = pd.read_csv(os.path.join(directory, filename), converters={'EndTime': date_parser}).set_index('EndTime')
        
        _, country = filename.split('_')
        
        country = country.replace('.csv', '') # Remove the file extension
                
        numeric_cols = df.select_dtypes(include=['number'])
        categorical_cols = df.select_dtypes(exclude=['number', 'datetime64[ns]', 'bool'])
        
        # Resample the numeric columns and sum
        resampled_df_num = numeric_cols.resample('H').sum()

        # Resample the categorical columns.
        # Here, we take the first value. Adjust the method if needed (e.g., 'last', or a custom function to get the mode)
        resampled_df_cat= categorical_cols.resample('H').last()
        
        resampled_df = pd.concat([resampled_df_num, resampled_df_cat], axis=1)
        
        dict_of_dfs_load[country]=resampled_df[['Load']]
        

## Surplus calculation

In [40]:
generation=dict_of_dfs_gen.copy()
load=dict_of_dfs_load.copy()

In [41]:
final_dict={}
for name in generation:
    result=pd.merge(generation[name],load[name],left_index=True,right_index=True,how='inner')
    final_dict[name]=result

In [42]:
surplus_per_country={}
for name in final_dict:
    final_dict[name]['surplus']=final_dict[name]['quantity_sum']-final_dict[name]['Load']


# csv

In [64]:
concatenated_df = pd.concat(final_dict.values(), keys=final_dict.keys(), names=['country_code'])

# Reiniciar el índice si es necesario
concatenated_df.reset_index(inplace=True)

In [67]:
concatenated_df.to_csv('surplus_base.csv',index=False)