In [10]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import os
import sys
# Add `src` directory to the path.
sys.path.append('../src')

# import  utils model fr
from utils import *
from sqlalchemy import create_engine

### Add default variable

In [11]:
# Define the list of codes representing green energy
GREEN_ENERGY = [
    "B01", "B09", "B10", "B11", "B12",
    "B13", "B15", "B16", "B18", "B19"
]
folder_path = '../data/ingestion_data'
# Initialize an empty DataFrame to hold concatenated data
concatenated_load_df = pd.DataFrame()
concatenated_gen_df = pd.DataFrame()
# Get file name by prefix 
load_all_files = list_files_containing_char(folder_path, 'load')
gen_all_files = list_files_containing_char(folder_path, 'gen')

### Load data from ingestion source 
1. Gen data (green energy produce ) -> Extract only green data based on green_tag
2. Load data  (consumption)

In [12]:
# Loop through the gen files and concatenate only green energy file
for file_name in gen_all_files:
    if file_name.endswith('.csv') and any(code in file_name for code in GREEN_ENERGY):
        # Read the csv file
        df = pd.read_csv(os.path.join(folder_path, file_name))
        
        # Split the file name and create new columns
        split_name = file_name.replace('.csv', '').split('_')
        df['Country'] = split_name[1]
        
        # Concatenate the DataFrame to the main concatenated_df
        concatenated_gen_df = pd.concat([concatenated_gen_df, df])
        
# Delete AreaID because already have country code
concatenated_gen_df = concatenated_gen_df.drop(columns=['AreaID'])

# Loop through the load files and concatenate
for file_name in load_all_files:
    if file_name.endswith('.csv'):
        # Read the csv file
        df = pd.read_csv(os.path.join(folder_path, file_name))
        
        # Split the file name and create new columns
        split_name = file_name.replace('.csv', '').split('_')
        df['Country'] = split_name[1]
        
        # Concatenate the DataFrame to the main concatenated_df
        concatenated_load_df = pd.concat([concatenated_load_df, df])
        
# Delete AreaID because already have country code
concatenated_load_df = concatenated_load_df.drop(columns=['AreaID'])



# Save the concatenated DataFrame to a new csv file
# output_file_path = os.path.join(folder_path, 'concatenated_green_energy.csv')
# concatenated_df.to_csv(output_file_path, index=False)

# concatenated_df

In [13]:
concatenated_gen_df

Unnamed: 0,StartTime,EndTime,UnitName,PsrType,quantity,Country
0,2021-12-31T23:45+00:00Z,2022-01-01T00:00+00:00Z,MAW,B09,26,DE
1,2022-01-01T00:00+00:00Z,2022-01-01T00:15+00:00Z,MAW,B09,26,DE
2,2022-01-01T00:15+00:00Z,2022-01-01T00:30+00:00Z,MAW,B09,26,DE
3,2022-01-01T00:30+00:00Z,2022-01-01T00:45+00:00Z,MAW,B09,26,DE
4,2022-01-01T00:45+00:00Z,2022-01-01T01:00+00:00Z,MAW,B09,26,DE
...,...,...,...,...,...,...
70075,2022-12-31T22:30+00:00Z,2022-12-31T22:45+00:00Z,MAW,B10,3809,DE
70076,2022-12-31T22:45+00:00Z,2022-12-31T23:00+00:00Z,MAW,B10,1699,DE
70077,2022-12-31T23:00+00:00Z,2022-12-31T23:15+00:00Z,MAW,B10,1774,DE
70078,2022-12-31T23:15+00:00Z,2022-12-31T23:30+00:00Z,MAW,B10,2185,DE


In [14]:
concatenated_load_df

Unnamed: 0,StartTime,EndTime,UnitName,Load,Country
0,2021-12-31T23:00+00:00Z,2022-01-01T00:00+00:00Z,MAW,20827,SP
1,2022-01-01T00:00+00:00Z,2022-01-01T01:00+00:00Z,MAW,19530,SP
2,2022-01-01T01:00+00:00Z,2022-01-01T02:00+00:00Z,MAW,18383,SP
3,2022-01-01T02:00+00:00Z,2022-01-01T03:00+00:00Z,MAW,17680,SP
4,2022-01-01T03:00+00:00Z,2022-01-01T04:00+00:00Z,MAW,17396,SP
...,...,...,...,...,...
35035,2022-12-31T22:30+00:00Z,2022-12-31T22:45+00:00Z,MAW,4071,HU
35036,2022-12-31T22:45+00:00Z,2022-12-31T23:00+00:00Z,MAW,3961,HU
35037,2022-12-31T23:00+00:00Z,2022-12-31T23:15+00:00Z,MAW,3938,HU
35038,2022-12-31T23:15+00:00Z,2022-12-31T23:30+00:00Z,MAW,3929,HU


### Using SQL Lite to discover data easily
1. Import gen_data into energy_data_gen table
2. Import load_data into energy_data_load table

In [15]:
engine = create_engine('sqlite://', echo=False)

In [16]:
concatenated_gen_df.to_sql('energy_data_gen', con=engine, index=False)

1466395

In [17]:
concatenated_load_df.to_sql('energy_data_load', con=engine, index=False)

174330

#### Testing energy_data_gen table and playground with dataset

In [18]:
select_all_energy_data_gen = load_query('Select_all_energy_data_gen')
select_all_energy_data_gen
#pd.set_option('display.max_rows', 100)

'with data_raw as (\n  SELECT\n  substr(StartTime, 1, 10) as StartDate,\n  substr(EndTime, 1, 10) as EndDate,\n  substr(StartTime, 12, 2) as dataHour,\n  *\nFROM energy_data_gen\n)\n\nSELECT * FROM data_raw;'

#### Summarize data 
Even StartTime and EndTime include 4 timeframes (15 minutes each), but it's still in the same hour. -> Sum by dataHour

In [19]:
pd.read_sql_query(select_all_energy_data_gen, con=engine)

Unnamed: 0,StartDate,EndDate,dataHour,StartTime,EndTime,UnitName,PsrType,quantity,Country
0,2021-12-31,2022-01-01,23,2021-12-31T23:45+00:00Z,2022-01-01T00:00+00:00Z,MAW,B09,26,DE
1,2022-01-01,2022-01-01,00,2022-01-01T00:00+00:00Z,2022-01-01T00:15+00:00Z,MAW,B09,26,DE
2,2022-01-01,2022-01-01,00,2022-01-01T00:15+00:00Z,2022-01-01T00:30+00:00Z,MAW,B09,26,DE
3,2022-01-01,2022-01-01,00,2022-01-01T00:30+00:00Z,2022-01-01T00:45+00:00Z,MAW,B09,26,DE
4,2022-01-01,2022-01-01,00,2022-01-01T00:45+00:00Z,2022-01-01T01:00+00:00Z,MAW,B09,26,DE
...,...,...,...,...,...,...,...,...,...
1466390,2022-12-31,2022-12-31,22,2022-12-31T22:30+00:00Z,2022-12-31T22:45+00:00Z,MAW,B10,3809,DE
1466391,2022-12-31,2022-12-31,22,2022-12-31T22:45+00:00Z,2022-12-31T23:00+00:00Z,MAW,B10,1699,DE
1466392,2022-12-31,2022-12-31,23,2022-12-31T23:00+00:00Z,2022-12-31T23:15+00:00Z,MAW,B10,1774,DE
1466393,2022-12-31,2022-12-31,23,2022-12-31T23:15+00:00Z,2022-12-31T23:30+00:00Z,MAW,B10,2185,DE


#### Testing energy_data_load table and playground with dataset

In [22]:
Transform_load_data = load_query('Transform_load_data')
Transform_load_data

"with data_raw as (\n  SELECT\n  substr(StartTime, 1, 10) as StartDate,\n  substr(EndTime, 1, 10) as EndDate,\n  substr(StartTime, 12, 2) as dataHour,\n  *\nFROM energy_data_load\nWHERE StartDate <> '2021-12-31')\n\n  SELECT StartDate,EndDate,dataHour,\n  SUM(CASE WHEN Country = 'HU' THEN Load ELSE 0 END) AS load_HU,\n  SUM(CASE WHEN Country = 'IT' THEN Load ELSE 0 END) AS load_IT,\n  SUM(CASE WHEN Country = 'PO' THEN Load ELSE 0 END) AS load_PO,\n  SUM(CASE WHEN Country = 'SP' THEN Load ELSE 0 END) AS load_SP,\n  SUM(CASE WHEN Country = 'UK' THEN Load ELSE 0 END) AS load_UK,\n  SUM(CASE WHEN Country = 'DE' THEN Load ELSE 0 END) AS load_DE,\n  SUM(CASE WHEN Country = 'DK' THEN Load ELSE 0 END) AS load_DK,\n  SUM(CASE WHEN Country = 'SE' THEN Load ELSE 0 END) AS load_SE,\n  SUM(CASE WHEN Country = 'NE' THEN Load ELSE 0 END) AS load_NE\nFROM  data_raw\nGROUP BY 1,2,3;"

In [23]:
pd.read_sql_query(Transform_load_data, con=engine)

Unnamed: 0,StartDate,EndDate,dataHour,load_HU,load_IT,load_PO,load_SP,load_UK,load_DE,load_DK,load_SE,load_NE
0,2022-01-01,2022-01-01,00,16457,19756,13935,19530,1244,165125,3218,15331,40706
1,2022-01-01,2022-01-01,01,15426,18685,13579,18383,1131,160415,3126,15270,39465
2,2022-01-01,2022-01-01,02,14781,18124,13397,17680,1091,158035,3080,15150,38923
3,2022-01-01,2022-01-01,03,14630,18400,13364,17396,969,157016,3044,15387,38211
4,2022-01-01,2022-01-01,04,14688,19223,13449,17544,896,154552,3130,15737,38146
...,...,...,...,...,...,...,...,...,...,...,...,...
9119,2022-12-31,2022-12-31,19,17488,24809,15211,104484,0,184106,3713,14440,45646
9120,2022-12-31,2022-12-31,20,17075,23169,14641,97124,0,175401,3579,14201,43425
9121,2022-12-31,2022-12-31,21,16717,21857,13977,87316,0,168830,3520,13713,41766
9122,2022-12-31,2022-12-31,22,16261,20555,13272,82064,0,161541,3440,13772,40415


In [26]:

Transform_join_load_gen = load_query('Transform_join_load_gen')
Transform_join_load_gen

"with data_raw_load as (\nSELECT\n  substr(StartTime, 1, 10) as StartDate,\n  substr(EndTime, 1, 10) as EndDate,\n  substr(StartTime, 12, 2) as dataHour,\n  *\nFROM energy_data_load\nWHERE StartDate <> '2021-12-31'),\n\ndata_raw_gen as (\nSELECT\n  substr(StartTime, 1, 10) as StartDate,\n  substr(EndTime, 1, 10) as EndDate,\n  substr(StartTime, 12, 2) as dataHour,\n  *\nFROM energy_data_gen\nWHERE StartDate <> '2021-12-31'),\n\nagg_data_load as (\nSELECT StartDate,dataHour,\n  SUM(CASE WHEN Country = 'HU' THEN Load ELSE 0 END) AS load_HU,\n  SUM(CASE WHEN Country = 'IT' THEN Load ELSE 0 END) AS load_IT,\n  SUM(CASE WHEN Country = 'PO' THEN Load ELSE 0 END) AS load_PO,\n  SUM(CASE WHEN Country = 'SP' THEN Load ELSE 0 END) AS load_SP,\n  SUM(CASE WHEN Country = 'UK' THEN Load ELSE 0 END) AS load_UK,\n  SUM(CASE WHEN Country = 'DE' THEN Load ELSE 0 END) AS load_DE,\n  SUM(CASE WHEN Country = 'DK' THEN Load ELSE 0 END) AS load_DK,\n  SUM(CASE WHEN Country = 'SE' THEN Load ELSE 0 END) AS lo

In [27]:
consolidated_data = pd.read_sql_query(Transform_join_load_gen, con=engine)
consolidated_data

Unnamed: 0,StartDate,dataHour,green_energy_HU,green_energy_IT,green_energy_PO,green_energy_SP,green_energy_UK,green_energy_DE,green_energy_DK,green_energy_SE,green_energy_NE,load_HU,load_IT,load_PO,load_SP,load_UK,load_DE,load_DK,load_SE,load_NE
0,2022-01-01,00,1376,5745,4491,10827,0,157373,3605,11107,12768,16457,19756,13935,19530,1244,165125,3218,15331,40706
1,2022-01-01,01,1526,6228,4436,11140,0,151747,3309,11036,11131,15426,18685,13579,18383,1131,160415,3126,15270,39465
2,2022-01-01,02,1560,6407,4568,11361,0,147938,3043,10509,11324,14781,18124,13397,17680,1091,158035,3080,15150,38923
3,2022-01-01,03,1287,5760,4559,11247,0,141816,2852,10770,11290,14630,18400,13364,17396,969,157016,3044,15387,38211
4,2022-01-01,04,1309,5501,4374,10868,0,131970,2724,10852,11093,14688,19223,13449,17544,896,154552,3130,15737,38146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-12-31,19,851,4198,6959,67056,824,175631,1765,10179,17241,17488,24809,15211,104484,0,184106,3713,14440,45646
8756,2022-12-31,20,888,3676,6910,62432,830,176499,1593,9586,17233,17075,23169,14641,97124,0,175401,3579,14201,43425
8757,2022-12-31,21,807,3641,7054,61096,676,175678,1788,9148,17129,16717,21857,13977,87316,0,168830,3520,13713,41766
8758,2022-12-31,22,754,3422,5946,55252,489,165104,2242,8806,15939,16261,20555,13272,82064,0,161541,3440,13772,40415


In [18]:
consolidated_data.to_sql('consolidated_data', con=engine, index=False)

9124

In [30]:

consolidated_data['index'] = (consolidated_data['StartDate'] + '-' + consolidated_data['dataHour']).str.replace('-', '').astype(int)

In [31]:
consolidated_data

Unnamed: 0,StartDate,dataHour,green_energy_HU,green_energy_IT,green_energy_PO,green_energy_SP,green_energy_UK,green_energy_DE,green_energy_DK,green_energy_SE,...,load_HU,load_IT,load_PO,load_SP,load_UK,load_DE,load_DK,load_SE,load_NE,index
0,2022-01-01,00,1376,5745,4491,10827,0,157373,3605,11107,...,16457,19756,13935,19530,1244,165125,3218,15331,40706,2022010100
1,2022-01-01,01,1526,6228,4436,11140,0,151747,3309,11036,...,15426,18685,13579,18383,1131,160415,3126,15270,39465,2022010101
2,2022-01-01,02,1560,6407,4568,11361,0,147938,3043,10509,...,14781,18124,13397,17680,1091,158035,3080,15150,38923,2022010102
3,2022-01-01,03,1287,5760,4559,11247,0,141816,2852,10770,...,14630,18400,13364,17396,969,157016,3044,15387,38211,2022010103
4,2022-01-01,04,1309,5501,4374,10868,0,131970,2724,10852,...,14688,19223,13449,17544,896,154552,3130,15737,38146,2022010104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2022-12-31,19,851,4198,6959,67056,824,175631,1765,10179,...,17488,24809,15211,104484,0,184106,3713,14440,45646,2022123119
8756,2022-12-31,20,888,3676,6910,62432,830,176499,1593,9586,...,17075,23169,14641,97124,0,175401,3579,14201,43425,2022123120
8757,2022-12-31,21,807,3641,7054,61096,676,175678,1788,9148,...,16717,21857,13977,87316,0,168830,3520,13713,41766,2022123121
8758,2022-12-31,22,754,3422,5946,55252,489,165104,2242,8806,...,16261,20555,13272,82064,0,161541,3440,13772,40415,2022123122


In [33]:
nan_count = consolidated_data.isnull().sum().to_frame('nan_count').reset_index()
nan_count

Unnamed: 0,index,nan_count
0,StartDate,0
1,dataHour,0
2,green_energy_HU,0
3,green_energy_IT,0
4,green_energy_PO,0
5,green_energy_SP,0
6,green_energy_UK,0
7,green_energy_DE,0
8,green_energy_DK,0
9,green_energy_SE,0
