# TFM - OMIE DATAFRAME SELECTION

## 1. INTRODUCTION

The aim of this Notebook is to create dataframes for a certain unit using the monthly OMIE data stored locally (explained in TFM_merged_files.ipynb). 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. DATAFRAME SELECTION FUNCTION

In this setion, a function to select the information from the local .csv files is presented.

In [2]:
#Monthly files stored locally
cab_det_path = '/home/dsc/Documents/TFM/Data/OMIE/CAB_DET/'

In [5]:
#One year of data: from November 2019 to October 2020.
cab_det_list = !ls -1 $cab_det_path
cab_det_list

['OMIE_012020.csv',
 'OMIE_022020.csv',
 'OMIE_032020.csv',
 'OMIE_042020.csv',
 'OMIE_052020.csv',
 'OMIE_062020.csv',
 'OMIE_072020.csv',
 'OMIE_082020.csv',
 'OMIE_092020.csv',
 'OMIE_102020.csv',
 'OMIE_112019.csv',
 'OMIE_122019.csv']

First of all, September 2020 is selected as example to see how the files are read.

In [6]:
#A new column "Unnamed: 0" is created
df_OMIE_092020 = pd.read_csv(cab_det_path + 'OMIE_092020.csv')

In [7]:

df_OMIE_092020.head()

Unnamed: 0.1,Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day,Period,Block,Price,Energy
0,0,1696149,6,EDPC2,EDP COMERCIAL COMPRA (PORT),CNO,6000.0,2020,9,1,22,1,0.01,0.1
1,1,1717319,3,EONUC01,EONUR CONSUMO CLIENTES TUR,CNO,400.0,2020,9,1,1,1,0.0,1.0
2,2,1717319,3,EONUC01,EONUR CONSUMO CLIENTES TUR,CNO,400.0,2020,9,1,2,1,0.0,1.0
3,3,1717319,3,EONUC01,EONUR CONSUMO CLIENTES TUR,CNO,400.0,2020,9,1,3,1,0.0,1.0
4,4,1717319,3,EONUC01,EONUR CONSUMO CLIENTES TUR,CNO,400.0,2020,9,1,4,1,0.0,1.0


In [8]:
df_OMIE_092020.shape

(1478829, 14)

In [10]:
#Information from Sell/Buy distribution
df_OMIE_092020['Sell_Buy'].value_counts()

VNO    1072350
CNO     405759
VNP        720
Name: Sell_Buy, dtype: int64

In [9]:
#Example with PALOS1 for the whole set of data
unit = 'PALOS1'

df_OMIE_unit = pd.DataFrame()

for archive in cab_det_list: #Reading all the files in the local OMIE "datalake"
    df_OMIE_month = pd.read_csv(cab_det_path + archive) #Reding file
    df_OMIE_unit_month = df_OMIE_month.loc[df_OMIE_month['Bid_Unit'].str.startswith(unit)] #Selectin the unit
    df_OMIE_unit = pd.concat([df_OMIE_unit,df_OMIE_unit_month]) #Concatenate the monthly info

In [10]:
df_OMIE_unit.head()

Unnamed: 0.1,Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day,Period,Block,Price,Energy
26739,26739,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,1,12,180.3,394.1
26740,26740,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,2,12,180.3,394.1
26741,26741,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,3,1,1.13,50.0
26742,26742,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,3,12,180.3,344.1
26743,26743,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,4,1,1.13,60.0


In [11]:
type(df_OMIE_unit)

pandas.core.frame.DataFrame

In [12]:
df_OMIE_unit.shape

(71112, 14)

In [14]:
#Data from each year
df_OMIE_unit['Year'].value_counts()

2020    61045
2019    10067
Name: Year, dtype: int64

In [17]:
#Data from each month
df_OMIE_unit['Month'].value_counts()

12    6541
1     6519
10    6181
8     6169
7     6169
5     6169
3     6157
9     5970
6     5970
4     5970
2     5771
11    3526
Name: Month, dtype: int64

In [18]:
#Data from each day
df_OMIE_unit['Day'].value_counts()

25    2433
24    2426
22    2425
21    2425
1     2424
18    2424
4     2424
28    2424
27    2424
26    2424
3     2424
2     2424
23    2417
20    2415
29    2412
19    2370
5     2221
30    2214
9     2213
6     2213
7     2213
8     2213
16    2213
10    2213
11    2213
12    2213
13    2213
14    2213
15    2213
17    2213
31    1406
Name: Day, dtype: int64

Now, a function "df_sel_unit" is created. This function selects one unit by the begining of the "Bid_Unit" code from the OMIE .csv merged files locally stored by month. The function output is a dataframe with the selected info for the whole available period of time.

In [19]:
def df_sel_unit(unit):
    
    cab_det_path = '/home/dsc/Documents/TFM/Data/OMIE/CAB_DET/'
    cab_det_list = !ls -1 $cab_det_path
    
    df_OMIE_unit = pd.DataFrame()

    for archive in cab_det_list:
        df_OMIE_month = pd.read_csv(cab_det_path + archive)
        df_OMIE_unit_month = df_OMIE_month.loc[df_OMIE_month['Bid_Unit'].str.startswith(unit)]
        df_OMIE_unit = pd.concat([df_OMIE_unit,df_OMIE_unit_month])

    df_OMIE_unit = df_OMIE_unit.drop('Unnamed: 0',axis=1).reset_index(drop=True) #Dropping'Unnamed: 0' 
                                                                                 #and reseting the index column
    df_OMIE_unit['Bid_Unit'] = df_OMIE_unit['Bid_Unit'].str.strip() #Eliminating spaces in 'Bid_Unit'
    df_OMIE_unit['Unit_Description'] = df_OMIE_unit['Unit_Description'].str.strip() #Eliminating spaces 
                                                                                    #in 'Bid_Description'  
    
    return df_OMIE_unit

In [20]:
#Using the funtion with "PALOS1"
df_PALOS1 = df_sel_unit('PALOS1')

In [21]:
df_PALOS1.head()

Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day,Period,Block,Price,Energy
0,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,1,12,180.3,394.1
1,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,2,12,180.3,394.1
2,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,3,1,1.13,50.0
3,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,3,12,180.3,344.1
4,6128191,2,PALOS1,C.C. PALOS 1,VNO,394.1,2020,1,1,4,1,1.13,60.0


In [22]:
df_PALOS1.shape

(71112, 13)

In [23]:
df_PALOS1['Bid_Unit'].str.len()

0        6
1        6
2        6
3        6
4        6
        ..
71107    6
71108    6
71109    6
71110    6
71111    6
Name: Bid_Unit, Length: 71112, dtype: int64

In [24]:
df_PALOS1['Bid_Unit'].str.strip().str.len()

0        6
1        6
2        6
3        6
4        6
        ..
71107    6
71108    6
71109    6
71110    6
71111    6
Name: Bid_Unit, Length: 71112, dtype: int64

In [25]:
df_PALOS1['Unit_Description'].str.len()

0        12
1        12
2        12
3        12
4        12
         ..
71107    12
71108    12
71109    12
71110    12
71111    12
Name: Unit_Description, Length: 71112, dtype: int64

In [121]:
df_PALOS1['Unit_Description'].str.strip().str.len()

0        12
1        12
2        12
3        12
4        12
         ..
71107    12
71108    12
71109    12
71110    12
71111    12
Name: Unit_Description, Length: 71112, dtype: int64

In [26]:
df_PALOS1['Sell_Buy'].str.len()

0        3
1        3
2        3
3        3
4        3
        ..
71107    3
71108    3
71109    3
71110    3
71111    3
Name: Sell_Buy, Length: 71112, dtype: int64

In [27]:
#Checking that "Sell_Buy" is OK
df_PALOS1['Sell_Buy'].str.strip().str.len()

0        3
1        3
2        3
3        3
4        3
        ..
71107    3
71108    3
71109    3
71110    3
71111    3
Name: Sell_Buy, Length: 71112, dtype: int64

In [28]:
#Storing locally information from PALOS1 
df_PALOS1.to_csv('/home/dsc/Documents/TFM/Data/OMIE/OMIE_PALOS1/OMIE_PALOS1_112019_102020.csv')

In [30]:
#Checking types 
df_PALOS1.dtypes

Bid_Code              int64
Num_Version           int64
Bid_Unit             object
Unit_Description     object
Sell_Buy             object
Pot_max             float64
Year                  int64
Month                 int64
Day                   int64
Period                int64
Block                 int64
Price               float64
Energy              float64
dtype: object

In [31]:
df_PALOS1.describe()

Unnamed: 0,Bid_Code,Num_Version,Pot_max,Year,Month,Day,Period,Block,Price,Energy
count,71112.0,71112.0,71112.0,71112.0,71112.0,71112.0,71112.0,71112.0,71112.0,71112.0
mean,6303431.0,1.775157,394.1,2019.858435,6.362217,15.931404,15.879416,6.519223,52.564769,46.995838
std,143155.0,0.865138,4.406533e-10,0.348606,3.433946,8.900143,5.272937,3.635035,48.738841,75.62004
min,6049903.0,1.0,394.1,2019.0,1.0,1.0,1.0,1.0,1.13,3.0
25%,6176254.0,1.0,394.1,2020.0,3.0,8.0,12.0,3.0,35.4,17.0
50%,6300456.0,2.0,394.1,2020.0,6.0,16.0,16.0,6.0,40.72,17.0
75%,6427697.0,2.0,394.1,2020.0,9.0,24.0,20.0,10.0,45.42,17.0
max,6555573.0,5.0,394.1,2020.0,12.0,31.0,25.0,12.0,180.3,394.1


In [32]:
df_PALOS1['Period'].value_counts()

13    4236
19    4236
15    4236
16    4236
17    4236
18    4236
14    4236
20    4236
21    4236
22    4236
23    4236
12    4231
11    4226
10    4226
24    4204
9     2572
8     1104
7      707
6      702
5      700
4      700
3      424
2      354
1      354
25      12
Name: Period, dtype: int64

In [34]:
#Period 25 ocurrs when day time changes
df_PALOS1.loc[(df_PALOS1['Period']==25)]

Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day,Period,Block,Price,Energy
59839,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,1,1.13,190.0
59840,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,2,38.27,17.0
59841,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,3,39.1,17.0
59842,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,4,39.93,17.0
59843,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,5,40.76,17.0
59844,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,6,42.39,17.0
59845,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,7,43.22,17.0
59846,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,8,44.05,17.0
59847,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,9,45.38,17.0
59848,6546551,1,PALOS1,C.C. PALOS 1,VNO,394.1,2020,10,25,25,10,46.71,17.0


In [35]:
#Using the funtion with "PALOS", to retreive information from PALOS1, 2, and 3.
df_PALOS = df_sel_unit('PALOS')

In [36]:
df_PALOS.to_csv('/home/dsc/Documents/TFM/Data/OMIE/OMIE_PALOS/OMIE_PALOS_112019_102020.csv')

In [37]:
df_PALOS['Bid_Unit'].value_counts()

PALOS1    71112
PALOS3    70179
PALOS2    68034
Name: Bid_Unit, dtype: int64

In [40]:
#Using the funtion with "AGUAYO".
df_AGUAYO = df_sel_unit('AGU')

In [41]:
df_AGUAYO.shape

(82252, 13)

In [42]:
df_AGUAYO.head()

Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day,Period,Block,Price,Energy
0,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,1,30.0,80.0
1,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,2,48.0,80.0
2,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,3,52.0,80.0
3,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,4,56.0,80.0
4,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,5,180.3,40.6


In [43]:
#Using a cab file to see the relationship between Unit_Code and Unit_Descripton for AGUAYO
df_cab_example = pd.read_csv('/home/dsc/Documents/TFM/Data/OMIE/cab_202009/CAB_09.csv',)

In [44]:
df_cab_example.head()

Unnamed: 0.1,Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day
0,0,1696149,6,EDPC2,EDP COMERCIAL COMPRA (PORT),CNO,6000.0,2020,9,1
1,1,1717319,3,EONUC01,EONUR CONSUMO CLIENTES TUR,CNO,400.0,2020,9,1
2,2,1811311,7,IPG,C.H. IP GENERACION,VNO,84.0,2020,9,1
3,3,426609,12,IPB,C.H.B.IP BOMBEO,CNO,99.0,2020,9,1
4,4,2532852,28,NRENVD1,NRENO-VENTA,VNO,1.7,2020,9,1


In [45]:
df_cab_example.loc[df_cab_example['Unit_Description'].str.contains('AGUAY0')]['Bid_Unit'].value_counts()

Series([], Name: Bid_Unit, dtype: int64)

In [46]:
#Ther are more "Bid_Units" that contains "AGU"
df_cab_example.loc[df_cab_example['Bid_Unit'].str.contains('AGU')]['Bid_Unit'].value_counts()

AGUB       30
SAGU2      30
AGUG       30
SAGU1      30
Name: Bid_Unit, dtype: int64

In [47]:
#It is seen that AGUAYO has two diferent bid units: PUMPING and GENERATION
df_cab_example.loc[df_cab_example['Bid_Unit'].str.startswith('AGU')]['Bid_Unit'].value_counts()

AGUB       30
AGUG       30
Name: Bid_Unit, dtype: int64

In [48]:
df_AGUAYO.head(5)

Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day,Period,Block,Price,Energy
0,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,1,30.0,80.0
1,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,2,48.0,80.0
2,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,3,52.0,80.0
3,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,4,56.0,80.0
4,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,5,180.3,40.6


In [49]:
#Searching a bit the meaning of block... The sum of the all blocks cannot be higher than the Maximum Power
df_AGUAYO.head(5)['Energy'].cumsum()

0     80.0
1    160.0
2    240.0
3    320.0
4    360.6
Name: Energy, dtype: float64

In [50]:
df_AGUAYO.groupby(['Bid_Unit','Year','Month','Day','Period'])['Energy'].sum()

Bid_Unit  Year  Month  Day  Period
AGUB      2019  11     1    1         360.0
                            2         360.0
                            3         360.0
                            4         360.0
                            5         360.0
                                      ...  
AGUG      2020  10     31   20        360.6
                            21        360.6
                            22        360.6
                            23        360.6
                            24        360.6
Name: Energy, Length: 17568, dtype: float64

In [51]:
df_cab_example.loc[df_cab_example['Unit_Description']
                   .str.contains('AGUAYO')][['Unit_Description','Bid_Unit']].value_counts()

Unit_Description                Bid_Unit
C.H.B. AGUAYO BOMBEO            AGUB        30
C.H. AGUAYO GENERACION          AGUG        30
dtype: int64

In [54]:
#Selecting information from AGUAYO_GEN
df_AGUAYO_GEN = df_AGUAYO[df_AGUAYO['Bid_Unit']=='AGUG']

In [55]:
df_AGUAYO_GEN.shape

(42831, 13)

In [56]:
df_AGUAYO_GEN.head()

Unnamed: 0,Bid_Code,Num_Version,Bid_Unit,Unit_Description,Sell_Buy,Pot_max,Year,Month,Day,Period,Block,Price,Energy
0,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,1,30.0,80.0
1,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,2,48.0,80.0
2,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,3,52.0,80.0
3,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,4,56.0,80.0
4,6129555,1,AGUG,C.H. AGUAYO GENERACION,VNO,360.6,2020,1,1,1,5,180.3,40.6


In [57]:
#Storing info locally
df_AGUAYO_GEN.to_csv('/home/dsc/Documents/TFM/Data/OMIE/OMIE_AGUAYO_GEN/OMIE_AGUAYO_GEN_112019_102020.csv')