In [1]:
import datetime
import io
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
class pe_Client:
    '''
    Object to make sure we get to the web "States" needed to parse the webpage without errors
    (login page -> logged in default page -> default page for a given date).
    Could not get the XMLHTTP Request from the webpage directly - That would be more straightforward.

    Parameters
    ----------
    root_url : str
        Base URL; the iframe ``src`` values found on the default page are appended to it.
    login_page_url : str
        URL of the login form (fetched with GET, then submitted with POST).
    login_action_url : str
        URL of the default page; it is fetched after login and date changes are posted to it.
    username, password : str, optional
        Credentials sent with the login form. The defaults are the placeholder values that
        used to be hard-coded, so existing three-argument calls behave as before.

    Creating an instance performs the login immediately (three HTTP requests).
    '''

    # Folder, relative to the working directory, that receives the raw CSV files.
    OUTPUT_DIR = 'outputs'

    # iframe ``src`` values kept from the default page. ``self.iframes`` preserves the order in
    # which they appear in the page: get_Resumen_report reads position 0, get_report position 1
    # and get_Demanda_report position 2.
    REPORT_IFRAMES = ("Demanda.aspx?page=true", "Generacion.aspx?page=true", "Graficas.aspx?page=true")

    # (position in the parsed table list, CSV file prefix) of the optional Resumen tables.
    # TODO (later parsing): only the TODAY column is needed, not 'This Year' / 'Last Week';
    # the Cuzco tables can use the same columns as the main table.
    RESUMEN_OPTIONAL_TABLES = (
        (4, 'Resumen_report2'),  # Temp & Humid Green
        (5, 'Resumen_report3'),  # Temp & Humid Blue
        (6, 'Resumen_report4'),  # Cuzco Green
        (7, 'Resumen_report5'),  # Cuzco Green
        (8, 'Resumen_report6'),  # Cuzco Blue
        (9, 'Resumen_report7'),  # Cuzco Blue
    )

    def __init__(self, root_url, login_page_url, login_action_url, username="username", password="password"):
        self.root_url = root_url
        self.login_page_url = login_page_url
        self.login_action_url = login_action_url
        self._username = username
        self._password = password
        self.session = requests.Session()
        self.logged_in_state()


    @staticmethod
    def _hidden_field(soup, name):
        """
        Return the ``value`` attribute of the ``<input name=...>`` element in ``soup``.

        Raises ValueError when the element is missing, instead of the opaque
        "'NoneType' object is not subscriptable" that indexing the failed lookup produces.
        """
        tag = soup.find('input', attrs={'name': name})
        if tag is None:
            raise ValueError(f"Hidden form field {name!r} was not found in the page.")
        return tag['value']


    def _refresh_form_state(self, soup):
        """Store the __VIEWSTATE and __EVENTVALIDATION values of ``soup`` on the instance."""
        self.viewstate = self._hidden_field(soup, '__VIEWSTATE')
        self.eventvalidation = self._hidden_field(soup, '__EVENTVALIDATION')


    def logged_in_state(self):
        """
        Take self.session to a logged in state preserving the ASP.NET needed payload: VIEWSTATE and EVENTVALIDATION.

        Sets ``response``, ``soup_1``/``soup_2``/``soup_3``, ``viewstate``, ``eventvalidation``,
        ``iframes`` and ``page_date`` on the instance.

        Raises
        ------
        ValueError
            If one of the hidden form fields is missing from a fetched page.
        IndexError
            If the default page has no ``<span>`` whose id contains 'date'.
        """
        #### First HTTP request (GET)
        # Fetch the login page
        self.response = self.session.get(self.login_page_url)
        self.soup_1 = BeautifulSoup(self.response.text, 'html.parser')
        self._refresh_form_state(self.soup_1)
        payload = {
            '__VIEWSTATE': self.viewstate,
            '__EVENTVALIDATION': self.eventvalidation,
            'txt_username': self._username,
            'txt_password': self._password,
            'Button1': 'Login'
        }
        #### Second HTTP request (POST)
        self.response = self.session.post(self.login_page_url, data=payload)
        self.soup_2 = BeautifulSoup(self.response.text, 'html.parser')
        self._refresh_form_state(self.soup_2)
        #### Third HTTP request (GET) Default Page
        # NOTE(review): the login payload is sent again as the body of this GET, exactly as before.
        # A dict holding only the refreshed hidden fields used to be built here but was never sent;
        # confirm which of the two (if any) the server actually needs.
        self.response = self.session.get(self.login_action_url, data=payload)
        self.soup_3 = BeautifulSoup(self.response.text, 'html.parser')
        self._refresh_form_state(self.soup_3)

        # .get() is used so that iframes without ``src`` / spans without ``id`` are skipped
        # instead of raising KeyError.
        self.iframes = [item['src'] for item in self.soup_3.find_all('iframe') if item.get('src') in self.REPORT_IFRAMES]
        self.page_date = [item.text for item in self.soup_3.find_all('span') if 'date' in item.get('id', '')][0]
        # NOTE(review): this only checks the HTTP status of the last request, not that the
        # credentials were accepted.
        if self.response.ok:
            print("Logged in succesfully.")


    def get_date_state(self, date):
        """
        Take self.session to a logged in state for a specific date.

        Parameters
        ----------
        date : str
            Date sent in the page's date box; it is also stored as ``self.date`` and, with the
            '/' characters removed, used in the CSV file names.
        """
        self.date = date
        data = {
            "ctl00_ContentPlaceHolder1_TabContainer1_ClientState": "{\"ActiveTabIndex\":0,\"TabState\":[true,true,true,true,true,true,true]}",
            '__VIEWSTATE': self.viewstate,
            '__EVENTVALIDATION': self.eventvalidation,
            "ctl00$txt_date": self.date,
            "ctl00$IMGB_GO.x": "39",
            "ctl00$IMGB_GO.y": "14",
        }

        self.response = self.session.post(self.login_action_url, data=data)
        self.soup = BeautifulSoup(self.response.text, 'html.parser')
        # The hidden fields captured at login are deliberately reused for every date: viewstate and
        # eventvalidation are NOT refreshed from this response. Revisit if the server starts
        # rejecting the repeated values.
        if self.response.ok:
            print(f"Page moved to {self.date} succesfully.")
        else:
            print("Error.")


    def get_all_raw_reports(self):
        """Download the Resumen, Generacion and Demanda tables for ``self.date`` (set by get_date_state)."""
        self.get_Resumen_report()
        print(f'{self.date} Resumen Report loaded')
        self.get_report()
        print(f'{self.date} Generacion Report loaded')
        self.get_Demanda_report()
        print(f'{self.date} Demanda Report loaded')


    def _fetch_iframe(self, position):
        """GET the report iframe at ``position`` in ``self.iframes``; return ``(response, soup)``."""
        response = self.session.get(self.root_url + self.iframes[position])
        return response, BeautifulSoup(response.text, 'html.parser')


    @staticmethod
    def _read_tables(soup):
        """Run pandas.read_html on the first ``<table>`` element of ``soup``; return the list of DataFrames."""
        # A file-like object is passed because literal HTML strings are deprecated in pandas >= 2.1.
        return pd.read_html(io.StringIO(soup.find('table').prettify()))


    def _write_table(self, df, file_prefix, collected):
        """Add the 'date' column to ``df``, write it to the output folder and append it to ``collected``."""
        df['date'] = self.date
        df.to_csv(f"./{self.OUTPUT_DIR}/{file_prefix}_{self.date.replace('/','')}.csv")
        collected.append(df)


    def _save_table(self, df_list, index, file_prefix, collected, required=False):
        """
        Save ``df_list[index]`` as ``<file_prefix>_<date>.csv``; return True when it was saved.

        With ``required=True`` any problem propagates to the caller. Otherwise the table is
        best-effort: a missing table or a failed write is reported and skipped. Only ``Exception``
        is caught, so Ctrl-C still interrupts a long scraping loop.
        """
        if required:
            self._write_table(df_list[index], file_prefix, collected)
            return True
        if index >= len(df_list):
            print(f"{file_prefix}: table {index} is not present on the page, skipped.")
            return False
        try:
            self._write_table(df_list[index], file_prefix, collected)
        except Exception as exc:
            print(f"{file_prefix}: table {index} could not be saved: {exc!r}")
            return False
        return True


    def get_Resumen_report(self):
        '''
        Downloading Resumen Report Tables in Raw format in ./outputs folder.

        The saved frames are kept in ``self.Resumen_list``; the list is left empty when the page
        reports 'DataIsNotValidated'. Table 3 (main Resumen table) is required, tables 4-9 are
        saved when present.
        '''
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        # Reset first so a date without data does not keep the previous date's tables.
        self.Resumen_list = []
        # TODO (later parsing): only the TODAY column is needed, not 'This Year' / 'Last Week'.
        # TODO (later parsing): delete row "Station Temperature".
        # TODO (later parsing): delete row with SOUTHERN thingies.
        # TODO (later parsing): melt it all into a single numeric column.
        self.Resumen_response, self.Resumen_soup = self._fetch_iframe(0)
        head = self.Resumen_soup.find('head')
        if head is not None and head.text.strip() == 'DataIsNotValidated':
            print("Data is not available yet.")
            return
        df_list = self._read_tables(self.Resumen_soup)
        ### Main Resumen table
        self._save_table(df_list, 3, 'Resumen_report1', self.Resumen_list, required=True)
        ### Temp & Humid / Cuzco tables
        for index, file_prefix in self.RESUMEN_OPTIONAL_TABLES:
            self._save_table(df_list, index, file_prefix, self.Resumen_list)
        print('Tables saved.')


    def get_report(self):
        '''
        Downloading Generacion Report Tables in Raw format in ./outputs folder.

        The saved frames are kept in ``self.Generacion_list``.
        '''
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        self.Generacion_response, self.Generacion_soup = self._fetch_iframe(1)
        df_list = self._read_tables(self.Generacion_soup)
        self.Generacion_list = []
        ## Generacion at peak time by Plant
        self._save_table(df_list, 4, 'report1', self.Generacion_list)
        ## Hourly Power production by Power Plant
        self._save_table(df_list, 5, 'report2', self.Generacion_list)
        print('Tables saved.')


    def get_Demanda_report(self):
        '''
        Downloading Load Report Tables in Raw format in ./outputs folder.

        The saved frames are kept in ``self.load_list``.
        '''
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        self.load_response, self.load_soup = self._fetch_iframe(2)
        df_list = self._read_tables(self.load_soup)
        self.load_list = []
        ## Hourly Power Load by Components
        self._save_table(df_list, 4, 'report21', self.load_list)
        print('Tables saved.')


In [4]:
# Dates
# The range stops two days before today.
end = datetime.date.today() - datetime.timedelta(days=2)

# 'D' is the documented calendar-day alias (the lowercase spelling is deprecated in newer pandas).
# Dates are formatted as dd/mm/YYYY strings because they are passed to pe_Client.get_date_state.
dates_to_parse = pd.date_range(start='2024-06-07', end=end, freq='D')
dates_to_parse = dates_to_parse.strftime("%d/%m/%Y").to_list()
# Same dates, newest first.
reversed_dates = dates_to_parse[::-1]

# URLs
# The login and default pages are built from the root so the host is written only once.
root_url = 'http://coes.sin/reportes/'
login_page_url = root_url + 'Acceso.aspx'
login_action_url = root_url + 'Inicio.aspx'


### Parsing from newest to oldest:

In [5]:
# Log in (the constructor performs the login requests) and move the page to the newest date.
pe_client = pe_Client(
    root_url=root_url,
    login_page_url=login_page_url,
    login_action_url=login_action_url,
)
pe_client.get_date_state(date=reversed_dates[0])

Logged in succesfully.
Page moved to 03/07/2024 succesfully.


In [6]:
# Download the Resumen, Generacion and Demanda tables for the date the client is currently on.
pe_client.get_all_raw_reports()

Tables saved.
03/07/2024 Resumen Report loaded
Tables saved.
03/07/2024 Generacion Report loaded
Tables saved.
03/07/2024 Demanda Report loaded


In [7]:
# Walk through the remaining dates, newest first (reversed_dates[0] was handled in the cells
# above): move the page to each date, then download its tables.
# NOTE(review): there is no error handling here, so an exception on any date stops the loop.
for scrape_date in reversed_dates[1:]:
    pe_client.get_date_state(scrape_date)
    pe_client.get_all_raw_reports()

Page moved to 02/07/2024 succesfully.
Tables saved.
02/07/2024 Resumen Report loaded
Tables saved.
02/07/2024 Generacion Report loaded
Tables saved.
02/07/2024 Demanda Report loaded
Page moved to 01/07/2024 succesfully.
Tables saved.
01/07/2024 Resumen Report loaded
Tables saved.
01/07/2024 Generacion Report loaded
Tables saved.
01/07/2024 Demanda Report loaded
Page moved to 30/06/2024 succesfully.
Tables saved.
30/06/2024 Resumen Report loaded
Tables saved.
30/06/2024 Generacion Report loaded
Tables saved.
30/06/2024 Demanda Report loaded
Page moved to 29/06/2024 succesfully.
Tables saved.
29/06/2024 Resumen Report loaded
Tables saved.
29/06/2024 Generacion Report loaded
Tables saved.
29/06/2024 Demanda Report loaded
Page moved to 28/06/2024 succesfully.
Tables saved.
28/06/2024 Resumen Report loaded
Tables saved.
28/06/2024 Generacion Report loaded
Tables saved.
28/06/2024 Demanda Report loaded
Page moved to 27/06/2024 succesfully.
Tables saved.
27/06/2024 Resumen Report loaded
Table

### Parsing from oldest to newest:

In [21]:
# Start a fresh session (the constructor logs in again) and move the page to the oldest date.
# NOTE(review): the recorded output of this cell starts at 24/04/2024, while the configuration
# cell builds dates_to_parse from start='2024-06-07' - the start date was evidently different
# when this cell was run; re-run from a fresh kernel to make code and output agree.
pe_client = pe_Client(
    root_url=root_url,
    login_page_url=login_page_url,
    login_action_url=login_action_url,
)
pe_client.get_date_state(date=dates_to_parse[0])

Logged in succesfully.
Page moved to 24/04/2024 succesfully.


In [22]:
# Download the Resumen, Generacion and Demanda tables for the date the client is currently on.
pe_client.get_all_raw_reports()

Tables saved.
24/04/2024 Resumen Report loaded
Tables saved.
24/04/2024 Generacion Report loaded
Tables saved.
24/04/2024 Demanda Report loaded


In [23]:
# Walk through the remaining dates, oldest first (dates_to_parse[0] was handled in the cells
# above): move the page to each date, then download its tables.
# NOTE(review): there is no error handling here, so an exception on any date stops the loop.
for scrape_date in dates_to_parse[1:]:
    pe_client.get_date_state(scrape_date)
    pe_client.get_all_raw_reports()

Page moved to 25/04/2024 succesfully.
Tables saved.
25/04/2024 Resumen Report loaded
Tables saved.
25/04/2024 Generacion Report loaded
Tables saved.
25/04/2024 Demanda Report loaded
Page moved to 26/04/2024 succesfully.
Tables saved.
26/04/2024 Resumen Report loaded
Tables saved.
26/04/2024 Generacion Report loaded
Tables saved.
26/04/2024 Demanda Report loaded
Page moved to 27/04/2024 succesfully.
Tables saved.
27/04/2024 Resumen Report loaded
Tables saved.
27/04/2024 Generacion Report loaded
Tables saved.
27/04/2024 Demanda Report loaded
Page moved to 28/04/2024 succesfully.
Tables saved.
28/04/2024 Resumen Report loaded
Tables saved.
28/04/2024 Generacion Report loaded
Tables saved.
28/04/2024 Demanda Report loaded
Page moved to 29/04/2024 succesfully.
Tables saved.
29/04/2024 Resumen Report loaded
Tables saved.
29/04/2024 Generacion Report loaded
Tables saved.
29/04/2024 Demanda Report loaded
Page moved to 30/04/2024 succesfully.
Tables saved.
30/04/2024 Resumen Report loaded
Table