# Collect Data

## Environment Setup
---

In [1]:
import requests
from bs4 import BeautifulSoup

## Requests and Beautiful Soup
---

In [166]:
BASE_URL = 'https://gero.icnea.net/Servidor.aspx'
PROPERTIES_LIST_URL = 'https://gero.icnea.net/HosEmpEstabliments.aspx'
from data.login import USER, PASSWORD # Confidential Data Stored Away from Git

In [98]:
# Create a Session object to store all data needed for scraping
session = requests.Session()
print(session.headers)
print(session.cookies)

{'User-Agent': 'python-requests/2.28.1', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': '*/*', 'Connection': 'keep-alive'}
<RequestsCookieJar[]>


In [99]:
# Header Values from My Browser, Microsoft Edge. It works fine with the site.
ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
ACCEPT_ENCODING = 'gzip, deflate, br'
ACCEPT_LANGUAGE = 'en-US,en;q=0.9,es;q=0.8,ca;q=0.7'
CACHE_CONTROL = 'no-cache'
DNT = '1'
PRAGMA = 'no-cache'
SEC_CH_UA = '" Not;A Brand";v="99", "Microsoft Edge";v="103", "Chromium";v="103"'
SEC_CH_UA_MOBILE = '?0'
SEC_CH_UA_PLATFORM = '"Windows"'
SEC_FETCH_DEST = 'document'
SEC_FETCH_MODE = 'navigate'
SEC_FETCH_SITE = 'none'
SEC_FETCH_USER = '?1'
UPGRADE_INSECURE_REQUESTS = '1'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'


In [100]:
# Create a headers dictionary to be used with our session
headers = {
    'accept': ACCEPT,
    'accept-encoding': ACCEPT_ENCODING,
    'accept-language': ACCEPT_LANGUAGE,
    'cache-control': CACHE_CONTROL,
    'dnt': DNT,
    'pragma': PRAGMA,
    'sec-ch-ua': SEC_CH_UA,
    'sec-ch-ua-platform': SEC_CH_UA_PLATFORM,
    'sec-fetch-dest': SEC_FETCH_DEST,
    'sec-fetch-site': SEC_FETCH_SITE,
    'sec-fecth_user': SEC_FETCH_USER,
    'upgrade-insecure-requests': UPGRADE_INSECURE_REQUESTS,
    'user-agent': USER_AGENT
}
print(headers)

{'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'en-US,en;q=0.9,es;q=0.8,ca;q=0.7', 'cache-control': 'no-cache', 'dnt': '1', 'pragma': 'no-cache', 'sec-ch-ua': '" Not;A Brand";v="99", "Microsoft Edge";v="103", "Chromium";v="103"', 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'document', 'sec-fetch-site': 'none', 'sec-fecth_user': '?1', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'}


In [101]:
# Assign custom headers to the session
session.headers = headers
print(session.headers)

{'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'en-US,en;q=0.9,es;q=0.8,ca;q=0.7', 'cache-control': 'no-cache', 'dnt': '1', 'pragma': 'no-cache', 'sec-ch-ua': '" Not;A Brand";v="99", "Microsoft Edge";v="103", "Chromium";v="103"', 'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'document', 'sec-fetch-site': 'none', 'sec-fecth_user': '?1', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'}


In [108]:
# Get the BASE_URL to see what is in the HTML. In the response you can see two variables "_TSM_HiddenField_"  that are later passed as requests payloads.
response = session.get(BASE_URL)
print(response.status_code)
print(response.text)

200

<!DOCTYPE html>
<html>
<head>
    <title id="Title">© icnea</title>
    <meta charset="utf-8">
    <meta name="robots" content="noindex,nofollow">
    <meta name="author" content="icnea">
    <meta name="google" content="notranslate">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Open+Sans:300,400,600,700|Overpass">
    <link rel="stylesheet" href="assets/fonts/font-awesome/css/all.min.css">
    <link rel="stylesheet" href="assets/fonts/zocial/zocial.css">
    <link rel="stylesheet" href="assets/fonts/glyphicons-pro/glyphicons-pro.css">
    <link rel="stylesheet" href="assets/fonts/icomoon/icomoon.css">
    <link rel="stylesheet" href="assets/fonts/octicons/octicons.css">
    <link rel="stylesheet" href="assets/css/theme.css">
    <link rel="stylesheet" href="assets/css/admin-forms.css">
    <link rel="stylesheet" href="assets/css/estils.css">
    <link rel="shortcut icon" href="a

Inside the get response HTML there's a piece of code with some variables. It seems its a IIS technique to track sessions and if the values are not passed as payload in the request
the server simply ignores your request.

First parse the response HTML *scrap* those values and use them in later requests.

``` html
   <div class="aspNetHidden">
    <input id="_TSM_HiddenField_" name="_TSM_HiddenField_" type="hidden" value="encoded text"/>
    <input id="__EVENTTARGET" name="__EVENTTARGET" type="hidden" value=""/>
    <input id="__EVENTARGUMENT" name="__EVENTARGUMENT" type="hidden" value=""/>
    <input id="__VIEWSTATE" name="__VIEWSTATE" type="hidden" value="humongous text"/>
   </div>
```

In [144]:
# Parse the response of a request to get these values.
soup = BeautifulSoup(response.text, 'html.parser')
payload_match = soup.find_all('input', attrs={'type':'hidden'})
payload = {}
for input_tag in payload_match:
    if input_tag.has_attr('name') and input_tag.has_attr('value'):
        dict_key = input_tag.attrs['name']
        dict_value = input_tag.attrs['value']
        payload[dict_key] = dict_value
    else:
        pass
print(payload)

{'_TSM_HiddenField_': 'ZaMfZ6yYhPPHZ1NeEf8j6-t902-6pHpn2MehV0eep-I1', '__EVENTTARGET': '', '__EVENTARGUMENT': '', '__VIEWSTATE': 'uUiMtb3NcR04QfueQe0MAMvmnxWy1dEvPfTpEKTkzkNExZDHhi4EyWUIn6aIY/IABvqCJPW2gLNrfnws4SSgImGc+RG+//LlmiIKvZIu9i06C3enmHRzF2FAMg+23Gin/GrScbwq5BRGizKCxwf+A1Ep/H661VJgOn6Xu+7HQOYoqvO/RK6wDw3STtwe7c5ERv62F4DRH84SELt3H8Sn8IeJGbWvYhOzoWOdXcTVFYzvxWZG1yvtDC9Zy+L6A/JNt7ArzHbCer7RGii3o9ft7+iAutfJH5mh9/dl0lL9IpZRGTNPQL3De7FWVudu/aH4lzJiE1HZOxl/BCn70s575iwWhSJTZ4FSLXJDfzooOUtBAdV1JClIZSlH6lbpHHEjdl66CqHVAnaKZZnQb9pDUY82OlRGOyNnTb4+pF2IGNaIXLUJ6ozmikS3tkQtyQtL7G6qf5ZTLDGdC0SVcNaP65tR5vfshtNyu+Gciot1SGH7cGBy87u8qiTEo0FlZntedC0vyFBP0pdptrQ7dZPcxWXGwVMbCctJAhAwhnUK0m0/lbAmY2dOyaS1PtHZg6f2yPSkzPMIDuo8cNKULqX07aHYzi9fSahCuG+2g0SLsDQjrI/e9M16/RcqIRF/tWJW0ILKszPPTjaxKq+4d6MR2rthlvZjg+n7bi1pdtGa8x7F2yhCeKgy0PYgrPL5XxWSLodqZ4JO/dSyqorgkmDnbPEnUj8ShC13jt6XPxMTegsyHksMztIAuHI/bSxxozdPjUcQ/mDERaJ5UbOKPbJVClXuC3nFKLAS2i32d4KHtltJeJnL4/mj7dcRraXygXy/XTrRdgoaBGxg/LQNwK7f767PLiIfvg5pIPhLxq

In [145]:
# And better turn it into a function for later use
def get_viewstate(response_text):
    '''
        Given an html text from a response parses the html in search of aspNetHidden
        values and return them as a dictionary so they can be used in the next request as payload
    '''
    soup = BeautifulSoup(response.text, 'html.parser')
    payload_match = soup.find_all('input', attrs={'type':'hidden'})
    payload = {}
    for input_tag in payload_match:
        if input_tag.has_attr('name') and input_tag.has_attr('value'):
            dict_key = input_tag.attrs['name']
            dict_value = input_tag.attrs['value']
            payload[dict_key] = dict_value
        else:
            pass
    return payload

In [148]:
# Check that it works as expected
payload = get_viewstate(response.text)
payload['Email'] = USER
payload['Contrasenya'] = PASSWORD
payload['Login'] = 'Accept'
print(payload)

{'_TSM_HiddenField_': 'ZaMfZ6yYhPPHZ1NeEf8j6-t902-6pHpn2MehV0eep-I1', '__EVENTTARGET': '', '__EVENTARGUMENT': '', '__VIEWSTATE': 'uUiMtb3NcR04QfueQe0MAMvmnxWy1dEvPfTpEKTkzkNExZDHhi4EyWUIn6aIY/IABvqCJPW2gLNrfnws4SSgImGc+RG+//LlmiIKvZIu9i06C3enmHRzF2FAMg+23Gin/GrScbwq5BRGizKCxwf+A1Ep/H661VJgOn6Xu+7HQOYoqvO/RK6wDw3STtwe7c5ERv62F4DRH84SELt3H8Sn8IeJGbWvYhOzoWOdXcTVFYzvxWZG1yvtDC9Zy+L6A/JNt7ArzHbCer7RGii3o9ft7+iAutfJH5mh9/dl0lL9IpZRGTNPQL3De7FWVudu/aH4lzJiE1HZOxl/BCn70s575iwWhSJTZ4FSLXJDfzooOUtBAdV1JClIZSlH6lbpHHEjdl66CqHVAnaKZZnQb9pDUY82OlRGOyNnTb4+pF2IGNaIXLUJ6ozmikS3tkQtyQtL7G6qf5ZTLDGdC0SVcNaP65tR5vfshtNyu+Gciot1SGH7cGBy87u8qiTEo0FlZntedC0vyFBP0pdptrQ7dZPcxWXGwVMbCctJAhAwhnUK0m0/lbAmY2dOyaS1PtHZg6f2yPSkzPMIDuo8cNKULqX07aHYzi9fSahCuG+2g0SLsDQjrI/e9M16/RcqIRF/tWJW0ILKszPPTjaxKq+4d6MR2rthlvZjg+n7bi1pdtGa8x7F2yhCeKgy0PYgrPL5XxWSLodqZ4JO/dSyqorgkmDnbPEnUj8ShC13jt6XPxMTegsyHksMztIAuHI/bSxxozdPjUcQ/mDERaJ5UbOKPbJVClXuC3nFKLAS2i32d4KHtltJeJnL4/mj7dcRraXygXy/XTrRdgoaBGxg/LQNwK7f767PLiIfvg5pIPhLxq

Lets recap:
* We have a session
* We have custom headers
* We have captured basic cookies
* We have parsed the aspNetHidden thingy

Next try to properly authenticate an see what we get

In [157]:
with requests.session() as session:
    session.headers.update(headers)
    get_response = session.get(BASE_URL)
    payload = get_viewstate(get_response.text)
    post_response = session.post(BASE_URL,data=payload)
with open('test.html','w') as file:
    file.write(post_response.text)
# Still getting the login screen back.

## Selenium
---