In [1]:
import requests # to get the html markdown from the url
from bs4 import BeautifulSoup # to make the html readable
import pandas as pd # to save table in a csv
from datetime import datetime # to append current time into file name


# List of all IG urls

In [32]:
url = 'https://sinca.mma.gob.cl/'
html_data = requests.get(url)
soup = BeautifulSoup(html_data.text, 'html.parser')

# list of all regions link. Each region page has their stations link. From station link we get the data.
regiones = ['https://sinca.mma.gob.cl/'+x['href'] for x in soup.findAll('a')][15:-4]


In [3]:
# running time: 32s
# get list of urls. all station links

url_stations = []
for page in regiones:
    html_data = requests.get(page)
    soup = BeautifulSoup(html_data.text, 'html.parser')
    url_stations = url_stations + [x.findAll('a')[0]['href'] for x in soup.find_all('tbody')[0].find_all('tr')]
len(url_stations)

213

In [4]:
# running time 3m 30s

rows=[]

for region in url_stations:
    
    url = 'https://sinca.mma.gob.cl'+region
    html_data = requests.get(url)
    soup = BeautifulSoup(html_data.text, 'html.parser')
    
    estacion = soup.select('h1')[1].text.split(' ')[1:]
    estacion = ' '.join(estacion)

    GI_cols =   [ x.get_text().strip() for x in soup.find_all('th',{'class':'right'})]
    GI_values = [ x.get_text().strip() for x in soup.find_all('td',{'class':'left'}) ]

    row={}
    for idx in range(len(GI_values)):
        row.update({GI_cols[idx] : GI_values[idx]})
    row.update({'Estacion_id':int(region.split('/')[-1])})
    row.update({'Estacion':estacion})
    row.update({'URL':url})
    
    rows.append(row)

In [5]:
data = pd.DataFrame(rows)

a = data.columns.to_list()
a.remove('Estacion')
a.insert(0,'Estacion')
data = data[a]

data['Región'] = data['Región'].apply(lambda x: x.replace('de ',''))

date = datetime.now().strftime("%d-%m-%Y %H%M%S") # get local time as string

filePath = "../SINCA MMA Gob - Historical Data/Información general " + date + " - Webscrape.csv"

data.sort_values('Estacion', inplace=True)

data.to_csv(filePath, index=True) # save to file path

data

Unnamed: 0,Estacion,Propietario,Operador,Región,Provincia,Comuna,Coordenadas UTM,Huso horario,Recepción de datos,Inicio de operación reportada,Estacion_id,URL,Fin de operación reportada
49,21 de Mayo,GUACOLDA ENERGIA S.A.,CESMEC S.A,Atacama,Huasco,Huasco,281938 E 6848939 N,19,en línea,2008-05-21,201,https://sinca.mma.gob.cl/index.php/estacion/in...,
172,21 de mayo,Sub Secretaría del Medio Ambiente,Algoritmos y Mediciones Ambientales SpA,del Biobío,Biobío,Los Angeles,733331 E 5849585 N,18,en línea,2012-04-14,236,https://sinca.mma.gob.cl/index.php/estacion/in...,
105,ARMAT,NO INFORMADO,NO INFORMADO,Valparaíso,Valparaíso,Quilpué,273138 E 6341755 N,19,carga manual,2002-01-17,5,https://sinca.mma.gob.cl/index.php/estacion/in...,
204,Alerce,Sub Secretaría del Medio Ambiente,Sub Secretaria del Medio Ambiente,los Lagos,Llanquihue,Puerto Montt,675585 E 5414803 N,19,en línea,2017-03-17,198,https://sinca.mma.gob.cl/index.php/estacion/in...,
1,Alto Hospicio,Ministerio del Medio Ambiente,Algoritmos y Mediciones Ambientales SpA,Tarapacá,Iquique,Alto Hospicio,385118 E 7755989 N,19,en línea,2015-12-17,157,https://sinca.mma.gob.cl/index.php/estacion/in...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,Vialidad,Ministerio del Medio Ambiente,Ministerio del Medio Ambiente,Aysén del General Carlos Ibáñez del Campo,Aysén,Aysén,680977 E 4969547 N,18,en línea,2017-10-20,166,https://sinca.mma.gob.cl/index.php/estacion/in...,
17,Villa Caspana,Codelco Distrito Norte,CIMM Tecnologías y Servicios S.A.,Antofagasta,El Loa,Calama,507410 E 7514595 N,19,carga manual,1994-05-02,3,https://sinca.mma.gob.cl/index.php/estacion/in...,2008-04-08
199,Vivero Los Castaños,Celulosa Arauco y Constitución,Luís Alexis Ortiz Romero,los Ríos,Valdivia,Máfil,660388 E 5601696 N,19,carga manual,2008-01-01,94,https://sinca.mma.gob.cl/index.php/estacion/in...,
113,Viña del Mar,Ministerio del Medio Ambiente,Algoritmos y Mediciones Ambientales SpA,Valparaíso,Valparaíso,Viña del Mar,261803 E 6343569 N,19,en línea,2004-07-01,247,https://sinca.mma.gob.cl/index.php/estacion/in...,


In [29]:
soup.find_all('tbody')[0]

<tbody>
<tr class="even">
<th class="right">Propietario</th>
<td class="left">Ministerio del Medio Ambiente</td>
</tr>
<tr class="odd">
<th class="right">Operador</th>
<td class="left">Algoritmos y Mediciones Ambientales SpA</td>
</tr>
<tr class="even">
<th class="right">Región</th>
<td class="left">de Magallanes y Antártica Chilena</td>
</tr>
<tr class="odd">
<th class="right">Provincia</th>
<td class="left">Magallanes</td>
</tr>
<tr class="even">
<th class="right">Comuna</th>
<td class="left">Punta Arenas</td>
</tr>
<tr class="odd">
<th class="right">Coordenadas UTM</th>
<td class="left">    371526 E 4108390 N
    </td>
</tr>
<tr class="even">
<th class="right">Huso horario</th>
<td class="left">    19    </td>
</tr>
<tr class="odd">
<th class="right">Recepción de datos</th>
<td class="left">en línea</td>
</tr>
<tr class="even">
<th class="right">Inicio de operación reportada</th>
<td class="left">2013-12-04</td>
</tr>
</tbody>

In [23]:
par_cols = [x.get_text() for x in soup.find('div',{'class':'stn-par'}).find_all('th')[:5]]

rows=[]
for parameter in par_cols:
    row={}
    row[par_cols[0]] = 1
    row[par_cols[1]] = 2
    row[par_cols[2]] = 3
    row[par_cols[3]] = 4
    row[par_cols[4]] = 5
    rows.append(row)

pd.DataFrame(rows)

Unnamed: 0,Parámetro,Fecha primer registro,Fecha último registro,Técnica de medición,Gráficos
0,1,2,3,4,5
1,1,2,3,4,5
2,1,2,3,4,5
3,1,2,3,4,5
4,1,2,3,4,5


In [51]:
par_cols = [x.get_text() for x in soup.find('div',{'class':'stn-par'}).find_all('th')[:5]]

rows=[]
for parameter in par_cols:
    row={}
    row[par_cols[0]] = 1
    row[par_cols[1]] = 2
    row[par_cols[2]] = 3
    row[par_cols[3]] = 4
    row[par_cols[4]] = 5
    rows.append(row)

pd.DataFrame(rows)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
PC_title = soup.find('div',{'class':'stn-par'}).find('h2').get_text().strip()
PC_cols = [ x.get_text() for x in soup.find_all('tr',{'class':'medicion'})[0] ]
PC_values = soup.find('table',{'id':'medicion'},{'class':'gob min'}).find_all('tr',{'class':'serie odd master'})

rows=[]
for idx,item in enumerate(PC_values):
    # row={}
    row[PC_cols[0]] = item.find('th').get_text().strip()
    row[PC_cols[1]] = item.find('td',{'class','center'}).text
    row[PC_cols[2]] = item.find_all('td',{'class','center'})[1].text
    row[PC_cols[3]] = item.find('td',{'class','helpTecnica center'}).text.strip()
    row[PC_cols[4]] = "https:"+soup.find('table',{'id':'medicion'},{'class':'gob min'}).find('tr',{'class':'serie odd master'}).find_all('td')[-1].find('a')['href']
    rows.append(row)

pd.DataFrame(rows)

In [None]:
PM_cols = [ x.get_text() for x in soup.find_all('tr',{'class':'medicion'})[1] ]
