## Este notebook faz parte da reportagem [Terremotos no Ceará? 7 gráficos sobre os tremores de terra no Estado](https://mais.opovo.com.br/reportagens-especiais/2023/05/18/terremotos-no-ceara-7-graficos-sobre-os-tremores-de-terra-no-estado.html?collection=reportagens_de_dados)

In [1]:
from io import StringIO
from pathlib import Path
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Download the bulletin page and parse its HTML.
# The response is opened as a context manager so the HTTP connection is
# closed as soon as the document has been parsed.
with urlopen('https://labsis.ufrn.br/boletins') as page_info:
  soup = BeautifulSoup(page_info, 'html.parser')

# Collect every <table> element in the document (find_all is the current
# name of the deprecated findAll alias).
tables = soup.find_all("table")

In [3]:
# Keep only tables that are not nested inside another <table>, and parse
# each of them into a DataFrame.
tabelas_validas = []
dataframes = []
for table in tables:
  if table.find_parent("table") is None:
    tabelas_validas.append(table)
    # read_html returns a list of frames, hence the concat. The markup is
    # wrapped in StringIO because passing literal HTML as a string is
    # deprecated in recent pandas releases.
    # The first row of the concatenated result is dropped.
    dataframes.append(pd.concat(pd.read_html(StringIO(str(table)))).iloc[1:,:])

In [4]:
# Stack the per-table frames into one DataFrame. The index is not reset, so
# each row keeps the label it had inside its own table (labels repeat).
df = pd.concat(dataframes)

In [5]:
# Remove the rows whose first cell is the literal column title 'Data',
# then give the five positional columns their names.
eh_cabecalho = df[0].eq('Data')
df = df.loc[~eh_cabecalho]
df.columns = [
  'Data',
  'Hora',
  'Magnitude',
  'Coordenadas',
  'Local',
]

In [6]:
# Display the combined table after header rows were removed and columns named.
df

Unnamed: 0,Data,Hora,Magnitude,Coordenadas,Local
2,2023.10.05,09:40:12,2.1*,-9.85; -39.91,Jaguarari BA
3,2023.10.03,05:15:35,2.1*,-9.84; -39.87,Curaçá BA
4,2023.10.02,17:54:18,1.7*,-11.23; -40.52,Jacobina BA
5,2023.10.01,12:56:30,1.9*,-9.87; -39.93,Jaguarari BA
6,2023.10.01,01:35:01,1.7*,-8.28; -36.03,Caruaru PE
...,...,...,...,...,...
20,2020.12.08,03:22:27,1.6*,-8.05 S; -35.17 W,São Lourenço da Mata PE
21,2020.12.04,07:08:30,3.9*,-3.09 S; -44.55 W,Itapecuru-Mirim MA
22,2020.12.02,11:42:26,1.5*,-6.79 S; -36.12 W,Barra de Santa Rosa PB
23,2020.12.01,15:47:00,1.5*,-13.06 S; -39.49 W,São Miguel das Matas BA


In [7]:
# Keep the first occurrence of each fully identical row and discard repeats.
linhas_repetidas = df.duplicated()
df = df.loc[~linhas_repetidas]

In [8]:
# Print dtypes and non-null counts per column to spot incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1415 entries, 2 to 25
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Data         1415 non-null   object
 1   Hora         1405 non-null   object
 2   Magnitude    1405 non-null   object
 3   Coordenadas  1405 non-null   object
 4   Local        1405 non-null   object
dtypes: object(5)
memory usage: 66.3+ KB


In [9]:
# Inspect the rows that have no time value.
df[df['Hora'].isnull()]

Unnamed: 0,Data,Hora,Magnitude,Coordenadas,Local
3,2023.05.31,,,,
4,05:03:16,,,,
5,-8.39; -36.33,,,,
3,23:55:22,,,,
4,2.1*,,,,
5,-9.9 S; -39.88 W,,,,
7,15:42:17,,,,
8,1.9*,,,,
9,-9.92 S; -36.76 W,,,,
10,Girau do Ponciano AL,,,,


In [10]:
# Look up the complete rows recorded for one of the places that also shows
# up among the incomplete rows.
df[df['Local'].eq('Girau do Ponciano AL')]

Unnamed: 0,Data,Hora,Magnitude,Coordenadas,Local
6,2022.03.31,15:42:17,1.9*,-9.92 S; -36.76 W,Girau do Ponciano AL
17,2022.02.17,15:01:59,1.7*,-9.99 S; -36.73 W,Girau do Ponciano AL
3,2021.12.30,14:59:17,1.5*,-9.76 S; -36.79 W,Girau do Ponciano AL


In [11]:
# Discard rows that have no time value; such rows hold a single stray field
# in 'Data' and NaN in every other column.
# .copy() makes the result an independent frame, so the column assignments
# performed in later cells do not raise SettingWithCopyWarning.
df = df.loc[df['Hora'].notna()].copy()

In [12]:
# Display the table after incomplete rows were removed.
df

Unnamed: 0,Data,Hora,Magnitude,Coordenadas,Local
2,2023.10.05,09:40:12,2.1*,-9.85; -39.91,Jaguarari BA
3,2023.10.03,05:15:35,2.1*,-9.84; -39.87,Curaçá BA
4,2023.10.02,17:54:18,1.7*,-11.23; -40.52,Jacobina BA
5,2023.10.01,12:56:30,1.9*,-9.87; -39.93,Jaguarari BA
6,2023.10.01,01:35:01,1.7*,-8.28; -36.03,Caruaru PE
...,...,...,...,...,...
21,2020.12.08,03:22:27,1.6*,-8.05 S; -35.17 W,São Lourenço da Mata PE
22,2020.12.04,07:08:30,3.9*,-3.09 S; -44.55 W,Itapecuru-Mirim MA
23,2020.12.02,11:42:26,1.5*,-6.79 S; -36.12 W,Barra de Santa Rosa PB
24,2020.12.01,15:47:00,1.5*,-13.06 S; -39.49 W,São Miguel das Matas BA


In [13]:
# Parse the 'YYYY.MM.DD' text in 'Data' into a datetime column named 'dtg'.
df = df.assign(dtg=pd.to_datetime(df['Data'], format='%Y.%m.%d'))

In [14]:
# Split the parsed date into separate year / month / day columns.
partes_data = df['dtg'].dt
df['Ano'] = partes_data.year
df['Mês'] = partes_data.month
df['Dia'] = partes_data.day

In [15]:
# The state abbreviation is taken from the last two characters of 'Local'.
df['UF'] = df['Local'].str.slice(start=-2)

In [16]:
# Count events per extracted abbreviation to reveal unexpected values.
df['UF'].value_counts()

BA    475
CE    277
RN    228
PE    199
AL    120
PB     48
SE     38
PI      5
MA      3
PA      3
Al      2
co      2
AC      2
TO      1
AM      1
an      1
Name: UF, dtype: int64

In [17]:
# Show the rows whose extracted abbreviation is 'co' or 'an'.
df.loc[df['UF'].isin(['co', 'an'])]

Unnamed: 0,Data,Hora,Magnitude,Coordenadas,Local,dtg,Ano,Mês,Dia,UF
57,2021.03.03,03:43:26,3.1*,-16.94 S; -38.08 W,Sul do Oceano Atlântico,2021-03-03,2021,3,3,co
58,2021.03.03,03:35:17,3.5*,-16.89 S; -37.32 W,Sul do Oceano Atlântico,2021-03-03,2021,3,3,co
10,2020.10.27,16:09:14,2.3*,-14.25 S; -38.79 W,Mar da Bahia Ocean,2020-10-27,2020,10,27,an


In [18]:
# Rows whose 'Local' ends in 'co' / 'an' are ocean locations without a state
# abbreviation, so their UF is marked as undefined.
df.loc[df['UF'].isin(['co', 'an']), 'UF'] = 'Indefinido'
# 'Al' is listed separately from 'AL' in the counts; presumably a typo for the
# same state, so it is folded into 'AL' (only UF is changed, 'Local' is not).
df.loc[df['UF'] == 'Al', 'UF'] = 'AL'

In [19]:
# Latitude is the text before ';' with the hemisphere letters removed.
lat_bruta = df['Coordenadas'].apply(
  lambda coord: float(coord.split(';')[0].replace('S', '').replace('N', ''))
)
# Any positive value is flipped to negative; values that are already <= 0 are kept.
# NOTE(review): this also forces values marked 'N' to the south - confirm
# that no event in the source lies north of the equator.
df['latitude'] = lat_bruta.where(lat_bruta <= 0, -lat_bruta)

In [20]:
# Longitude is the text after ';' with the hemisphere letters removed;
# the sign present in the source text is kept as is.
df['longitude'] = df['Coordenadas'].apply(
  lambda coord: float(coord.split(';')[1].replace('W', '').replace('E', ''))
)

In [21]:
# Note column for magnitudes that carry a '*' marker.
# The default is written to 'Obs' - the same column the flag is written to and
# the one selected for export - instead of a separate lower-case 'obs' column,
# so unflagged rows hold '' rather than NaN.
df['Obs'] = ''
# regex=False matches the asterisk literally, avoiding the invalid "\*" escape.
df.loc[df['Magnitude'].str.contains('*', regex=False), 'Obs'] = 'Magnitude não revisada/preliminar'

In [22]:
# Strip the '*' marker and convert the magnitude text to float.
df['Magnitude'] = df['Magnitude'].apply(
  lambda texto: float(texto.replace('*', ''))
)

In [23]:
def remove_sigla_uf(nome):
  """Return `nome` without its trailing two-letter upper-case abbreviation.

  Parameters:
    nome: place name, optionally ending in an upper-case abbreviation
      (e.g. 'Jaguarari BA').

  Returns:
    The name with the last two characters and any trailing whitespace
    removed when those two characters are upper-case; otherwise `nome`
    unchanged.
  """
  sigla = nome[-2:]
  if sigla.isupper():
    # Slice the suffix off instead of str.replace, which would also delete
    # the same two letters anywhere else in the name ('PEDRA PE' -> 'DRA').
    return nome[:-2].rstrip()
  return nome

In [24]:
# Try the helper on the 'Local' value of the first row.
remove_sigla_uf(df['Local'].iloc[0])

'Jaguarari'

In [25]:
# Strip the trailing abbreviation from every place name.
df['Local'] = df['Local'].apply(remove_sigla_uf)

In [26]:
# Columns kept in the treated dataset, in export order.
colunas_exportadas = [
  'dtg', 'Ano', 'Mês', 'Dia',
  'UF', 'Local',
  'latitude', 'longitude',
  'Magnitude', 'Obs',
]
df_tratado = df.loc[:, colunas_exportadas]

In [27]:
# Display the treated dataset that will be exported.
df_tratado

Unnamed: 0,dtg,Ano,Mês,Dia,UF,Local,latitude,longitude,Magnitude,Obs
2,2023-10-05,2023,10,5,BA,Jaguarari,-9.85,-39.91,2.1,Magnitude não revisada/preliminar
3,2023-10-03,2023,10,3,BA,Curaçá,-9.84,-39.87,2.1,Magnitude não revisada/preliminar
4,2023-10-02,2023,10,2,BA,Jacobina,-11.23,-40.52,1.7,Magnitude não revisada/preliminar
5,2023-10-01,2023,10,1,BA,Jaguarari,-9.87,-39.93,1.9,Magnitude não revisada/preliminar
6,2023-10-01,2023,10,1,PE,Caruaru,-8.28,-36.03,1.7,Magnitude não revisada/preliminar
...,...,...,...,...,...,...,...,...,...,...
21,2020-12-08,2020,12,8,PE,São Lourenço da Mata,-8.05,-35.17,1.6,Magnitude não revisada/preliminar
22,2020-12-04,2020,12,4,MA,Itapecuru-Mirim,-3.09,-44.55,3.9,Magnitude não revisada/preliminar
23,2020-12-02,2020,12,2,PB,Barra de Santa Rosa,-6.79,-36.12,1.5,Magnitude não revisada/preliminar
24,2020-12-01,2020,12,1,BA,São Miguel das Matas,-13.06,-39.49,1.5,Magnitude não revisada/preliminar


In [29]:
# Write the treated dataset to CSV (the index is written as the first column).
# to_csv does not create missing folders, so the output directory is created
# first to keep a fresh checkout from failing here.
caminho_saida = Path('resultados/coleta_op_tremores_ne_labSis.csv')
caminho_saida.parent.mkdir(parents=True, exist_ok=True)
df_tratado.to_csv(caminho_saida)