In [1]:
import re
import csv
import pandas as pd
from os import walk
from unicodedata import normalize, combining

In [2]:
def convert_temp(temp):
    """
    Rewrite a decimal-comma number string into decimal-point form.

    Parameters
    ----------
    temp : Any
        A raw cell value. Strings (including ``str`` subclasses) have every
        "," replaced by "."; any other value is passed through untouched.

    Returns
    -------
    Any
        The converted string, or ``temp`` itself when it is not a string.
        No numeric conversion happens here; the caller casts to float.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise keep their comma and fail the
    # later float cast.
    if isinstance(temp, str):
        return temp.replace(",", ".")
    return temp

def format_name(name: str):
    """
    Normalize a name into a lowercase, accent-free, single-spaced key.

    Steps, in order:
      1. every character that is not a digit or a letter (plain a-z / A-Z or
         in the accented ranges à-ü / À-Ü) is replaced by a space;
      2. accents are removed by NFKD-decomposing the text and discarding the
         combining marks;
      3. the text is lowercased;
      4. each run of consecutive spaces is collapsed to a single space.

    Leading and trailing spaces are NOT trimmed, so a name that starts or ends
    with punctuation keeps one space at that end.
    """
    spaced = re.sub("([^a-zà-üA-ZÀ-Ü0-9])", " ", name)
    decomposed = normalize("NFKD", spaced)
    folded = "".join(filter(lambda ch: not combining(ch), decomposed)).lower()
    # Two or more spaces in a row become exactly one.
    return re.sub(" {2,}", " ", folded)

## Preparação da base de dados para o ano de 2020

In [3]:
PATH = "../data/data_meteorological/"
columns = ["TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)"]
# Names of the files directly inside PATH; a missing directory yields [].
filenames = next(walk(PATH), (None, None, []))[2]


def _parse_coordinate(fields, prefix_len):
    """
    Rebuild a float from a header row that csv.reader split on ",".

    The first field carries a label of ``prefix_len`` characters followed by
    the integer part; when a second field exists it is used as the fractional
    part (the comma acted as the decimal separator).
    """
    integer_part = fields[0][prefix_len:]
    if len(fields) > 1:
        return float(integer_part + "." + fields[1])
    return float(integer_part)


# One summary record per station file. The frame is built once after the loop
# instead of being grown with pd.concat on every iteration.
station_records = []

for name in filenames:
    if '.DS_Store' in name:
        continue  # macOS folder metadata, not a station file
    filepath = f"{PATH}{name}"

    # Only the first six lines are needed for the station metadata, so stop
    # reading there. The context manager closes the file even if the parsing
    # below raises.
    with open(filepath, encoding='latin-1') as file:
        rows = []
        for row in csv.reader(file):
            rows.append(row)
            if len(rows) == 6:
                break

    # Each slice drops a fixed-length label at the start of the line
    # (presumably "<field name>:;" -- confirm against the raw files).
    station_name = rows[2][0][9:]
    latitude = _parse_coordinate(rows[4], 10)
    longitude = _parse_coordinate(rows[5], 11)

    # header=8: the column names are on the 9th line of the file.
    temperatures = pd.read_csv(
        filepath, sep=";", header=8, encoding='latin-1'
    )[columns[0]]
    if temperatures.empty:
        # No data rows: there is nothing to average for this station.
        continue

    # Mean over all readings of the file; missing readings (NaN) are skipped.
    mean_temperature = temperatures.apply(convert_temp).astype("float").mean()

    station_records.append(
        {
            "Municipio": station_name,
            "Temperatura_media": mean_temperature,
            "Latitude": latitude,
            "Longitude": longitude,
        }
    )

# Explicit columns keep the frame well-formed even when no file was found.
df_stations = pd.DataFrame(
    station_records,
    columns=["Municipio", "Temperatura_media", "Latitude", "Longitude"],
)
df_stations["Municipio"] = df_stations["Municipio"].apply(format_name)
# Drops every row with a missing value, e.g. a station whose readings were
# all NaN and therefore has a NaN mean.
df_stations = df_stations.dropna()

df_stations

Unnamed: 0,Municipio,Temperatura_media,Latitude,Longitude
0,guarda mor,23.356669,-17.561389,-47.199167
1,corumba,26.282203,-18.996667,-57.637500
2,vila velha,23.504628,-20.466944,-40.403889
3,tupa,23.506298,-21.927251,-50.490251
4,rio brilhante,23.428128,-21.774944,-54.528108
...,...,...,...,...
583,campos lindos,28.661224,-8.154722,-46.639444
584,barra,27.322538,-11.084722,-43.138889
585,salvador,25.592511,-13.005515,-38.505760
586,indaial,21.290087,-26.913611,-49.268056


In [4]:
# Summarize the cleaned station table: entry count, non-null count and dtype per column.
df_stations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 492 entries, 0 to 587
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Municipio          492 non-null    object 
 1   Temperatura_media  492 non-null    float64
 2   Latitude           492 non-null    float64
 3   Longitude          492 non-null    float64
dtypes: float64(3), object(1)
memory usage: 19.2+ KB


In [5]:
# Persist the station table. The DataFrame index is written too (to_csv default),
# so the file starts with an unnamed index column.
df_stations.to_csv("../data/data_meteorological.csv")