# EDA

In [1]:
import io
import json
import os
import shutil

import boto3
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from shapely.geometry import Point
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from unidecode import unidecode

In [2]:
# AWS credentials to read files on S3 bucket.
# The context manager closes the credentials file as soon as it has been parsed
# (the previous bare open() left the handle open for the kernel's lifetime).
with open('../credentials.json') as f:
    credentials = json.load(f)

# Low-level client: used for get_object / list_objects / download_file.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"],
    region_name="us-west-1"
)

# Resource API: used for bucket listing and Object.put.
s3_resource = boto3.resource(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
)

In [3]:
def remove_outliers(df, feature):
    """Drop the rows whose `feature` value lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    feature : str
        Name of the column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        Rows of `df` whose `feature` value is strictly between
        Q1 - 1.5 * IQR and Q3 + 1.5 * IQR. Rows with a missing value in
        `feature` fail both comparisons and are therefore dropped as well.
    """
    values = df[feature]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    # Strict inequalities: values exactly on a fence are removed.
    inside_fences = values.gt(q1 - whisker) & values.lt(q3 + whisker)
    return df[inside_fences]

In [4]:
# getting preprocessed data
# For every per-station file under "processed/" build one row with the station's
# long-term mean of the daily aggregates (daily max temperature + daily means).
stations = []
ts_min = []
ts_max = []

# Objects under "processed/" that are written by this notebook (or downstream
# steps) and must not be read back as station inputs.
derived_keys = [
    'processed/cge_clusters.csv',
    'processed/cluster_A.csv',
    'processed/cluster_B.csv',
    'processed/cluster_C.csv',
    'processed/cluster_D.csv',
]

# Columns averaged per day. Wind columns are only used when a file has all of them.
base_columns = [
    "precipitation_lag_1H",
    "temperature_lag_1H",
    "relative_humidity_lag_1H",
    "pressure_lag_1H",
]
wind_columns = [
    "wind_velocity_x_lag_1H",
    "wind_velocity_y_lag_1H",
    "wind_blow_x_lag_1H",
    "wind_blow_y_lag_1H",
]

df_full = []
prefix_objs = s3_resource.Bucket("cge").objects.filter(Prefix="processed")
keys = [obj.key for obj in prefix_objs]
# keys[0] is skipped — presumably the "processed/" folder placeholder; TODO confirm.
for key in keys[1:]:
    if key in derived_keys:
        continue

    obj = s3_client.get_object(Bucket="cge", Key=key)
    df = pd.read_csv(io.BytesIO(obj["Body"].read()))
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df["date"] = df.timestamp.dt.date
    df = remove_outliers(df=df, feature='temperature_lag_1H')

    # Nothing left after outlier removal: there is no station name to record.
    if df.empty:
        continue

    # Decide the column set by inspecting the file instead of catching whatever
    # exception a missing column happens to raise (a bare `except:` also hid
    # unrelated errors and KeyboardInterrupt).
    if all(column in df.columns for column in wind_columns):
        colunas_media = base_columns + wind_columns
    else:
        colunas_media = base_columns

    daily = df.groupby(["station_name", "date"])
    df_grouped = daily[["temperature_lag_1H"]].max()
    df_grouped = df_grouped.rename(columns={"temperature_lag_1H": "temperature_max_lag_1H"})
    for coluna in colunas_media:
        df_grouped[coluna] = daily[coluna].mean()

    # Collapse the daily values into a single mean row per station.
    df_grouped = df_grouped.reset_index()
    df_grouped = df_grouped.drop(columns=['date'])
    df_grouped = df_grouped.groupby("station_name", as_index=False).mean()

    stations.append(df.station_name.unique()[0])
    ts_min.append(df['timestamp'].min())
    ts_max.append(df['timestamp'].max())

    df_full.append(df_grouped)

df_grouped = pd.concat(df_full)
# df_grouped.to_csv('df_grouped.csv', index=False)
df_grouped

Unnamed: 0,station_name,temperature_max_lag_1H,precipitation_lag_1H,temperature_lag_1H,relative_humidity_lag_1H,pressure_lag_1H,wind_velocity_x_lag_1H,wind_velocity_y_lag_1H,wind_blow_x_lag_1H,wind_blow_y_lag_1H
0,Anhembi,25.768794,1.86034,20.799166,75.495413,930.83013,-0.109271,0.431419,-0.389286,0.775412
0,Butantã,26.824583,2.031074,20.445426,80.674328,933.046437,0.021682,0.003436,0.036568,0.034553
0,Campo Limpo,26.142013,1.704243,20.769885,78.881693,931.805123,-0.226799,0.213843,-0.541806,0.612968
0,Capela do Socorro,25.68003,1.739865,18.89356,89.509712,934.908132,,,,
0,Cidade Ademar,25.976241,1.771956,20.601012,76.677048,930.910506,0.182938,-0.399232,0.38062,-0.731789
0,Freguesia do Ó,26.614054,2.104188,20.627032,79.192017,927.135188,-0.204011,0.027599,-0.816676,0.064039
0,Ipiranga,25.570956,1.879577,20.173128,81.060309,930.197752,,,,
0,Itaim Paulista,26.111756,1.808285,20.263566,81.01576,935.127296,0.086187,0.043551,0.469129,0.264577
0,Itaquera,25.51341,1.845294,19.936505,81.154465,933.188061,,,,
0,Jabaquara,24.368531,2.063883,19.194426,73.223958,930.661411,,,,


In [5]:
# Keep only the urban stations and the three meteorological variables used for
# clustering; the "_lag_1H" suffix is stripped from every column name.
# (A cached copy can be loaded with pd.read_csv('df_grouped.csv') instead.)
not_clustered = [
    "temperature_max",
    "pressure",
    "wind_velocity_x",
    "wind_velocity_y",
    "wind_blow_x",
    "wind_blow_y",
]

df_grouped = (
    # na=True makes rows without a station name count as matches, so they are
    # dropped together with Parelheiros.
    df_grouped[~df_grouped["station_name"].str.contains("Parelheiros", na=True)]
    .rename(columns=lambda name: name.split("_lag_1H")[0])
    .rename(columns={"station_name": "station"})
    .drop(columns=not_clustered)
)

df_grouped

Unnamed: 0,station,precipitation,temperature,relative_humidity
0,Anhembi,1.86034,20.799166,75.495413
0,Butantã,2.031074,20.445426,80.674328
0,Campo Limpo,1.704243,20.769885,78.881693
0,Capela do Socorro,1.739865,18.89356,89.509712
0,Cidade Ademar,1.771956,20.601012,76.677048
0,Freguesia do Ó,2.104188,20.627032,79.192017
0,Ipiranga,1.879577,20.173128,81.060309
0,Itaim Paulista,1.808285,20.263566,81.01576
0,Itaquera,1.845294,19.936505,81.154465
0,Jabaquara,2.063883,19.194426,73.223958


In [6]:
# Station coordinates table. Note that this rebinds `stations`, which the
# data-loading cell above filled with a plain list of station names.
stations = pd.read_csv("estacoes_meteorologicas_cge.csv")

# Export name and coordinates as a LaTeX table.
latex_columns = ['station', 'lat', 'lon']
stations.loc[:, latex_columns].to_latex('stations_table.tex', index=False)

stations

  stations[['station', 'lat', 'lon']].to_latex('stations_table.tex', index=False)


Unnamed: 0,station,lat,lon
0,Sé,-23.552718,-46.656168
1,Anhembi,-23.518626,-46.643757
2,Vila Mariana,-23.58472,-46.63556
3,Ipiranga,-23.632978,-46.583518
4,Campo Limpo,-23.65818,-46.76749
5,M'Boi Mirim,-23.671266,-46.727361
6,Santo Amaro,-23.634789,-46.667657
7,Jabaquara,-23.650814,-46.646581
8,Cidade Ademar,-23.66767,-46.67416
9,Capela do Socorro,-23.781133,-46.725217


In [14]:
def get_uhct_radius_percentage(uhct_feature, buffer_radius=500):
    """Compute the land-use composition around every weather station.

    Downloads the objects under ``shps/UHCT_112015_v2/`` of the "cge" bucket to
    a local folder, intersects the layer with a circular buffer around each row
    of the module-level ``stations`` table and returns, per station, the
    percentage of the intersected area that falls in each value of
    `uhct_feature`.

    Parameters
    ----------
    uhct_feature : str
        Column of the shapefile holding the land-use class.
    buffer_radius : float, default 500
        Buffer radius in metres, converted to degrees with 1 degree ~ 111.32 km.

    Returns
    -------
    pandas.DataFrame
        One row per station: one column per class found around it, the
        ``index`` column left by the pivot, and ``station``.

    Notes
    -----
    The local folder is removed when the function finishes, including when it
    fails part-way. The previous ``os.remove`` call targeted the directory
    itself, which raises instead of deleting it.
    """
    local_folder = "/tmp/UHCT_112015_v2"
    metres_per_degree = 111320  # approximately 1 degree = 111.32 km

    # Make sure the local directory exists.
    os.makedirs(local_folder, exist_ok=True)

    df_uhct = []
    try:
        response = s3_client.list_objects(Bucket="cge", Prefix="shps/UHCT_112015_v2/")
        for obj in response.get("Contents", []):
            # Only download files; keys ending in "/" are directory placeholders.
            if obj["Key"][-1] != "/":
                local_file_path = os.path.join(local_folder, os.path.basename(obj["Key"]))
                s3_client.download_file("cge", obj["Key"], local_file_path)

        # Load the shapefile once; it does not depend on the station.
        gdf = gpd.read_file(local_folder)

        station_rows = stations[['station', 'lat', 'lon']].itertuples(index=False, name=None)
        for station_name, latitude, longitude in station_rows:
            # Circular buffer around the station, expressed in degrees.
            # NOTE(review): the buffer is built from lon/lat degrees but tagged
            # with gdf.crs — this assumes the layer uses a geographic CRS; confirm.
            station_point = Point(longitude, latitude)
            buffer = station_point.buffer(buffer_radius / metres_per_degree)

            # Select the polygons that intersect the buffer.
            buffer_gdf = gpd.GeoDataFrame(geometry=[buffer], crs=gdf.crs)
            intersected_polygons = gpd.overlay(gdf, buffer_gdf, how="intersection")

            # Area of each clipped polygon and total area inside the buffer.
            intersected_polygons["area"] = intersected_polygons.geometry.area * metres_per_degree**2
            total_area = intersected_polygons["area"].sum()

            # Share of each land-use class, in percent.
            percentages = (intersected_polygons.groupby(uhct_feature)["area"].sum() / total_area) * 100

            df = pd.DataFrame(percentages).reset_index()

            # One row, one column per class.
            df_pivot = pd.pivot_table(
                df,
                values='area',
                columns=[uhct_feature]
            ).reset_index()

            df_pivot['station'] = station_name
            df_uhct.append(df_pivot)
    finally:
        # Always free the disk space taken by the downloaded layer.
        shutil.rmtree(local_folder, ignore_errors=True)

    uhct_perc = pd.concat(df_uhct)
    return uhct_perc

uhct_perc = get_uhct_radius_percentage(uhct_feature="Uso_ocup")

OSError: [Errno 28] No space left on device

In [None]:
# Land-use classes that are not used as clustering features, plus the "index"
# column left behind by the pivot.
unused_land_use_columns = [
    "agua",
    "grandes equipamentos",
    "loteamento",
    "mata",
    "area desocupada",
    "index",
]
uhct_perc = uhct_perc.drop(columns=unused_land_use_columns)

# Round trip through the index: moves "station" to the first column.
uhct_perc = uhct_perc.set_index("station").reset_index(level="station")

uhct_perc

In [None]:
# Attach the land-use percentages, then the station coordinates.
# NOTE(review): fillna(0) runs on the whole outer-merged frame, so a station
# present only in `uhct_perc` also gets 0 for its weather columns — confirm
# that every station appears on both sides.
with_land_use = df_grouped.merge(uhct_perc, on="station", how="outer").fillna(0)
with_coordinates = with_land_use.merge(stations, on="station", how="outer")

# na=True: rows without a station name are dropped along with Parelheiros.
is_rural_reference = with_coordinates["station"].str.contains("Parelheiros", na=True)
df_grouped = with_coordinates[~is_rural_reference]

df_grouped

In [None]:
# Parelheiros temperature series, renamed to `temperature_rural`.
obj = s3_client.get_object(Bucket="cge", Key='processed/parelheiros.csv')
parelheiros_raw = pd.read_csv(io.BytesIO(obj["Body"].read()))

df_parelheiros = remove_outliers(
    df=(
        parelheiros_raw[["timestamp", "temperature"]]
        .rename(columns={"temperature": "temperature_rural"})
        .dropna()
    ),
    feature='temperature_rural',
)

df_parelheiros

In [None]:
# Prepare the data: the five features used for clustering.
clustering_features = [
    'temperature',
    'precipitation',
    'relative_humidity',
    'residencial comercial servicos',
    'espaco verde urbano',
]
X = df_grouped[clustering_features].to_numpy()

# Standardise the features, then project them onto two principal components.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_norm)

# Elbow method: inertia of K-Means for 1 to 10 clusters.
n_clusters = range(1, 11)
inertias = []
for n in n_clusters:
    kmeans = KMeans(n_clusters=n, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(X_pca)
    inertias.append(kmeans.inertia_)

# Plot the elbow curve.
elbow_fig, elbow_ax = plt.subplots(figsize=(6, 6))
elbow_ax.plot(n_clusters, inertias, 'bo-')
elbow_ax.set_xlabel('Número de Clusters')
elbow_ax.set_ylabel('Inércia')
elbow_ax.set_title('Método do Cotovelo')
elbow_ax.grid()
elbow_fig.savefig('../figures/elbow_method-pt.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Cluster the stations with the chosen number of clusters.
k = 4  # chosen number of clusters
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X_pca)

centroids = kmeans.cluster_centers_
print(centroids)

# Principal components (columns 0 and 1) per station, plus the K-Means label.
kmeans_output = pd.DataFrame(data=X_pca, index=df_grouped.station)
kmeans_output["cluster"] = y_kmeans

# (K-Means label, legend text, marker, colour), listed in legend order.
# NOTE(review): the label -> letter assignment is hard-coded; it holds only as
# long as K-Means keeps returning the same numbering for this data.
cluster_styles = [
    (1, 'Cluster A', "^", 'red'),
    (0, 'Cluster B', "s", 'deepskyblue'),
    (2, 'Cluster C', "D", 'magenta'),
    (3, 'Cluster D', "o", 'lime'),
]

# Plot the clusters found.
cluster_fig, cluster_ax = plt.subplots(figsize=(6, 6))

for cluster_id, cluster_label, cluster_marker, cluster_color in cluster_styles:
    members = kmeans_output[kmeans_output.cluster == cluster_id]
    cluster_ax.scatter(
        members[0],
        members[1],
        marker=cluster_marker,
        s=80,
        label=cluster_label,
        facecolors='none',
        edgecolors=cluster_color
    )

# Station names next to their points.
for idx, row in kmeans_output.iterrows():
    cluster_ax.text(row[0]+.06, row[1]+.06, idx, size=8)

# A black "+" supplies the single legend entry for centroids; the coloured
# markers drawn afterwards (one per K-Means label) cover it.
cluster_ax.scatter(centroids[0][0], centroids[0][1], marker="+", s=200, c='black', label='Centróide do cluster')
for cluster_id, _, _, cluster_color in sorted(cluster_styles):
    cluster_ax.scatter(
        centroids[cluster_id][0],
        centroids[cluster_id][1],
        marker="+",
        s=200,
        c=cluster_color,
        label='_nolegend_'
    )

cluster_ax.set_xlabel('Componente 1')
cluster_ax.set_ylabel('Componente 2')
cluster_ax.set_title('Clusterização com K-Means')
cluster_ax.grid()
cluster_ax.legend(prop={'size': 10})
cluster_fig.savefig('../figures/clusters-pt.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# K-Means label -> cluster letter used in the figures and tables.
cluster_letters = {
    0: "B",
    1: "A",
    2: "C",
    3: "D",
}

# Add each station's K-Means label, then translate it to its letter.
station_labels = kmeans_output.reset_index()[["station", "cluster"]]
df_grouped = df_grouped.merge(station_labels, on="station")
df_grouped['cluster'] = df_grouped['cluster'].map(cluster_letters)

df_grouped

In [None]:
# writing data to S3 bucket
# The station/cluster table is serialised to CSV in memory (index included)
# and uploaded as processed/cge_clusters.csv.
buffer = io.StringIO()
station_cluster_table = df_grouped[["station", "cluster"]]
station_cluster_table.to_csv(buffer)

clusters_object = s3_resource.Object("cge", "processed/cge_clusters.csv")
clusters_object.put(Body=buffer.getvalue())

In [None]:
import contextily as ctx

masp_cities = [
    "Mauá",
    "São Bernardo do Campo",
    "Santana de Parnaíba",
]

epsg = 4326
map_crs = f"EPSG:{epsg}"

# Municipality outlines (restricted to the cities above) and São Paulo districts.
sp = gpd.read_file('SP_Municipios_2022').to_crs(map_crs)
sp_masp_cities = sp[sp.NM_MUN.isin(masp_cities)]

sp_cidade = gpd.read_file('LAYER_DISTRITO').to_crs(map_crs)

# (cluster letter, marker, colour), in legend order.
map_cluster_styles = [
    ("A", "^", 'red'),
    ("B", "s", 'deepskyblue'),
    ("C", "D", 'magenta'),
    ("D", "o", 'lime'),
]

# One GeoDataFrame of station points per cluster, created in EPSG:4326.
# The to_crs call targets the same EPSG code, so coordinates stay in lon/lat
# (the map is drawn in EPSG:4326, not Web Mercator).
cluster_points = {}
for cluster_letter, _, _ in map_cluster_styles:
    cluster_rows = df_grouped[df_grouped.cluster == cluster_letter]
    cluster_gdf = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(cluster_rows["lon"].tolist(), cluster_rows["lat"].tolist()),
        crs=map_crs
    )
    cluster_points[cluster_letter] = cluster_gdf.to_crs(epsg=epsg)

# Plot the outlines.
fig, ax = plt.subplots(figsize=(10, 10))
sp_masp_cities.plot(ax=ax, edgecolor='black', facecolor='none')
sp_cidade.plot(ax=ax, edgecolor='black', facecolor='none')

# Add the satellite imagery with Contextily.
ctx.add_basemap(
    ax,
    source=ctx.providers.Esri.WorldImagery,
    crs=map_crs
)

# Plot the weather stations, one series per cluster.
for cluster_letter, cluster_marker, cluster_color in map_cluster_styles:
    points = cluster_points[cluster_letter]
    ax.scatter(
        points.geometry.x,
        points.geometry.y,
        marker=cluster_marker,
        s=100,
        label=f'Cluster {cluster_letter}',
        facecolors='none',
        edgecolors=cluster_color
    )

# Rural reference station.
ax.scatter(
    -46.652222,
    -23.867778,
    marker="x",
    s=100,
    label='Rural',
    color='yellow'
)

# Limit the view to the bounding box of the selected municipalities.
city_bounds = sp_masp_cities.bounds
ax.set_xlim(city_bounds.minx.min(), city_bounds.maxx.max())
ax.set_ylim(city_bounds.miny.min(), city_bounds.maxy.max())

# Title and axis labels.
ax.set_title('Região Metropolitana de São Paulo', fontdict={'fontsize': 15})
ax.set_xlabel('Lon')
ax.set_ylabel('Lat')

# Legend and grid.
ax.legend()
ax.grid()

# Station name labels; two stations get a custom (lon, lat) offset.
default_label_offset = (0.0075, 0.0075)
custom_label_offsets = {
    'Itaim Paulista': (-0.06, 0.0075),
    'São Miguel Paulista': (-0.085, -0.015),
}
for idx, row in stations.set_index('station').iterrows():
    lon_offset, lat_offset = custom_label_offsets.get(idx, default_label_offset)
    ax.text(
        row['lon'] + lon_offset,
        row['lat'] + lat_offset,
        idx,
        size=8,
        color='white'
    )

# Save and show the figure.
fig.savefig('../figures/masp_clusters-pt.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Box plots of each clustering feature, one panel per feature, one box per cluster.

# Source column -> display name used in the figure and in the summary tables.
boxplot_columns = {
    'temperature': "Temperatura",
    'precipitation': "Precipitação",
    'relative_humidity': "Umidade relativa",
    'residencial comercial servicos': 'Residencial/Comercial/Serviços',
    'espaco verde urbano': 'Áreas verdes urbanas',
}

# rename() returns a new frame, so no labels are assigned onto a selection of df_grouped.
df_grouped_boxplot = (
    df_grouped[list(boxplot_columns) + ['cluster']]
    .rename(columns=boxplot_columns)
)

# Long display names are wrapped so the panel titles do not overlap.
panel_titles = {
    "Umidade relativa": "Umidade\nrelativa",
    "Residencial/Comercial/Serviços": "Residencial/\nComercial/\nServiços",
    "Áreas verdes urbanas": "Áreas verdes\nurbanas",
}
# Y-axis unit; every feature not listed here is a percentage.
panel_units = {
    "Precipitação": "mm",
    "Temperatura": "°C",
}

# Group names (cluster letters), in plotting order.
nomes_grupos = ['A', 'B', 'C', 'D']

# Named `variables` rather than `vars`, which would shadow the builtin.
variables = list(boxplot_columns.values())
ncols = len(variables)
fig, axs = plt.subplots(nrows=1, ncols=ncols, figsize=(8, 3))

for panel_ax, variable in zip(axs, variables):
    # One series per cluster.
    dados = [
        df_grouped_boxplot.loc[df_grouped_boxplot.cluster == grupo, variable]
        for grupo in nomes_grupos
    ]

    panel_ax.boxplot(dados, labels=nomes_grupos)

    panel_ax.set_title(panel_titles.get(variable, variable), fontdict={'fontsize': 10})
    panel_ax.set_xlabel('')
    panel_ax.set_ylabel(panel_units.get(variable, "%"))

fig.subplots_adjust(
    left=0.08,
    right=0.98,
    bottom=0.05,
    top=0.9,
    wspace=1
)

plt.savefig('../figures/boxplot_clusters_parameters-pt.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Summary statistics of the "Temperatura" column, per cluster.
df_grouped_boxplot.groupby("cluster")["Temperatura"].describe()

In [None]:
# Summary statistics of the "Precipitação" column, per cluster.
df_grouped_boxplot.groupby("cluster")["Precipitação"].describe()

In [None]:
# Summary statistics of the "Umidade relativa" column, per cluster.
df_grouped_boxplot.groupby("cluster")["Umidade relativa"].describe()

In [None]:
# Summary statistics of the "Residencial/Comercial/Serviços" column, per cluster.
df_grouped_boxplot.groupby("cluster")["Residencial/Comercial/Serviços"].describe()

In [None]:
# Summary statistics of the "Áreas verdes urbanas" column, per cluster.
df_grouped_boxplot.groupby("cluster")["Áreas verdes urbanas"].describe()

In [None]:
# getting preprocessed data
# Re-read every per-station file and collect its hourly rows into the list of
# the cluster the station belongs to.
df_full_0 = []
df_full_1 = []
df_full_2 = []
df_full_3 = []

# Objects under "processed/" that are outputs, not per-station inputs.
derived_keys = [
    'processed/cge_clusters.csv',
    'processed/cluster_A.csv',
    'processed/cluster_B.csv',
    'processed/cluster_C.csv',
    'processed/cluster_D.csv',
]

id_columns = ["timestamp", "station_name"]
base_columns = [
    "precipitation_lag_1H",
    "temperature_lag_1H",
    "relative_humidity_lag_1H",
    "pressure_lag_1H",
]
# Wind columns are kept only when a file provides all of them.
wind_columns = [
    "wind_velocity_x_lag_1H",
    "wind_velocity_y_lag_1H",
    "wind_blow_x_lag_1H",
    "wind_blow_y_lag_1H",
]

# Destination list and member stations of each cluster, computed once instead
# of being rebuilt from df_grouped for every file.
cluster_frames = {"A": df_full_0, "B": df_full_1, "C": df_full_2, "D": df_full_3}
cluster_members = {
    letter: set(df_grouped[df_grouped.cluster == letter].station)
    for letter in cluster_frames
}

prefix_objs = s3_resource.Bucket("cge").objects.filter(Prefix="processed")
keys = [obj.key for obj in prefix_objs]
# keys[0] is skipped — presumably the "processed/" folder placeholder; TODO confirm.
for key in keys[1:]:
    if key in derived_keys:
        continue

    obj = s3_client.get_object(Bucket="cge", Key=key)
    df = pd.read_csv(io.BytesIO(obj["Body"].read()))
    df = remove_outliers(df=df, feature='temperature_lag_1H')

    # Nothing left after outlier removal: there is no station name to look up.
    if df.empty:
        continue

    # Choose the columns by inspecting the file rather than catching whatever
    # a failed selection raises (the bare `except:` also hid unrelated errors).
    selected_columns = id_columns + base_columns
    if all(column in df.columns for column in wind_columns):
        selected_columns = selected_columns + wind_columns
    df_2 = df[selected_columns]

    station_name = df_2.station_name.unique()[0]
    for letter, frames in cluster_frames.items():
        if station_name in cluster_members[letter]:
            frames.append(df_2)

df_cluster_A = pd.concat(df_full_0)
# df_cluster_A.to_csv("df_cluster_A.csv", index=False)

df_cluster_B = pd.concat(df_full_1)
# df_cluster_B.to_csv("df_cluster_B.csv", index=False)

df_cluster_C = pd.concat(df_full_2)
# df_cluster_C.to_csv("df_cluster_C.csv", index=False)

df_cluster_D = pd.concat(df_full_3)
# df_cluster_D.to_csv("df_cluster_D.csv", index=False)

In [None]:
# df_cluster_A = pd.read_csv("df_cluster_A.csv")
# df_cluster_B = pd.read_csv("df_cluster_B.csv")
# df_cluster_C = pd.read_csv("df_cluster_C.csv")
# df_cluster_D = pd.read_csv("df_cluster_D.csv")

In [None]:
# Add the rural reference temperature to every cluster table, matching on
# timestamp (default inner join: rows without a rural reading are dropped).
df_cluster_A, df_cluster_B, df_cluster_C, df_cluster_D = [
    cluster_df.merge(df_parelheiros, on="timestamp")
    for cluster_df in (df_cluster_A, df_cluster_B, df_cluster_C, df_cluster_D)
]

In [None]:
def revert_lag(df, cols):
    """Shift the given columns one row up, undoing a one-row lag.

    The frame is modified in place and also returned. The last row of each
    shifted column becomes missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to modify; rows are shifted by position, not by timestamp.
    cols : list of str
        Columns to shift.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    moved_up = df[cols].shift(periods=-1)
    df[cols] = moved_up
    return df

In [None]:
def apply_revert_lags(df, cols):
    """Undo the one-row lag of `cols` separately for every station.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``station_name`` column and the columns in `cols`.
    cols : list of str
        Lagged columns to shift one row up within each station.

    Returns
    -------
    pandas.DataFrame
        New frame with the per-station results concatenated in order of first
        appearance of each station; `df` itself is left untouched. An input
        without any station yields an unshifted copy.

    Notes
    -----
    NOTE(review): the shift is positional, so it assumes each station's rows
    are in chronological order without gaps — confirm against the upstream
    filtering and merge steps.
    """
    dfs = []
    for i in df.station_name.unique():
        # Explicit copy: revert_lag assigns columns in place, and doing that on
        # a filtered selection triggers pandas' SettingWithCopyWarning.
        dfi = df[df.station_name == i].copy()
        dfs.append(revert_lag(dfi, cols))

    # pd.concat raises on an empty list; nothing to shift in that case.
    if not dfs:
        return df.copy()
    return pd.concat(dfs)

In [None]:
# Lagged columns whose one-row lag is undone for every cluster table.
cols = [
    "precipitation_lag_1H",
    "temperature_lag_1H",
    "relative_humidity_lag_1H",
    "pressure_lag_1H",
    "wind_velocity_x_lag_1H",
    "wind_velocity_y_lag_1H",
    "wind_blow_x_lag_1H",
    "wind_blow_y_lag_1H"
    ]


df_cluster_A, df_cluster_B, df_cluster_C, df_cluster_D = [
    apply_revert_lags(df=cluster_df, cols=cols)
    for cluster_df in (df_cluster_A, df_cluster_B, df_cluster_C, df_cluster_D)
]

In [None]:
def calculate_uhii(df):
    """Add the ``uhii`` column: station temperature minus rural temperature.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``temperature_lag_1H`` and ``temperature_rural`` columns.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with ``uhii`` added.
    """
    station_temperature = df['temperature_lag_1H']
    rural_temperature = df['temperature_rural']
    df['uhii'] = station_temperature - rural_temperature
    return df

In [None]:
# Compute the UHII column for every cluster table.
df_cluster_A, df_cluster_B, df_cluster_C, df_cluster_D = [
    calculate_uhii(df=cluster_df)
    for cluster_df in (df_cluster_A, df_cluster_B, df_cluster_C, df_cluster_D)
]

In [None]:
def periodo_dia(x):
    """Return 'Day' for hours 6 to 17 (inclusive) and 'Night' otherwise."""
    return 'Day' if 6 <= x < 18 else 'Night'


def seasons(x):
    """Map a month number to a season name, in blocks of three months.

    1-3 -> 'Summer', 4-6 -> 'Autumn', 7-9 -> 'Winter', 10-12 -> 'Spring'.
    Any other value returns None.
    """
    if x in [1, 2, 3]:
        return 'Summer'
    elif x in [4, 5, 6]:
        return 'Autumn'
    elif x in [7, 8, 9]:
        return 'Winter'
    elif x in [10, 11, 12]:
        return 'Spring'


def _add_time_features(df, cluster_label):
    """Keep the rows with uhii >= 0 and add hour/day_period/month/seasons/cluster."""
    # .copy(): the filtered selection gets new columns below; assigning to it
    # directly triggers pandas' SettingWithCopyWarning.
    out = df[df.uhii >= 0].copy()
    out['timestamp'] = pd.to_datetime(out['timestamp'])
    out['hour'] = out['timestamp'].dt.hour
    out['day_period'] = out['hour'].apply(periodo_dia)
    out['month'] = out['timestamp'].dt.month
    out['seasons'] = out.month.apply(seasons)
    out['cluster'] = cluster_label
    return out


df_cluster_A_2 = _add_time_features(df_cluster_A, 'A')
df_cluster_B_2 = _add_time_features(df_cluster_B, 'B')
df_cluster_C_2 = _add_time_features(df_cluster_C, 'C')
df_cluster_D_2 = _add_time_features(df_cluster_D, 'D')

# All clusters stacked in one table.
df_uhii = pd.concat([
  df_cluster_A_2,
  df_cluster_B_2,
  df_cluster_C_2,
  df_cluster_D_2
])

In [None]:
def group_data(df):
    """Mean and standard deviation of ``uhii`` per season and hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``seasons``, ``hour`` and ``uhii`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (seasons, hour) pair with two-level columns:
        ('seasons', ''), ('hour', ''), ('uhii', 'mean'), ('uhii', 'std').
    """
    hourly_stats = (
        df[['seasons', 'hour', 'uhii']]
        .groupby(['seasons', 'hour'])
        .agg(['mean', 'std'])
    )
    return hourly_stats.reset_index()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def uhii_clusters(df):
    """Plot hourly mean UHII and UHII density per season, one line per cluster.

    The figure has two rows of four panels (one column per season): the top row
    shows the hourly mean of ``uhii`` for clusters A-D, the bottom row the
    kernel density estimate of ``uhii``. It is saved to
    ``../figures/uhii_clusters-pt.png`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``cluster``, ``seasons`` (English names), ``hour`` and
        ``uhii`` columns. It is not modified.
    """
    mapa = {'Summer': 'Verão', 'Autumn': 'Outono', 'Winter': 'Inverno', 'Spring': 'Primavera'}

    # Translate on a copy: writing the Portuguese names into the caller's frame
    # would break the later cells that filter it with `seasons == 'Summer'`, and
    # a second call would map every (already translated) season to NaN.
    df = df.copy()
    df['seasons'] = df['seasons'].map(mapa)

    estacoes = ['Verão', 'Outono', 'Inverno', 'Primavera']

    # (cluster letter, marker, colour) — the same pairing used by the cluster
    # scatter, the map and the pair plots (C: magenta diamond, D: lime circle).
    cluster_styles = [
        ('A', '^', 'red'),
        ('B', 's', 'deepskyblue'),
        ('C', 'D', 'magenta'),
        ('D', 'o', 'lime'),
    ]

    fig, axs = plt.subplots(2, 4, figsize=(12,6), sharex='row', sharey='row')
    fig.subplots_adjust(bottom=0.2, hspace=1)

    # Hourly statistics per cluster, computed once.
    hourly_by_cluster = {
        cluster: group_data(df[df.cluster == cluster])
        for cluster, _, _ in cluster_styles
    }

    axs[0, 0].set_ylabel('IICU média horária (°C)', size=10)
    axs[1, 0].set_ylabel('Densidade', size=10)

    for idx, estacao in enumerate(estacoes):
        mean_ax = axs[0, idx]
        density_ax = axs[1, idx]

        for cluster, marker, color in cluster_styles:
            label = f'Cluster {cluster}'

            hourly = hourly_by_cluster[cluster]
            data_slice = hourly[hourly['seasons'] == estacao]
            mean_ax.plot(
                data_slice['hour'],
                data_slice[('uhii', 'mean')],
                label=label,
                marker=marker,
                color=color,
                markerfacecolor='none',
                markeredgecolor=color
            )

            sns.kdeplot(
                df[(df['seasons'] == estacao) & (df['cluster'] == cluster)]['uhii'],
                label=label,
                color=color,
                ax=density_ax
            )

        mean_ax.set_title(estacao)
        mean_ax.grid()
        mean_ax.set_xlabel(' ')
        mean_ax.legend(fontsize='small')

        density_ax.grid()
        density_ax.set_xlabel('')
        density_ax.legend(fontsize='small')

    # Centred x-axis labels, one per row of panels.
    fig.text(0.5, 0.49, 'Hora', ha='center')
    fig.text(0.5, 0.0, 'IICU (°C)', ha='center')

    plt.tight_layout()
    plt.savefig('../figures/uhii_clusters-pt.png', dpi=300, bbox_inches="tight")
    plt.show()


uhii_clusters(df_uhii)

In [None]:
# Hourly mean UHII per cluster and season.
uhi_hour = (
    df_uhii[[
        'cluster',
        'seasons',
        'hour',
        'uhii',
        'precipitation_lag_1H',
        'temperature_lag_1H',
        'relative_humidity_lag_1H',
    ]]
    .rename(columns={
        'precipitation_lag_1H': 'precipitation',
        'temperature_lag_1H': 'temperature',
        'relative_humidity_lag_1H': 'relative_humidity',
    })
)

uhi_hour_grouped = (
    uhi_hour
    .groupby(['cluster', 'seasons', 'hour'])['uhii']
    .mean()
    .reset_index()
)

In [None]:
# Hourly mean UHII of cluster A for the rows labelled 'Summer'.
uhi_hour_grouped[(uhi_hour_grouped.cluster == 'A') & (uhi_hour_grouped.seasons == 'Summer')].set_index('cluster')

In [None]:
# Hourly mean UHII of cluster B for the rows labelled 'Summer'.
uhi_hour_grouped[(uhi_hour_grouped.cluster == 'B') & (uhi_hour_grouped.seasons == 'Summer')].set_index('cluster')

In [None]:
# Hourly mean UHII of cluster C for the rows labelled 'Summer'.
uhi_hour_grouped[(uhi_hour_grouped.cluster == 'C') & (uhi_hour_grouped.seasons == 'Summer')].set_index('cluster')

In [None]:
# Hourly mean UHII of cluster D for the rows labelled 'Summer'.
uhi_hour_grouped[(uhi_hour_grouped.cluster == 'D') & (uhi_hour_grouped.seasons == 'Summer')].set_index('cluster')

In [None]:
# Per station and season: mean UHII and weather variables, joined with the
# station's land-use percentages, under presentation-ready column names.
seasonal_means = (
    df_uhii
    .rename(columns={
        "temperature_lag_1H": "temperature",
        "relative_humidity_lag_1H": "relative_humidity",
        "precipitation_lag_1H": "precipitation",
        "station_name": "station",
    })
    .groupby(['cluster', 'station', 'seasons'])[
        ['temperature', 'relative_humidity', 'precipitation', 'temperature_rural', 'uhii']
    ]
    .mean()
    .reset_index()
)

station_land_use = df_grouped[
    ['cluster', "station", "residencial comercial servicos", "espaco verde urbano"]
]

# Final column order paired with the display name of each column.
analysis_columns = {
    "cluster": "cluster",
    "station": "station",
    "seasons": "seasons",
    "uhii": "CUHII (°C)",
    "temperature": "Temperature (°C)",
    "relative_humidity": "Relative humidity (%)",
    "precipitation": "Precipitation (mm)",
    "residencial comercial servicos": "Residential\ncommercial\nservices (%)",
    "espaco verde urbano": "Urban\ngreen\nspace (%)",
}

df_uhii_analysis = (
    seasonal_means
    .merge(station_land_use, on=["cluster", "station"])
    [list(analysis_columns)]
    .rename(columns=analysis_columns)
)

In [None]:
# One table per season, without the station and season identifier columns.
summer, autumn, winter, spring = [
    df_uhii_analysis[df_uhii_analysis.seasons == season_name].drop(columns=["station", "seasons"])
    for season_name in ("Summer", "Autumn", "Winter", "Spring")
]

In [None]:
def pairplot(df, name):
    """Draw a pair-plot matrix of the numeric columns, coloured by cluster.

    Diagonal panels show one kernel density estimate per cluster; off-diagonal
    panels show a scatter per cluster and the correlation coefficient of the
    two variables over all rows. The figure is saved to
    ``../figures/corr_matrix_{name}.png`` and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``cluster`` column (letters A-D) and numeric columns.
    name : str
        Suffix of the output file name.
    """
    num_vars = df.select_dtypes(include='number').columns
    n_vars = len(num_vars)

    # (legend label, marker, colour, rows of that cluster), in plotting order.
    cluster_series = [
        (f'Cluster {letter}', marker, color, df[df.cluster == letter])
        for letter, marker, color in (
            ('A', '^', 'red'),
            ('B', 's', 'deepskyblue'),
            ('C', 'D', 'magenta'),
            ('D', 'o', 'lime'),
        )
    ]

    fig, ax = plt.subplots(n_vars, n_vars, figsize=(20,18))

    for i, row_var in enumerate(num_vars):
        for j, col_var in enumerate(num_vars):
            panel = ax[i, j]

            if i == j:
                # Diagonal: distribution of the variable within each cluster.
                for label, _, color, subset in cluster_series:
                    sns.kdeplot(subset[row_var], label=label, color=color, ax=panel)
            else:
                # Off-diagonal: column variable on x, row variable on y.
                for label, marker, color, subset in cluster_series:
                    panel.scatter(
                        subset[col_var],
                        subset[row_var],
                        marker=marker,
                        label=label,
                        facecolors='none',
                        edgecolors=color
                    )

                corr = np.corrcoef(df[col_var], df[row_var])[0, 1]
                panel.text(0.07, 0.9, f'corr={corr:.2f}', transform=panel.transAxes, size=15)

            panel.grid()

            # Variable names only on the top row (titles) and first column (y labels).
            if i == 0:
                panel.set_title(col_var)
            if j == 0:
                panel.set_ylabel(row_var)

    plt.tight_layout()
    plt.savefig(f'../figures/corr_matrix_{name}.png', dpi=300, bbox_inches="tight")
    plt.show()

for season_table, season_label in (
    (summer, 'summer'),
    (autumn, 'autumn'),
    (winter, 'winter'),
    (spring, 'spring'),
):
    pairplot(season_table, season_label)

In [None]:
# Pairwise correlation matrix of the summer table, cluster label excluded.
summer.drop(columns=['cluster']).corr()

In [None]:
# Pairwise correlation matrix of the autumn table, cluster label excluded.
autumn.drop(columns=['cluster']).corr()

In [None]:
# Pairwise correlation matrix of the winter table, cluster label excluded.
winter.drop(columns=['cluster']).corr()

In [None]:
# Pairwise correlation matrix of the spring table, cluster label excluded.
spring.drop(columns=['cluster']).corr()