In [55]:
import pandas as pd
import mlxtend as ml
import numpy as np
import sklearn as sk
import sqlite3
import re
import math
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

pd.set_option("display.max_columns", None)  # Mostrar todas as colunas
pd.set_option("display.max_rows", None)     # Mostrar todas as linhas
pd.set_option("display.width", None)        # Ajustar a largura do display para não truncar

In [None]:
# Código para pré-processamento e transformação inicial
"""
dropped_columns = [
    "ID_UC",
    "Categoria de Manejo",
    "Nome da UC",
    "Informações Gerais",
    "Outros atos legais",
    "Plano de Manejo",
    "Conselho Gestor",
    "Órgão Gestor",
    "Fonte da Área: (1 = SHP, 0 = Ato legal)",
    "Bioma Área (ha)",
    "Amazônia",
    "Caatinga",
    "Cerrado",
    "Mata Atlântica",
    "Pampa",
    "Pantanal",
    "Área Marinha",
    "Bioma declarado",
    "% Além da linha de costa",
    "Grupo",
    "PI",
    "US",
    "Recortes (ha)",
    "Lei da Mata Atlântica",
    "Mar Territorial",
    "Município Costeiro",
    "Município Costeiro + Área Marinha",
    "Amazônia Legal",
    "Programa/Projeto",
    "Sítios do Patrimônio Mundial",
    "Sítios Ramsar",
    "Mosaico",
    "Reserva da Biosfera",
    "Sobreposição com TI ou TQ",
    "Biomas Abrangidos",
    "Região",
    "Presente na versão anterior",
    "Diferença Área",
    "Razão Diferença Área",
    "Qualidade dos dados georreferenciados",
    "Data da última certificação dos dados pelo Órgão Gestor",
    "Categoria IUCN",
    "Esfera Administrativa",
    "Ano do ato legal mais recente",
    "Área Ato Legal de Criação"
]

UC_DATASET_DIR = "/content/drive/MyDrive/IA/ucs_filtrado_2.csv"
PIB_DATASET_DIR = "/content/drive/MyDrive/IA/PIB_por_ano.csv"
POPULACAO_DATASET_DIR = "/content/drive/MyDrive/IA/estimativa_populacao.csv"
SALARIOS_DATASET_DIR = "/content/drive/MyDrive/IA/massa_salarial_por_ano.csv"
EMPREGOS_DATASET_DIR = "/content/drive/MyDrive/IA/total_de_empregados_por_ano.csv"
TOTAL_EMPRESAS_DATASET_DIR = "/content/drive/MyDrive/IA/total_empresas.csv"
MUNICIPIOS_DATASET_DIR = "/content/drive/MyDrive/IA/municipios_ordenado_por_uc.csv"

def extract_year(texto):
    # Regex para encontrar o ano no formato yyyy
    # Algumas datas precisaram ser ajustadas manualmente
    match = re.search(r'\b(\d{4})\b', texto)
    if match:
        return match.group(1)
    return None

def normalize_area(value):
  try:
    value = value.split(',')[0]
    value = str(value).replace('.', '')
  except:
    return value
  return int(value)

# gerar tabela com a quantidade de ucs por municipio e área
df = pd.read_csv(UC_DATASET_DIR, sep=',')
conn = sqlite3.connect("database.db")
df.to_sql("uc_table", conn, if_exists="replace", index=False)
df = pd.read_sql_query("SELECT municipio, count(*), SUM(area_soma) FROM uc_table group by municipio  order by municipio " , conn)
conn.close()
df.to_csv('municipio_qtdUC_area.csv', index=False)

# gerar tabela de municipios ordenados por suas respectivas quantidades de UC
data = pd.read_csv(UC_DATASET_DIR, sep=',')
df = pd.DataFrame(data)
df = df.groupby('municipio').size().reset_index(name='uc_count')
df = df.sort_values(by='uc_count', ascending=False)
df.to_csv('municipios_ordenado_por_uc.csv', index=False)

#data = pd.read_csv(UC_DATASET_DIR, sep=';')
df = pd.DataFrame(data)
df = df[df["UF"] == "SC"]
df = df.drop(columns=["UF"])
for col in dropped_columns:
  df = df.drop(columns=[col])
df["Ato Legal de Criação"] = df["Ato Legal de Criação"].apply(extract_year)
df["Área soma biomas"] = df["Área soma biomas"].apply(normalize_area)
df["Área soma Biomas Continental"] = df["Área soma Biomas Continental"].apply(normalize_area)

"""

In [56]:
ASSETS_DIR = "./assets"

WAREHOUSE_FILE = f"{ASSETS_DIR}/data_warehouse_2.csv"

pd.set_option("display.max_columns", None)  # Mostrar todas as colunas
pd.set_option("display.width", None)        # Ajustar a largura do display para não truncar

In [57]:
main_df = pd.read_csv(WAREHOUSE_FILE, index_col=False)

In [58]:
# main_df[["municipios", "area_coberta"]]
total_area = main_df["area_coberta"].sum()
main_df["area_coberta %"] = main_df["area_coberta"] / total_area

year_employenment_growth = main_df["empregados_2020"]     - main_df["empregados_2010"]
year_population_growth   = main_df["populacao_2020"]      - main_df["populacao_2010"]
year_gdp_growth          = main_df["pib_municipio_2020"]  - main_df["pib_municipio_2010"]
year_salary_delta        = main_df["massa_salarial_2020"] - main_df["massa_salarial_2010"]

main_df["delta_empregados"]= year_employenment_growth
main_df["delta_populacao"] = year_population_growth
main_df["delta_pib"]       = year_gdp_growth
main_df["delta_massa_salarial"]   = year_salary_delta

# main_df.sort_values(["taxa_de_populacao (2010-2020)", "taxa_de_empregados (2010-2020)", "area_coberta"], ascending=False)

In [59]:
total = len(main_df)

main_df = main_df.sort_values(by='area_coberta %', ascending=False)

def classification_thresholds(col) -> dict:
    classes = (
        ('low', 0.20),
        ('moderate', 0.50),
        ('good', 0.75),
        ('high', 0.95),
    )

    out = {
        'none': 0,
    }

    for key, val in classes:
        out[key] = col.quantile(val)

    return out

def classify_by_quantile(df, columns: list[str]):
    for col in columns:
        new_col = f'{col}_class'
        df[new_col] = 'none'
        classification = classification_thresholds(df[col])

        for i, row in df.iterrows():
            for label, threshold in classification.items():
                if row[col] >= threshold:
                    df.at[i, new_col] = label
    return df


filtered_df = main_df.sort_values('area_coberta', ascending=False).copy()

filtered_df = classify_by_quantile(filtered_df, ['area_coberta', 'delta_populacao', 'delta_empregados', 'delta_pib', 'delta_massa_salarial'])

filtered_df

Unnamed: 0,municipios,quantidade_ucs,area_coberta,populacao_2010,populacao_2011,populacao_2012,populacao_2013,populacao_2014,populacao_2015,populacao_2016,populacao_2017,populacao_2018,populacao_2019,populacao_2020,massa_salarial_2010,massa_salarial_2011,massa_salarial_2012,massa_salarial_2013,massa_salarial_2014,massa_salarial_2015,massa_salarial_2016,massa_salarial_2017,massa_salarial_2018,massa_salarial_2019,massa_salarial_2020,pib_municipio_2010,pib_municipio_2011,pib_municipio_2012,pib_municipio_2013,pib_municipio_2014,pib_municipio_2015,pib_municipio_2016,pib_municipio_2017,pib_municipio_2018,pib_municipio_2019,pib_municipio_2020,empregados_2010,empregados_2011,empregados_2012,empregados_2013,empregados_2014,empregados_2015,empregados_2016,empregados_2017,empregados_2018,empregados_2019,empregados_2020,estabelecimentos,area_coberta %,delta_empregados,delta_populacao,delta_pib,delta_massa_salarial,area_coberta_class,delta_populacao_class,delta_empregados_class,delta_pib_class,delta_massa_salarial_class
88,FLORIANÓPOLIS,20,279607,427298,427298,433158,453285,461524,469690,477798,485838,492977,500973,508826,1.606721e+08,1.921229e+08,2.230646e+08,2.524728e+08,2.866640e+08,3.025580e+08,3.193570e+08,3.328831e+08,3.641840e+08,3.740900e+08,3.570488e+08,11276680000,12731618000,13946621000,14974993000,16915926000,17619984000,18660876000,19516694000,21059561000,21963928000,21312447000,127467,136037,141972,147047,153304,149780,146759,142896,147455,154647,147935,148,0.091004,20468,81528,10035767000,1.963767e+08,high,high,high,high,high
189,PAULO LOPES,6,244407,6751,6751,6808,7045,7124,7203,7282,7360,7418,7494,7569,1.375521e+06,1.523021e+06,1.616975e+06,1.979181e+06,2.275203e+06,2.328851e+06,2.430322e+06,2.566049e+06,2.532010e+06,2.628638e+06,2.685109e+06,130616200,140022600,158650100,175388300,196747700,166876600,173173000,181154500,190304200,200456000,227628000,1212,1225,1215,1367,1440,1403,1323,1317,1288,1265,1292,721,0.079547,80,818,97011800,1.309588e+06,high,moderate,low,low,low
181,PALHOÇA,4,244108,139990,139990,142558,150623,154244,157833,161395,164926,168259,171797,175272,2.744984e+07,3.396230e+07,4.032309e+07,4.648295e+07,5.446085e+07,5.869772e+07,6.263909e+07,6.739491e+07,7.626485e+07,7.863092e+07,8.277773e+07,2685689000,3133473000,3846978000,3990380000,4694889000,5104686000,4375512000,4834031000,5489676000,5985390000,6530836000,26622,28726,30927,32041,33701,32763,31762,32329,35785,37545,40187,1954,0.079450,13565,35282,3845147000,5.532789e+07,high,high,high,high,high
94,GAROPABA,2,160199,18520,18520,18890,20024,20545,21061,21573,22082,22568,23078,23579,3.040572e+06,3.609895e+06,4.542455e+06,5.489691e+06,6.180405e+06,6.713530e+06,7.419670e+06,8.191510e+06,8.875684e+06,9.274646e+06,9.343595e+06,231533900,261002600,303007000,346584200,405903100,439794700,478206100,534236700,584882600,617279000,684966000,3068,3261,3482,3685,3892,4021,4234,4526,4818,4952,5081,1532,0.052140,2013,5059,453432100,6.303022e+06,high,good,good,good,good
142,LAGUNA,1,154859,51869,51869,42750,43979,44316,44650,44982,45311,45500,45814,46122,4.143784e+06,4.835850e+06,6.409407e+06,8.714942e+06,9.298556e+06,7.614426e+06,7.991097e+06,8.301916e+06,8.208387e+06,7.287568e+06,7.378819e+06,474495200,526835800,686312200,668180300,748635900,757677900,784301600,841741800,900499100,891933000,962895000,4698,4989,5647,6453,6404,5319,5280,5249,5052,4556,4731,782,0.050402,33,-5747,488399800,3.235035e+06,high,none,low,good,moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,WITMARSUM,0,0,3627,3627,3653,3769,3805,3841,3876,3912,3932,3965,3998,6.774754e+05,7.555278e+05,9.672198e+05,1.212159e+06,1.532836e+06,1.600320e+06,1.790592e+06,1.888695e+06,2.069809e+06,2.206657e+06,2.156703e+06,66342250,67950760,72938520,84680440,95820720,95988600,108866000,120885500,129232600,131465000,143671000,705,733,818,873,1022,1005,988,1014,1040,1054,1063,297,0.000000,358,371,77328750,1.479227e+06,moderate,low,moderate,low,low
290,XANXERÊ,0,0,44643,44643,45140,46981,47679,48370,49057,49738,50309,50982,51642,1.153632e+07,1.318686e+07,1.527735e+07,1.630347e+07,1.931290e+07,2.096599e+07,2.332148e+07,2.390352e+07,2.473337e+07,2.551678e+07,2.637937e+07,1041491000,1242075000,1318268000,1319706000,1477791000,1458213000,1530245000,1604050000,1703710000,1896873000,2185610000,10389,10776,11218,11025,11636,11804,12045,11826,11703,12004,11884,769,0.000000,1495,6999,1144119000,1.484305e+07,moderate,good,good,good,good
291,XAVANTINA,0,0,4122,4122,4103,4124,4095,4067,4039,4012,3963,3933,3903,2.234044e+05,3.141459e+05,2.932830e+05,3.764854e+05,4.559842e+05,5.223479e+05,5.380001e+05,5.266712e+05,4.792234e+05,5.654537e+05,3.430386e+05,91944330,97463230,92486740,112955900,141825300,171769900,185723600,183022300,216185400,157893000,175583000,261,342,275,300,327,339,317,305,278,314,189,305,0.000000,-72,-219,83638670,1.196342e+05,moderate,none,none,low,none
292,XAXIM,0,0,25933,25933,26145,27039,27336,27630,27921,28210,28424,28706,28983,6.778227e+06,7.906581e+06,7.192216e+06,9.334058e+06,1.205812e+07,1.345110e+07,1.486332e+07,1.641997e+07,1.769857e+07,1.907906e+07,2.076566e+07,579348900,594177600,595003400,778413800,779071100,812945200,845678200,927591700,947824800,1038423000,1108167000,7314,7737,6623,7295,8327,8363,8531,8608,9129,9381,10109,916,0.000000,2795,3050,528818100,1.398744e+07,moderate,good,good,good,good


In [60]:
keep = [
    'municipios', 'quantidade_ucs', 'estabelecimentos',
    'area_coberta',       'delta_pib',       'delta_populacao',       'delta_massa_salarial',
    'area_coberta_class', 'delta_pib_class', 'delta_populacao_class', 'delta_massa_salarial_class',
]

out_df = pd.DataFrame()
for col in keep:
    out_df[col] = filtered_df[col].copy()

out_df.to_csv('data_warehouse_2_discretized.csv', index=False)

In [66]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

store = pd.read_csv('data_warehouse_2_discretized.csv')

dropped_columns = ['delta_populacao_class', 'delta_massa_salarial_class', 'municipios', 'quantidade_ucs', 'estabelecimentos', 'area_coberta', 'delta_pib', 'delta_populacao', 'delta_massa_salarial']
store = store.drop(columns=dropped_columns)

print(store.head())

store = store[store['area_coberta_class'] != 'none']

store_data = store.values.tolist()

te = TransactionEncoder()
te_ary = te.fit_transform(store_data)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)

association_rules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)

# Exibe os resultados
print("\nConjuntos frequentes:")
print(frequent_itemsets)
print("\nRegras de associação:")
print(association_rules_df)


  area_coberta_class delta_pib_class
0               high            high
1               high             low
2               high            high
3               high            good
4               high            good

Conjuntos frequentes:
    support         itemsets
0  0.343537           (good)
1  0.299320            (low)
2  0.806122       (moderate)
3  0.200680           (none)
4  0.248299  (moderate, low)

Regras de associação:
  antecedents consequents  antecedent support  consequent support   support  \
0       (low)  (moderate)             0.29932            0.806122  0.248299   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.829545  1.029056  0.007011    1.137415       0.040298  
