## Instalação e Preparação do Ambiente

In [1]:
!pip install duckdb
!pip install pandas
!pip install dotenv

Collecting duckdb
  Downloading duckdb-1.4.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (14 kB)
Downloading duckdb-1.4.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (20.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.5/20.5 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.4.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-

In [19]:
import os
import duckdb
import pandas as pd
import glob

from dotenv import load_dotenv

In [3]:
# Populate the process environment from a .env file before reading the settings.
load_dotenv()

# Postgres connection settings; each one is interpolated into the connection string below.
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Fail fast: os.getenv returns None for an unset variable, which would otherwise be
# formatted into the connection string as the literal text "None".
_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("DB_USER", db_user),
        ("DB_PASSWORD", db_password),
        ("DB_HOST", db_host),
        ("DB_PORT", db_port),
        ("DB_NAME", db_name),
    )
    if env_value is None
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

## Extração dos Dados

In [37]:
# Status name used to filter tickets in the extraction query.
OPEN_STATUS = 'Aberto'
# Number of top-ranked subcategories kept per (company, product) pair.
TOP_N_SUBCATEGORIES = 3
# Per-company ticket budget; the query divides it evenly across product/subcategory slots.
MAX_TICKETS_PER_COMPANY = 110


def _libpq_quote(value):
    """Return ``value`` as a single-quoted libpq connection-string value.

    libpq requires values containing spaces (or empty values) to be wrapped in
    single quotes, with embedded backslashes and single quotes escaped by a
    backslash. Quoting every value keeps passwords with special characters intact.
    """
    escaped = f"{value}".replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# libpq keyword/value connection string consumed by the DuckDB ATTACH statement.
conn_string = (
    f"dbname={_libpq_quote(db_name)} "
    f"user={_libpq_quote(db_user)} "
    f"password={_libpq_quote(db_password)} "
    f"host={_libpq_quote(db_host)} "
    f"port={_libpq_quote(db_port)}"
)

In [38]:
# Open an in-memory DuckDB database and attach the Postgres source as `postgres_db`.
con = duckdb.connect(database=':memory:')
try:
    con.execute("INSTALL postgres;")
    con.execute("LOAD postgres;")

    # The connection string is embedded as a SQL string literal, so double any
    # single quote it contains; this stays valid whatever the credentials hold.
    attach_literal = conn_string.replace("'", "''")
    con.execute(f"ATTACH '{attach_literal}' AS postgres_db (TYPE POSTGRES);")
except Exception:
    # Release the connection and surface the full traceback instead of calling
    # exit(), which would shut the notebook kernel down.
    con.close()
    raise
else:
    print("DuckDB -> Postgres connection established successfully")

DuckDB -> Postgres connection established successfully


In [39]:
# Extraction query, built in three steps:
#   SubcategoryRanks - ranks subcategories by open-ticket count within each
#                      (company, product) pair.
#   CompanyQuota     - per company, how many tickets each (product, subcategory)
#                      slot may contribute: MAX_TICKETS_PER_COMPANY divided by
#                      (distinct products * TOP_N_SUBCATEGORIES), rounded down.
#   RankedTickets    - open tickets of the top-ranked subcategories, numbered
#                      newest first within each slot.
# The final SELECT keeps, per slot, the newest tickets up to the company quota.
# Both ROW_NUMBER() orderings carry a secondary sort key so that ties are broken
# the same way on every run.
query = f"""
WITH SubcategoryRanks AS (
    SELECT
        co.name AS company_name,
        p.name AS product_name,
        sc.name AS subcategory_name,
        ROW_NUMBER() OVER(PARTITION BY co.name, p.name ORDER BY COUNT(t.ticket_id) DESC, sc.name) as rank_num
    FROM
        postgres_db.public.tickets AS t
    JOIN postgres_db.public.companies AS co ON t.company_id = co.company_id
    JOIN postgres_db.public.products AS p ON t.product_id = p.product_id
    JOIN postgres_db.public.subcategories AS sc ON t.subcategory_id = sc.subcategory_id
    JOIN postgres_db.public.statuses AS st ON t.current_status_id = st.status_id
    WHERE
        st.name = '{OPEN_STATUS}'
    GROUP BY
        company_name, product_name, subcategory_name
),

CompanyQuota AS (
    SELECT
        company_name,
        FLOOR({MAX_TICKETS_PER_COMPANY} / (COUNT(DISTINCT product_name) * {TOP_N_SUBCATEGORIES})) AS tickets_per_slot_quota
    FROM
        SubcategoryRanks
    WHERE
        rank_num <= {TOP_N_SUBCATEGORIES}
    GROUP BY
        company_name
),

RankedTickets AS (
    SELECT
        co.name AS company_name,
        p.name AS product_name,
        sc.name AS subcategory_name,
        t.title,
        t.description,
        st.name AS status_name,
        t.created_at,
        cq.tickets_per_slot_quota,
        ROW_NUMBER() OVER(PARTITION BY co.name, p.name, sc.name ORDER BY t.created_at DESC, t.ticket_id DESC) as ticket_rank
    FROM
        postgres_db.public.tickets AS t
    JOIN postgres_db.public.companies AS co ON t.company_id = co.company_id
    JOIN postgres_db.public.products AS p ON t.product_id = p.product_id
    JOIN postgres_db.public.subcategories AS sc ON t.subcategory_id = sc.subcategory_id
    JOIN postgres_db.public.statuses AS st ON t.current_status_id = st.status_id
    JOIN SubcategoryRanks sr ON co.name = sr.company_name AND p.name = sr.product_name AND sc.name = sr.subcategory_name
    JOIN CompanyQuota cq ON co.name = cq.company_name
    WHERE
        sr.rank_num <= {TOP_N_SUBCATEGORIES}
        AND st.name = '{OPEN_STATUS}'
)

SELECT
    company_name,
    product_name,
    subcategory_name,
    title,
    description,
    status_name
FROM
    RankedTickets
WHERE
    ticket_rank <= tickets_per_slot_quota;
"""

In [40]:
# Run the extraction query and write one CSV per company.
# Errors are allowed to propagate so a failed extraction stops the notebook
# instead of letting later cells run on stale files; the connection is always closed.
try:
    df_chamados = con.execute(query).fetchdf()
    print(f"Total of {len(df_chamados)} tickets found")

    if df_chamados.empty:
        print("No tickets found")
    else:
        output_dir = "../data/original_tickets"
        os.makedirs(output_dir, exist_ok=True)

        # A single groupby pass replaces re-filtering the whole frame per company.
        # sort=False keeps companies in order of first appearance; rows whose
        # company_name is null are skipped because groupby drops null keys.
        for comp, df_companhia in df_chamados.groupby('company_name', sort=False):
            # Lower-case the name and replace spaces and slashes to build the file name.
            safe_filename = str(comp).lower().replace(' ', '_').replace('/', '_') + ".csv"
            output_path = os.path.join(output_dir, safe_filename)

            df_companhia.to_csv(output_path, index=False, encoding='utf-8-sig')
finally:
    con.close()

Total of 1080 tickets found


## Verificação dos Dados

In [41]:
# Re-read the exported per-company CSVs and summarize ticket counts per
# (company, product, subcategory) as a sanity check of the extraction.
input_dir = "../data/original_tickets"

csv_pattern = os.path.join(input_dir, '*.csv')
# glob returns paths in arbitrary order; sort so every run reads them identically.
csv_files = sorted(glob.glob(csv_pattern))

# Raise with a clear message instead of calling exit(), which would shut the
# notebook kernel down without explaining why.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found matching {csv_pattern}")

all_dataframes = []

for file_path in csv_files:
    try:
        all_dataframes.append(pd.read_csv(file_path))
    except Exception as e:
        # Best effort: report the unreadable file and keep going with the rest.
        print(f"Error: {file_path}: {e}")

if not all_dataframes:
    raise RuntimeError(f"None of the CSV files in {input_dir} could be read")

master_df = pd.concat(all_dataframes, ignore_index=True)

required_columns = ['company_name', 'product_name', 'subcategory_name']
missing_columns = [col for col in required_columns if col not in master_df.columns]
if missing_columns:
    raise KeyError(f"Missing required columns: {missing_columns}")

summary = master_df.groupby(required_columns).size().reset_index(name='total_chamados')


print("\n--- Ticket Report by Customer, Product, and Subcategory ---")
print(summary.to_string())

output_report_path = '../data/report_tickets/grouped_report_v3.csv'
try:
    # The report directory is not created anywhere else, so create it before writing.
    os.makedirs(os.path.dirname(output_report_path), exist_ok=True)
    summary.to_csv(output_report_path, index=False, encoding='utf-8-sig')
except Exception as e:
    print(f"\nError: {e}")


--- Ticket Report by Customer, Product, and Subcategory ---
              company_name                                         product_name             subcategory_name  total_chamados
0    Barbosa Monteiro S.A.         Coremind (Sistema de gestão de conhecimento)         Cadastro de usuários               6
1    Barbosa Monteiro S.A.         Coremind (Sistema de gestão de conhecimento)                   Exportação               6
2    Barbosa Monteiro S.A.         Coremind (Sistema de gestão de conhecimento)                   Relatórios               6
3    Barbosa Monteiro S.A.           Guizo (Sistema de atendimento de clientes)         Cadastro de usuários               6
4    Barbosa Monteiro S.A.           Guizo (Sistema de atendimento de clientes)              Erro de sistema               6
5    Barbosa Monteiro S.A.           Guizo (Sistema de atendimento de clientes)  Funcionalidade indisponível               6
6    Barbosa Monteiro S.A.  PVP (Sistema de remuneração variável