<a href="https://colab.research.google.com/github/mateus-miguel/case-dadosfera/blob/main/ETL%202%20-%20Descoberta%20de%20Atributos%20(LLM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install boto3

In [8]:
import os
import boto3
import json
import random

# Access keys
# NOTE(review): no credentials are defined in this cell; presumably they were
# removed before publishing and are provided through the environment - confirm.

# Create an S3 client (expects the IAM user's access keys to be available)
s3 = boto3.client('s3')

# Bucket and object key of the products JSON file to fetch from S3
bucket = 'dadosfera-products-raw'
file_json = 'products_clean_1000rows.json'

# Fetch the JSON file that will be used to discover attributes with the LLM;
# the response `obj` is a dict whose "Body" entry is read below
obj = s3.get_object(
    Bucket=bucket,
    Key=file_json
)

data = json.loads(obj["Body"].read())

# Alternative without S3: open a local copy of the file in Colab
# with open(file_json, 'r', encoding='utf-8') as f:
#   data = json.load(f)

# 1. Random sample of k products out of the whole file (e.g. k = 100 out of 1000 product rows).
# k is capped at len(data) because random.sample raises ValueError when k
# is larger than the population.
sample_size = min(1000, len(data))
sample = random.sample(data, k=sample_size)

# Show one sampled product as a sanity check (skipped when the file is empty)
if sample:
  print(sample[0])

{'docid': 1657, 'title': "Men's 3/4 Comression Pants Running Tights Sport Basketball Active Leggings 2 Pack", 'text': "Product Description Men's Compression 3/4 Pants for Sports Pants Features The material is 84% Polyester& 16% Spandex 2 peices of pants in one package for you to own High compression fit for support and reduced muscle vibration when doing sports Quick-dry, breathable fabric wicks sweat away keeps you stay cool and comfort 4-Way-Stretch Compression Pants offer greater range of movement with elastic waistband provides snug fit Suitable for many active sports Basketball Football Running Cycling "}


In [9]:
# 2. Função que gera o prompt baseado num sample aleatório de tamanho k produtos do arquivo JSON
def build_prompt(products):
  """Build the attribute-discovery prompt for a collection of products.

  Args:
    products: iterable of dicts, each providing a 'title' and a 'text' key.

  Returns:
    The prompt string: fixed instructions followed by one numbered
    "Title / Description" entry per product, joined by newlines.

  Raises:
    KeyError: if a product lacks the 'title' or 'text' key.
  """
  # Numbering starts at 1 so the "first product" mentioned in the prompt is entry 1
  examples = [
      f"{i}. Title: {p['title']}\n"
      f"Description: {p['text']}"
      for i, p in enumerate(products, 1)
  ]

  # The return must sit outside the loop: returning from inside it would
  # put only the first product in the prompt and yield None for empty input.
  return f"""
      You are analyzing a small dataset of product listings.

      Your task:
      - Propose a compact set of reusable general product attributes.
      - Attributes must be applicable to MANY products (clothing, electronics, beauty, house, etc).
      - Always use as initial attributes: category (clothing, electronics etc), material (str), color (list), size (list).
      - Use enum (NOT list) and boolean.
      - Avoid rare, cosmetic, or overly specific details.
      - Do NOT include niche-specific attributes.
      - Do NOT extract per-product values.

      Only suggest attributes that:
      - Apply to at least 30–40% of products across multiple unrelated categories
      - Have HIGH frequency across the dataset.
      - Are NOT specific to cosmetics, electronics, clothing, skincare, beauty, or personal care
      - Can be reused across an entire e-commerce catalog

      For each suggested attribute, provide:
      - attribute_name (snake_case)
      - attribute_type (boolean | enum | string)
      - short description
      - estimated_frequency (high / medium / low)
      - Don't use `` for the values

      After giving all attributes, give all the product categories found and for the first product make a JSON format with the keys being the found attributes and the values selected from the product title/text.

      Dataset:
      {chr(10).join(examples)}
    """

In [None]:
from openai import OpenAI

# 3. Build the prompt from the sampled products
prompt = build_prompt(sample)

# 4. Create the OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# 5. Call the model with a single user message
request = {
    "model": 'gpt-4o-mini',
    "messages": [{"role": "user", "content": prompt}],
    "temperature": 0.2,
}
response = client.chat.completions.create(**request)

# 6. Read the text of the first returned choice
first_choice = response.choices[0]
suggestions = first_choice.message.content
print(suggestions)

In [None]:
# 7. Com base na resposta, definir um schema.json com atributos de alta/média frequência
import re

# Regex capturing, per attribute, the attribute_name, the attribute_type and
# the rest of the short_description line.
# NOTE(review): this only matches markdown-bold labels such as
# "**attribute_name**: value" - confirm that is the shape of the LLM output.
pattern = r"\*\*attribute_name\*\*:\s*(\w+)[\s\S]*?\*\*attribute_type\*\*:\s*(\w+)[\s\S]*?\*\*short_description\*\*:\s*([^\n\r]+)"

text = suggestions
matches = re.findall(pattern, text) # list of (name, type, description) tuples
print(matches)

# Strip trailing whitespace from each description, keeping name and type untouched
matches_clean = [
    (found_name, found_type, found_description.rstrip())
    for found_name, found_type, found_description in matches
]

# Map the type names used in the LLM output to Python type names
type_mapping = {
    "boolean": "bool",
    "string": "str",
    "enum": "list",
    "list": "list"
}

# Schema dictionary to be built, seeded with the docid/title/text fields
schema = {
    'attributes': [
        {
            'name': 'docid',
            'type': 'int',
            'description': 'document ID for product'
        },
        {
            'name': 'title',
            'type': 'str',
            'description': 'Name of product'
        },
        {
            'name': 'text',
            'type': 'str',
            'description': 'Description for product'
        }
    ]
}

# Append each attribute matched by the regex in the LLM output, with its type and description
for name, attr_type, description in matches_clean:
  schema['attributes'].append({
      'name': name,
      # Unknown types fall back to the *string* 'str'; the builtin `str` type
      # object is not JSON serializable and would make json.dumps(schema) fail.
      'type': type_mapping.get(attr_type.lower(), 'str'),
      'description': description
  })

# Store the schema in the S3 bucket as UTF-8 encoded JSON (object key 'schema_v2.json')
schema_json = json.dumps(schema, ensure_ascii=False)
s3.put_object(Bucket=bucket, Key='schema_v2.json', Body=schema_json.encode('utf-8'))

# Output do schema como JSON localmente no Colab
# with open('schema.json', 'w', encoding='utf-8') as f:
#   json.dump(schema, f, ensure_ascii=False, indent=2)