<a href="https://colab.research.google.com/github/mateus-miguel/case-dadosfera/blob/main/ETL%203%20-%20Enriquecimento%20Sem%C3%A2ntico%20(LLM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install boto3

In [3]:
import os
import json
import boto3

# Chaves de acesso AWS


# Geramos um cliente S3 após definir as chaves de acesso do user IAM
s3 = boto3.client('s3')

# Requisitamos o arquivo JSON de produtos/schema do bucket S3 desejado, onde type(obj) = dict
bucket = 'dadosfera-products-raw'
file_json = 'products_clean_1000rows.json'
schema_json = 'schema.json'

obj_product = s3.get_object(
    Bucket=bucket,
    Key=file_json
)

obj_schema = s3.get_object(
    Bucket=bucket,
    Key=schema_json
)

# Vasta maioria das chaves são irrelevantes, o que importa é 'Body', que deve ser um pointer, deve ser usado .read()
# 'Body': <botocore.response.StreamingBody object at 0x7dadab1bd1e0>

products = json.loads(obj_product["Body"].read())
schema = json.loads(obj_schema["Body"].read())
print(len(products))
print(schema['attributes'][0])

1000
{'name': 'docid', 'type': 'int', 'description': 'document ID for product'}


In [4]:
from tqdm import tqdm
from openai import OpenAI

# Chave de acesso API OpenAI

# Criar client OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

# Forma alternativa de abrir arquivos schema e products se estiverem presents no Google Colab, não puxados do S3
# with open('schema.json', 'r', encoding='utf-8') as f:
#   schema = json.load(f)

# with open("products_txt_clean.json", "r", encoding="utf-8") as f:
#   products = json.load(f)

# print(products)

In [5]:
def enrich_product(product, schema):
  # Essa função usa um prompt com o schema e o detalhe inicial do produto para obter uma resposta do modelo gpt-4o-mini,
  # extraindo do nome/descrição do produto as informações para os atributos do schema fixo, devolvendo o JSON dos novos atributos preenchidos.

  prompt = f"""
  SCHEMA:
  {json.dumps(schema, ensure_ascii=False, indent=2)}

  PRODUCT:
  docid: {product['docid']}
  title: {product['title']}
  description: {product['text']}

  Return a JSON object containing ONLY the attributes defined in the schema.
  """

  response = client.chat.completions.create(
      model="gpt-4o-mini",
      temperature=0, # no creativity needed, just extraction
      messages=[
          {
              'role': 'system',
              'content': (
                  "You are an information extraction system.\n"
                  "Use only the schema attributes.\n"
                  "Do not hallucinate.\n"
                  "Infer the main category from the product title/description"
                  "If an attribute cannot be inferred, return null (empty string, empty list). Don't use 'unknown', 'not specified' or ['none'] for a list"
                  "Return valid JSON only. No 'json' before, starting at {}"
              )
          },
          {'role': 'user', 'content': prompt}
      ]
  )

  # print(response.choices[0].message.content)

  return json.loads(response.choices[0].message.content)

In [None]:
# --- BLOCO DE TESTE ---

resultado = enrich_product(products[2], schema)

enriched_product = {
    **products[2], # docid, title, text
    **resultado # novos atributos
}

print(enriched_product)

with open('products_enriched.json', 'w', encoding='utf-8') as f:
  json.dump([enriched_product], f, ensure_ascii=False, indent=2)

{'docid': 4, 'title': 'YUEPIN U-Tube Clamp 304 Stainless Steel Hose Pipe Cable Strap Clips With Rubber Cushioned (1-21/32"(42mm)-10pcs)', 'text': 'Product Description Specification: Material: 304 Stainless Steel,100% New Rubber Color: Silver Shape: U Shape Quantity: 10 Pieces Note: Note: Since the size above is measured by hand, the size of the actual item you received could be slightly different from the size above. Product Description Specification: Material: 304 Stainless Steel,100% New Rubber Color: Silver Shape: U Shape Quantity: 10 Pieces Note: Note: Since the size above is measured by hand, the size of the actual item you received could be slightly different from the size above.', 'category': 'home goods', 'material': '304 Stainless Steel, Rubber', 'brand': 'YUEPIN', 'color': ['Silver'], 'size': [], 'rechargeable': False, 'waterproof': False, 'warranty': [], 'origin_country': '', 'features': ['U Shape', '10 Pieces']}


In [6]:
# --- ENRIQUECENDO OS PRODUTOS COM ATRIBUTOS DO SCHEMA ---

enriched_products = []

# Loop usando tqdm para ver o progresso total, com try/except para lidar com possíveis erros do LLM e tratamento de json
for product in tqdm(products):
  try:
    attributes = enrich_product(product, schema)

    enriched_product = {
        **product,    # antigos atributos
        **attributes  # novos atributos
    }

    enriched_products.append(enriched_product)

  except Exception as e:
    print(f'Erro no produto {product['docid']}: {e}')

# Salvando o arquivo enriquecido gerado no bucket S3
s3.put_object(
    Bucket=bucket,
    Key=file_json.replace('_clean_', '_enriched_'),
    Body=json.dumps(enriched_products, ensure_ascii=False).encode("utf-8")
)

# Criando novo arquivo JSON enriquecido com atributos extraídos pelo LLM
# with open('products_enriched_50.json', 'w', encoding='utf-8') as f:
#   json.dump(enriched_products, f, ensure_ascii=False, indent=2)

 24%|██▍       | 238/1000 [45:53<3:49:36, 18.08s/it]

Erro no produto 435: Expecting ',' delimiter: line 4 column 4402 (char 4633)


 38%|███▊      | 381/1000 [1:09:26<1:51:14, 10.78s/it]

Erro no produto 691: Expecting property name enclosed in double quotes: line 4 column 2371 (char 2515)


 73%|███████▎  | 734/1000 [2:10:42<1:08:23, 15.43s/it]

Erro no produto 1360: Expecting ',' delimiter: line 4 column 1035 (char 1085)


 78%|███████▊  | 776/1000 [2:22:58<6:10:52, 99.34s/it]

Erro no produto 1442: Unterminated string starting at: line 4 column 11 (char 178)


 90%|█████████ | 900/1000 [2:43:27<18:34, 11.15s/it]

Erro no produto 1674: Expecting ',' delimiter: line 4 column 1090 (char 1323)


 94%|█████████▎| 936/1000 [2:49:35<09:39,  9.05s/it]

Erro no produto 1729: Expecting ',' delimiter: line 4 column 1019 (char 1210)


100%|██████████| 1000/1000 [3:00:16<00:00, 10.82s/it]


{'ResponseMetadata': {'RequestId': 'WE05DFQRHA84880W',
  'HostId': 'F7ZosKZqiGT8lpCyKwvcLX96fipk0IqGyabmHIbJujQlFqQOXnlClfe5jzVE+Lh/GIM9zLc37+luicQlMPvtmFAYuVA5ba6V',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'F7ZosKZqiGT8lpCyKwvcLX96fipk0IqGyabmHIbJujQlFqQOXnlClfe5jzVE+Lh/GIM9zLc37+luicQlMPvtmFAYuVA5ba6V',
   'x-amz-request-id': 'WE05DFQRHA84880W',
   'date': 'Thu, 15 Jan 2026 22:05:31 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"27e055f196d7457f7e83785f4438f844"',
   'x-amz-checksum-crc32': '1IlfqA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"27e055f196d7457f7e83785f4438f844"',
 'ChecksumCRC32': '1IlfqA==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}