In [2]:
import numpy as np
import pandas as pd
from openai import AzureOpenAI
import json
from dotenv import load_dotenv
import os

In [3]:
# Load variables from a local .env file (if one is found) into the process
# environment before reading them. Without this call the values below are
# only available when they were exported in the shell that started the kernel.
# By default load_dotenv() does not override variables that are already set.
load_dotenv()

# Azure OpenAI connection settings, read from the environment
azure_openai_api_version = os.getenv('AZURE_OPENAI_API_VERSION')
azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
azure_openai_key = os.getenv('AZURE_OPENAI_KEY')

# Client used by the later cells to request embeddings
client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key
)

In [5]:
# Load the JSON codebook file
file_path = 'json_ech.json'
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Placeholder stored whenever a variable has no usable question text
NO_QUESTION_TEXT = 'No question text'

# NOTE(review): depending on how the codebook was exported to JSON, a file with
# a single variable may hold one object here instead of a list — TODO confirm.
# Normalise to a list so the loop below never iterates over dict keys.
codebook_vars = data['codeBook']['dataDscr']['var']
if isinstance(codebook_vars, dict):
    codebook_vars = [codebook_vars]

# Extract the information of each variable
variables_info = []
for variable in codebook_vars:
    # ID of each question
    ID = variable.get("_ID")

    # Name of the variable
    var_name = variable.get('_name', 'N/A')

    # 'qstn' may be missing, null or not a mapping; treat all of those as
    # "no question" instead of failing on .get()
    qstn = variable.get('qstn')
    if not isinstance(qstn, dict):
        qstn = {}

    # The question literal can be a plain string or a dict wrapping the text
    var_question = qstn.get('qstnLit', NO_QUESTION_TEXT)
    if isinstance(var_question, dict):  # If it is a dict, try to extract the text
        var_question = var_question.get('__cdata', NO_QUESTION_TEXT)

    # The text is later sent to the embeddings endpoint, which does not accept
    # empty strings; fall back to the placeholder for empty or non-string values
    if not isinstance(var_question, str) or not var_question.strip():
        var_question = NO_QUESTION_TEXT

    # Store the information of the variable
    var_info = {
        'ID': ID,
        'Variable Name': var_name,
        'Question': var_question
    }
    variables_info.append(var_info)

# Collect the question text of every variable
Question = [item['Question'] for item in variables_info]

# The embeddings endpoint limits how many inputs a single request may carry
# (2048 per the service documentation at the time of writing — verify against
# the current limits), so the questions are sent in batches.
EMBEDDING_BATCH_SIZE = 2048

# Generate embeddings for the questions/variables, one batch at a time
content_embeddings = []
for start in range(0, len(Question), EMBEDDING_BATCH_SIZE):
    content_response = client.embeddings.create(
        input=Question[start:start + EMBEDDING_BATCH_SIZE],
        model="text-embedding-3-small",
        dimensions=1536
    )
    # Each returned item carries the index of the input it belongs to; sort on
    # it so vectors line up with their questions regardless of response order
    batch_items = sorted(content_response.data, key=lambda entry: entry.index)
    content_embeddings.extend(entry.embedding for entry in batch_items)

# Fail loudly rather than silently attaching vectors to the wrong variables
if len(content_embeddings) != len(variables_info):
    raise RuntimeError(
        f"Expected {len(variables_info)} embeddings, got {len(content_embeddings)}"
    )

# Attach each embedding to its variable
for item, vector in zip(variables_info, content_embeddings):
    item['Question_vector'] = vector

# Persist the variables together with their embeddings
with open('variables_ech_emb.json', 'w', encoding='utf-8') as outfile:
    json.dump(variables_info, outfile, ensure_ascii=False, indent=4)