In [17]:
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for search

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPEN_AI_KEY)

In [18]:
def get_query_embeddings(query):
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    return query_embedding

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

In [19]:
files = ['aetna.txt',
         'anthem_blue_cross.txt',
         'blue_shield.txt',
         'bronze_option.txt',
         'cchp.txt',
         'deadlines.txt',
         'family_options.txt',
         'gold_option.txt',
         'government_discounts.txt',
         'health_net.txt',
         'hmo_vs_ppo.txt',
         'iehp.txt',
         'income_limits.txt',
         'individual_options.txt',
         'irs_1095_a_form.txt',
         'kaiser.txt',
         'la_care_health_plan.txt',
         'medi_cal_options.txt',
         'minimum_option.txt',
         'molina_health',
         'newborn_options.txt',
         'open_enrollment.txt',
         'platinum_option.txt',
         'preventative_care.txt',
         'qualifying_life_events.txt',
         'reporting_changes.txt',
         'self_employed_options.txt',
         'senior_options.txt',
         'sharp.txt',
         'should_switch_to_hmo.txt',
         'silver_70_option.txt',
         'silver_73_option.txt',
         'silver_87_option.txt',
         'silver_94_option.txt',
         'silver_option.txt',
         'small_business_options.txt',
         'special_enrollment.txt',
         'supplemental_options.txt',
         'travel_options.txt',
         'valley_health_plan.txt',
         'western_health_plan.txt'
         ]

In [20]:
file_contents = []
for file in files:
    with open(f"../../datasets/covered_california_2024/{file}", 'r') as file:
        content = file.read()
        file_contents.append(content)
file_contents

['Aetna Insurance\nAetna is a health insurance provider owned by CVS Health. It’s one of the oldest and largest health insurance companies in the country, with a history dating back to the 1800s. Aetna got its start selling life insurance. Today, it also offers various medical and dental plans nationwide.\n\nAetna wasn’t always available in California. In 2018, the California Department of Health Care Services (DHCS) approved Aetna joining Medi-Cal with its Better Health of California plan, offered in San Diego and Sacramento counties. Then, in 2023, Aetna CVS Health joined Covered California to offer individual and family health insurance.\n\nWhat Is Aetna Known For?\nPeople choose Aetna for its vast network of providers, competitive rates and long history in the insurance industry. It also offers benefits like access to 24/7 virtual care at no or low costs, depending on your plan, and mental health coverage. Aetna is regarded as a good health insurance provider overall.\n\nHow Do You

In [21]:
tokens = 0
for content in file_contents:
    tokens += num_tokens(content, EMBEDDING_MODEL)
tokens

37348

In [22]:
embeddings = []
for content in file_contents:
    # embeddings.append(get_query_embeddings(content))
    continue

In [23]:
data = {
    'text': file_contents,
    'embeddings': embeddings
}
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)

ValueError: All arrays must be of the same length

In [24]:
len(embeddings)

0