In [28]:
import json
import openai
import os
import tiktoken
from scipy import spatial  # for calculating vector similarities for search

In [None]:
# Take the API key from the environment so it is never written into the notebook.
# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [11]:
# Load the abstract and title lookups; later cells index both of them by pid.
with open('pid_abstract_final.json') as abstract_file:
    pid_abstrate = json.load(abstract_file)

with open('pid_title.json') as title_file:
    pid_title = json.load(title_file)

In [6]:
# Parse topic02.csv: every line is split on ',' into a pid and a label,
# and the label is stored as an int.
with open('topic02.csv') as label_file:
    label_rows = label_file.read().splitlines()

pid_content_relate_label = {
    row_pid: int(row_label)
    for row_pid, row_label in (row.split(',') for row in label_rows)
}

In [7]:
# Number of pids whose label is non-zero.
sum(1 for label in pid_content_relate_label.values() if label)

116

In [8]:
# pids with a non-zero label, in the order they appear in the label dict.
# Dict keys are already unique, so the former list(set(...)) round-trip added
# nothing except an ordering that depends on string hash randomisation and
# therefore changed from one kernel session to the next.
pids = [pid for pid, label in pid_content_relate_label.items() if label]

In [9]:
# Sanity check: number of selected pids.
len(pids)

116

In [13]:
# Smoke-test the embedding endpoint with a short throwaway input before
# sending the real abstracts.
response = openai.Embedding.create(engine="text-embedding-ada-002-test", input="This is test")

In [15]:
# Pull the embedding vector out of the first item of the response.
embeddings = response['data'][0]['embedding']

In [16]:
# Check the dimensionality of the returned vector.
len(embeddings) # embedding length: 1536

1536

In [23]:
# Count the tokens of the "Title / Abstract" text built for each pid, using
# the tokenizer tiktoken associates with text-embedding-ada-002.
tokenizer = tiktoken.encoding_for_model('text-embedding-ada-002')
pid_token_count = {
    pid: len(tokenizer.encode(f'Title: {pid_title[pid]}\nAbstract: {pid_abstrate[pid]}'))
    for pid in pids
}

print('total token count:', sum(pid_token_count.values()))
# total token count: 37965
# API price: $0.0001 / 1K tokens
# cost: $0.0038

total token count: 37965


In [24]:
# get embeddings
# Resume from the on-disk cache when it exists so that pids embedded by an
# earlier run are not sent to the API again.
if os.path.exists('abstrate_embeddings.json'):
    with open('abstrate_embeddings.json') as f:
        abstract_embeddings = json.load(f)
else:
    abstract_embeddings = {}

# Earlier versions of this cell cached the string "OpenAI API error" for pids
# whose request failed. Drop every entry that is not a list so those pids are
# retried below and never reach the similarity computation as fake vectors.
abstract_embeddings = {
    pid: embedding
    for pid, embedding in abstract_embeddings.items()
    if isinstance(embedding, list)
}

failed_pids = []  # pids whose request failed in this run; re-run the cell to retry them
for idx, pid in enumerate(pids):
    if pid in abstract_embeddings:
        continue
    text = f'Title: {pid_title[pid]}\nAbstract: {pid_abstrate[pid]}'
    try:
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-ada-002-test",
        )
        embeddings = response['data'][0]['embedding']
        assert len(embeddings) == 1536, f'unexpected embedding length: {len(embeddings)}'
    except Exception as e:
        # Best effort: report the failure and carry on, but do NOT cache a
        # placeholder -- a cached entry would make the pid look finished.
        print(idx, pid, e)
        failed_pids.append(pid)
    else:
        abstract_embeddings[pid] = embeddings
        print(idx, pid, 'success')

    # Checkpoint on every 10th pid. The parentheses matter: `idx + 1 % 10`
    # parses as `idx + (1 % 10)`, which is never 0, so the checkpoint was
    # never written.
    if (idx + 1) % 10 == 0:
        with open('abstrate_embeddings.json', 'w') as f:
            json.dump(abstract_embeddings, f)

with open('abstrate_embeddings.json', 'w') as f:
    json.dump(abstract_embeddings, f)

if failed_pids:
    print('failed pids:', failed_pids)

0 32643664 success
1 32935873 success
2 33710597 success
3 34951374 success
4 33920904 success
5 33803475 success
6 36749239 success
7 32937949 success
8 35138001 success
9 32751841 success
10 33852526 success
11 32859864 success
12 32609336 success
13 35807384 success
14 33178109 success
15 36186752 success
16 35772604 success
17 35072534 success
18 32458193 success
19 36180640 success
20 33363165 success
21 35699161 success
22 36031313 success
23 33929617 success
24 35034236 success
25 34408638 success
26 35770683 success
27 37261613 success
28 32610334 success
29 33268588 success
30 37131407 success
31 33106782 success
32 35257690 success
33 32839585 success
34 33521692 success
35 34577633 success
36 37649125 success
37 32530389 success
38 34901061 success
39 32574248 success
40 36208899 success
41 33177481 success
42 33304309 success
43 33052822 success
44 35121209 success
45 37575318 success
46 33250631 success
47 32422545 success
48 33254519 success
49 33602080 success
50 3265548

In [34]:
# Embed the question with the same embedding engine used for the abstracts.
quary = "What neurological sequelae (manifestations) after COVID-19 (coronavirus) infection have been reported?"
quary_response = openai.Embedding.create(engine="text-embedding-ada-002-test", input=quary)
quary_embedding = quary_response['data'][0]['embedding']

In [35]:
def relatedness_fn(x, y):
    """Return the cosine similarity of vectors x and y (1 - cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


# Score every cached abstract embedding against the query embedding.
pid_ralatedness = {}
for pid, embedding in abstract_embeddings.items():
    # A cache entry can be a non-vector placeholder (failed requests were
    # recorded as the string "OpenAI API error"); skip anything that is not
    # a list instead of passing it to the distance function.
    if not isinstance(embedding, list):
        continue
    pid_ralatedness[pid] = relatedness_fn(quary_embedding, embedding)

In [38]:
# Rank the (pid, relatedness) pairs from most to least related to the query.
pid_ralatedness_ = sorted(pid_ralatedness.items(), key=lambda pair: pair[1], reverse=True)

# print the 30 most related papers: pid, score, title and abstract
separator = '\n===============================\n'
for pid, relatedness in pid_ralatedness_[:30]:
    print(pid, relatedness, pid_title[pid], pid_abstrate[pid], sep=separator)

35633158
0.9116670440087294
Neurological Sequelae of COVID-19.
BACKGROUND: Though primarily a pulmonary disease, Coronavirus disease 2019 (COVID-19) caused by the SARS-CoV-2 virus can generate devastating disease states that affect multiple organ systems including the central nervous system (CNS). The various neurological disorders associated with COVID-19 range in severity from mild symptoms such as headache, or myalgias to more severe symptoms such as stroke, psychosis, and anosmia. While some of the COVID-19 associated neurological complications are mild and reversible, a significant number of patients suffer from stroke. Studies have shown that COVID-19 infection triggers a wave of inflammatory cytokines that induce endothelial cell dysfunction and generate coagulopathy that increases the risk of stroke or thromboses. Inflammation of the endothelium following infection may also destabilize atherosclerotic plaque and induce thrombotic stroke. Although uncommon, there have also been 

In [39]:
# Keep only the pids of the 30 highest-ranked papers.
top30_pids = [ranked_pid for ranked_pid, _ in pid_ralatedness_[:30]]

# token count for top 30
sum(pid_token_count[ranked_pid] for ranked_pid in top30_pids)

10189

In [40]:
# Concatenate the top-30 papers into one context string, each entry followed
# by a '---' separator line.
related_abstracts = ''.join(
    f'Title: {pid_title[pid]}\nAbstract: {pid_abstrate[pid]}\n---\n'
    for pid in top30_pids
)

In [43]:
# count token of related abstracts
# Token count of the concatenated context string.
len(tokenizer.encode(related_abstracts))

10219

In [44]:
# Build the user prompt: instructions, then the concatenated abstracts wrapped
# in triple quotes, then the question itself.
prompt = f"""Use the below abstracts of research papers from Long covid database to answer the subsequent question. If the answer cannot be found, write "I don't know."

Abstracts of research papers:
\"\"\"
{related_abstracts}
\"\"\"

Question: {quary}"""

In [45]:
# Token count of the full prompt that is sent to the chat model.
len(tokenizer.encode(prompt))

10285

In [47]:
# Ask the chat model to answer the question from the supplied abstracts.
# temperature=0 is used for this request.
response = openai.ChatCompletion.create(
    model='gpt-3.5-turbo-16k',
    temperature=0,
    messages=[
        dict(role='system', content='You answer questions about COVID-19 base on the abstrates of research papers.'),
        dict(role='user', content=prompt),
    ],
)

In [49]:
# Show the message text of the first returned choice.
print(response['choices'][0]['message']['content'])

The neurological sequelae (manifestations) reported after COVID-19 infection include:

1. Headache
2. Anosmia (loss of smell)
3. Encephalopathy
4. Encephalitis
5. Psychosis
6. Stroke
7. Brain fog
8. Depression
9. Anxiety
10. Guillain-Barre syndrome
11. Sleep disorders
12. Cognitive dysfunction
13. Dysautonomia
14. Peripheral neuropathies
15. Seizures
16. Myalgia (muscle pain)
17. Meningitis
18. Acute necrotizing encephalopathy
19. Cerebrovascular diseases (ischemic and hemorrhagic stroke)
20. Autoimmune neuropathies

It is important to note that the incidence and severity of these neurological complications can vary among individuals, and further research is needed to fully understand the mechanisms and long-term effects of these manifestations.
