In [1]:
import json
import openai
import os
import tiktoken
from scipy import spatial  # for calculating vector similarities for search

In [None]:
openai.api_key = os.environ["OPENAI_API_KEY"]

In [36]:
def get_embedding(text):
    try:
        response = openai.Embedding.create(
                input=text,
                engine="text-embedding-ada-002-test",
            )
        embedding = response['data'][0]['embedding']
        assert len(embedding) == 1536
        return embedding
    except Exception as e:
        embedding = None
        return embedding
    
def get_chat_completion(prompt, model='gpt-3.5-turbo-16k', system_prompt='You answer questions about COVID-19 base on the abstrates of research papers.'):
    try:
        response = openai.ChatCompletion.create(
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': prompt},
            ],
            model=model,
            temperature=0,
        )

        return response['choices'][0]['message']['content']
    
    except Exception as e:
        return None


In [8]:
# load abstracts and their embeddings
with open('abstrate_embeddings.json') as f:
    abstrates_embeddings = json.load(f)

with open('pid_abstract_final.json') as f:
    pid_abstrate = json.load(f)

with open('pid_title.json') as f:
    pid_title = json.load(f)

pid_content_relate_label = {}
with open('topic02.csv') as f:
    lines = f.read().splitlines()

for line in lines:
    pid, label = line.split(',')
    pid_content_relate_label[pid] = int(label)

In [9]:
# function to calculate similarity between two embeddings
relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y)

In [21]:
# function to get the top related abstracts
def get_top_related_abstracts(query_embedding, top_n=20):
    related_abstracts = []
    for pid, embedding in abstrates_embeddings.items():
        relatedness = relatedness_fn(query_embedding, embedding)
        related_abstracts.append((pid, relatedness))

    related_abstracts.sort(key=lambda x: x[1], reverse=True)
    return related_abstracts[:top_n]

# function to make related abstracts into a text
def make_related_abstracts_text(pids):
    related_abstracts = []
    for pid in pids:
        related_abstracts.append(f'Title: {pid_title[pid]}\nAbstract: {pid_abstrate[pid]}\n---\n')
    related_abstracts = ''.join(related_abstracts)
    return related_abstracts


# function to generate the prompt for the GPT model
def generate_prompt(related_abstracts, query, format_prompt=None):
    prompt = f"""Use the below abstracts of research papers from Long covid database to answer the subsequent question. If the answer cannot be found, write "I don't know."
Abstracts of research papers:
\"\"\"
{related_abstracts}
\"\"\"

Question: {query}"""
    if format_prompt is not None:
        prompt += f"\n\nFormat: {format_prompt}"
    return prompt

# function to count prompt tokens
def count_prompt_tokens(prompt, model='gpt-3.5-turbo-16k'):
    tokenizer = tiktoken.encoding_for_model(model)
    tokens = tokenizer.encode(prompt)
    return len(tokens)

### Query1: What neurological sequelae (manifestations) of COVID-19 (coronavirus) infection have been reported in current research?

In [16]:
query1 = "What neurological sequelae (manifestations) of COVID-19 (coronavirus) infection have been reported in current research?"
query_embedding = get_embedding(query1)
top20_related = get_top_related_abstracts(query_embedding, top_n=20)
top20_related_pids = [pid for pid, _ in top20_related]
related_abstracts = make_related_abstracts_text(pids=top20_related_pids)
with open('related_abstracts_query1.txt', 'w') as f:
    f.write(related_abstracts)
prompt = generate_prompt(related_abstracts, query1)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6657


In [17]:
print(get_chat_completion(prompt))

The neurological sequelae (manifestations) of COVID-19 infection that have been reported in current research include headache, dizziness, depression, anosmia (loss of smell), encephalitis (inflammation of the brain), stroke, epileptic seizures, Guillain-Barre syndrome, altered mental status, impaired consciousness, cranial nerve manifestations, cognitive dysfunction, dysautonomia, myalgia (muscle pain), and peripheral neuropathies.


In [22]:
# try format prompt
prompt = generate_prompt(related_abstracts, query1, format_prompt='\nPlease answer in json format.')
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6666


In [29]:
result = get_chat_completion(prompt)
print(result)

{
  "neurological_sequelae": [
    "headache",
    "dizziness",
    "depression",
    "anosmia",
    "encephalitis",
    "stroke",
    "epileptic seizures",
    "Guillain-Barre syndrome",
    "impaired consciousness",
    "cranial nerve manifestations",
    "autoimmune disorders",
    "cognitive dysfunction",
    "dysautonomia",
    "myalgia",
    "hyposmia",
    "dysgeusia",
    "muscle pain",
    "acute cerebrovascular disease",
    "seizures",
    "encephalopathy",
    "delirium",
    "cerebrovascular accidents",
    "acute transverse myelitis",
    "acute encephalitis",
    "posterior reversible encephalopathy syndrome (PRES)",
    "encephalopathy",
    "meningoencephalitis",
    "psychosis",
    "brain fog",
    "Guillain-Barre syndrome",
    "depression",
    "anxiety",
    "ischemic stroke",
    "hemorrhagic stroke",
    "cognitive dysfunction",
    "dysautonomia",
    "anosmia",
    "ageusia",
    "encephalitis",
    "Guillain-Barre syndrome",
    "headache",
    "peripheral ne

In [30]:
result = json.loads(result)

In [32]:
result['neurological_sequelae']

['headache',
 'dizziness',
 'depression',
 'anosmia',
 'encephalitis',
 'stroke',
 'epileptic seizures',
 'Guillain-Barre syndrome',
 'impaired consciousness',
 'cranial nerve manifestations',
 'autoimmune disorders',
 'cognitive dysfunction',
 'dysautonomia',
 'myalgia',
 'hyposmia',
 'dysgeusia',
 'muscle pain',
 'acute cerebrovascular disease',
 'seizures',
 'encephalopathy',
 'delirium',
 'cerebrovascular accidents',
 'acute transverse myelitis',
 'acute encephalitis',
 'posterior reversible encephalopathy syndrome (PRES)',
 'encephalopathy',
 'meningoencephalitis',
 'psychosis',
 'brain fog',
 'Guillain-Barre syndrome',
 'depression',
 'anxiety',
 'ischemic stroke',
 'hemorrhagic stroke',
 'cognitive dysfunction',
 'dysautonomia',
 'anosmia',
 'ageusia',
 'encephalitis',
 'Guillain-Barre syndrome',
 'headache',
 'peripheral neuropathies',
 'seizures',
 'acute cerebrovascular disease',
 'encephalopathy',
 'hyposmia',
 'anosmia',
 'ageusia',
 'muscle pain',
 'Guillain-Barre syndrome

In [33]:
with open('result_query1.json', 'w') as f:
    json.dump(result, f, indent=4)

In [34]:
len(result['neurological_sequelae']), len(set(result['neurological_sequelae']))

(95, 53)

In [37]:
sequelae = list(set(result['neurological_sequelae']))
sequelae

['autoimmune peripheral neuropathies',
 'nonspecific headache',
 'anxiety',
 'acute disseminated encephalomyelitis',
 'headache',
 'autoimmune disorders',
 'meningoencephalitis',
 'acute transverse myelitis',
 'seizures',
 'cerebrovascular accidents',
 'hypoxic and metabolic abnormalities',
 'ischemic stroke',
 'dizziness',
 'encephalitis',
 'altered consciousness',
 'personal protection equipment-related headache',
 'neuropathy',
 'encephalopathy',
 'coagulopathies',
 'ageusia',
 'acute encephalitis',
 'agitation',
 'Guillain-Barre syndrome',
 'posterior reversible encephalopathy syndrome (PRES)',
 'SARS-CoV-2 virus encephalitis',
 'dysgeusia',
 'coma',
 'cytokine storm',
 'depression',
 'psychosis',
 'epileptic seizures',
 'altered sensorium',
 'fatigue',
 'myalgia',
 'acute cerebrovascular disease',
 'hyposmia',
 'stroke',
 'cognitive dysfunction',
 'weakness',
 'sleep disturbances',
 'acute necrotizing encephalopathy',
 'ischemic or hemorrhagic stroke',
 'hemorrhagic stroke',
 'dys

In [44]:
prompt = f'''Here are the neurological sequelae of COVID-19 infection reported in current research. Please classify them into the categories based on your knowledge.
Sequelae list:
{sequelae}

Format: please use json format to answer the question. For example, {'{"categories1": ["sequelae1", "sequelae2", ...], "categories2": ["sequelae3", ...], ...}'}'''

token_count = count_prompt_tokens(prompt)
print('token count:', token_count)


token count: 404


In [45]:
result = get_chat_completion(prompt=prompt, model='gpt-4', system_prompt='You are an expert in medical field and help users do a variety of tasks based on your expertise.')

In [48]:
result = json.loads(result)

In [50]:
result

{'Neurological Disorders': ['autoimmune peripheral neuropathies',
  'acute disseminated encephalomyelitis',
  'meningoencephalitis',
  'acute transverse myelitis',
  'seizures',
  'cerebrovascular accidents',
  'ischemic stroke',
  'encephalitis',
  'encephalopathy',
  'acute encephalitis',
  'Guillain-Barre syndrome',
  'posterior reversible encephalopathy syndrome (PRES)',
  'SARS-CoV-2 virus encephalitis',
  'acute necrotizing encephalopathy',
  'ischemic or hemorrhagic stroke',
  'hemorrhagic stroke',
  'dysautonomia',
  'peripheral neuropathies',
  'demyelination',
  'cranial nerve manifestations',
  'anosmia',
  'impaired consciousness'],
 'Psychiatric Disorders': ['anxiety',
  'depression',
  'psychosis',
  'agitation',
  'delirium'],
 'Symptoms and Signs': ['nonspecific headache',
  'headache',
  'dizziness',
  'altered consciousness',
  'personal protection equipment-related headache',
  'neuropathy',
  'ageusia',
  'dysgeusia',
  'coma',
  'altered sensorium',
  'fatigue',
  

In [49]:
with open('sequelae_categories.json', 'w') as f:
    json.dump(result, f, indent=4)

### Query2: What are the risk factors for neurological sequelae (manifestations) of COVID-19 (coronavirus) infection?

In [52]:
query2 = "What are the risk factors for neurological sequelae (manifestations) of COVID-19 (coronavirus) infection?"

In [53]:
query_embedding = get_embedding(query2)
top20_related = get_top_related_abstracts(query_embedding, top_n=20)
top20_related_pids = [pid for pid, _ in top20_related]
related_abstracts = make_related_abstracts_text(pids=top20_related_pids)
with open('related_abstracts_query2.txt', 'w') as f:
    f.write(related_abstracts)
prompt = generate_prompt(related_abstracts, query2)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6884


In [56]:
result = get_chat_completion(prompt)

In [57]:
result

'The abstracts do not specifically mention the risk factors for neurological sequelae of COVID-19 infection.'

In [74]:
query2_2 = "What impact factors (such as age, gender, ...) have been found for neurological sequelae (manifestations) of COVID-19 (coronavirus) infection?"

In [76]:
query_ = query2_2
query_num = "2_2"
query_embedding = get_embedding(query_)
top20_related = get_top_related_abstracts(query_embedding, top_n=20)
top20_related_pids = [pid for pid, _ in top20_related]
related_abstracts = make_related_abstracts_text(pids=top20_related_pids)
with open(f'related_abstracts_query{query_num}.txt', 'w') as f:
    f.write(related_abstracts)
prompt = generate_prompt(related_abstracts, query_)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6898


In [77]:
result = get_chat_completion(prompt)

In [78]:
print(result)

Based on the abstracts of the research papers, the following impact factors have been found for neurological sequelae (manifestations) of COVID-19 infection:

1. Age: Older individuals are more susceptible to developing life-threatening COVID-19 and cerebrovascular disease, such as stroke. There is also a mild but inverse correlation with age for central nervous system (CNS) inflammatory diseases, such as encephalitis, as well as taste and/or smell disorders.

2. Chronic neurological conditions: Patients with pre-existing neurological conditions may be at an elevated risk for COVID-19-associated neurological symptoms.

3. Severity of COVID-19: Neurological symptoms are more common in patients with severe COVID-19 infection.

4. Comorbidities: Increased age is associated with comorbid cardiovascular risk factors, including hypertension, diabetes mellitus, and lipid disorders. Obesity is correlated with the development of critical COVID-19.

5. Pregnancy: COVID-19 infection during pregna

In [111]:
query2_2_2 = "What impact factors (such as age, gender, ...) have been found for neurological sequelae (manifestations) of COVID-19 (coronavirus) infection?"
query_ = query2_2_2
query_num = "2_2_2"
query_embedding = get_embedding(query_)
top20_related = get_top_related_abstracts(query_embedding, top_n=20)
top20_related_pids = [pid for pid, _ in top20_related]
related_abstracts = make_related_abstracts_text(pids=top20_related_pids)
with open(f'related_abstracts_query{query_num}.txt', 'w') as f:
    f.write(related_abstracts)
prompt = generate_prompt(related_abstracts, query_)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6898


In [112]:
result = get_chat_completion(prompt, model='gpt-4')

In [113]:
print(result)

Based on the abstracts, the following factors have been found to impact the neurological sequelae of COVID-19 infection:

1. Age: Older individuals are more susceptible to developing life-threatening COVID-19 and cerebrovascular disease, such as stroke. A mild but inverse correlation with age was seen with CNS inflammatory diseases, such as encephalitis, as well as taste and/or smell disorders.

2. Chronic Underlying Disease: Children with a severe chronic underlying disease have a higher incidence of neurological complications.

3. Infants and Toddlers: Infants and toddlers have a higher incidence of neurological complications.

4. Multisystem Inflammatory Syndrome (MIS-C): Patients who develop MIS-C have a higher incidence of neurological complications.

5. Pregnancy: COVID-19 infection during pregnancy has been found associated with an increased risk of obstetric complications that can lead to neurological acute and long-term manifestations in neonates.

6. Comorbidities: Increased 

In [114]:
with open('result_query2_2_2.txt', 'w') as f:
    f.write(result)

In [97]:
format_prompt = '''
Please use the following json format to answer the question
{"impact_factors1": {"description": "A description of the how impact_factor1 impact the neurological sequelae of COVID-19 infection", "titles": [ "title of papers that mention impact_factor1", ...]}, "impact_factors2": {...}, ...}

For example: {"Age": {"description": "Older individuals are more susceptible to developing life-threatening COVID-19 and cerebrovascular disease, such as stroke. A mild but inverse correlation with age was seen with CNS inflammatory diseases, such as encephalitis, as well as taste and/or smell disorders.", "titles": ["Potential neurological manifestations of COVID-19: a narrative review."]}}
'''
prompt = generate_prompt(related_abstracts, query_, format_prompt=format_prompt)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 7045


In [98]:
print(prompt)

Use the below abstracts of research papers from Long covid database to answer the subsequent question. If the answer cannot be found, write "I don't know."
Abstracts of research papers:
"""
Title: Are we sure that the neurological impact of COVID 19 in childhood has not been underestimated?
Abstract: BACKGROUND: Presently, it is known that, even if less frequently than in adults, children can develop a severe new coronavirus disease 2019 (COVID-19). Children with the SARS-CoV-2 infection can have neurological signs and symptoms of disease more frequently than previously thought, revealing the involvement of the central nervous system, the peripheral nervous system, or both. Aim of this manuscript is to highlight the neurologic complications associated with SARS-CoV-2 among pediatric patients with COVID-19, suggesting when to monitor carefully neurologic development.MAIN FINDINGS: Children with a severe chronic underlying disease, infants and toddlers and those who develop the so-called

In [99]:
result = get_chat_completion(prompt, model='gpt-4')

In [103]:
result = json.loads(result)

In [104]:
result.keys()

dict_keys(['Age', 'Gender', 'Pre-existing neurological conditions', 'Severity of COVID-19 infection', 'Pregnancy'])

In [105]:
with open('impact_factors.json', 'w') as f:
    json.dump(result, f, indent=4)

In [116]:
prompt = f'''The following factors have been found to impact the neurological sequelae of COVID-19 infection:
1. Age: Older individuals are more susceptible to developing life-threatening COVID-19 and cerebrovascular disease, such as stroke. A mild but inverse correlation with age was seen with CNS inflammatory diseases, such as encephalitis, as well as taste and/or smell disorders.

2. Pre-existing neurological conditions: Patients with pre-existing neurological conditions may be at elevated risk for COVID-19-associated neurological symptoms.

3. Severity of COVID-19 infection: Neurological symptoms are more common in patients with severe infection according to their respiratory status.

4. Chronic underlying disease in children: Children with a severe chronic underlying disease, infants and toddlers, and those who develop the so-called multisystem inflammatory syndrome (MIS-C) are those with the highest incidence of neurological complications.

5. Pregnancy: COVID-19 infection during pregnancy has been found associated with an increased risk of obstetric complications that can lead to neurological acute and long-term manifestations in neonates.

6. Comorbid cardiovascular risk factors: Increased age was also associated with comorbid cardiovascular risk factors, including hypertension, diabetes mellitus, and lipid disorders, but not with obesity. 

7. Obesity: Obesity did correlate with the development of critical COVID-19.

These factors are found in the following abstracts of papers:
{related_abstracts}

Please find the corresponding abstracts of papers for each factor and write them in the following json format:
{'{"Age": ["title of papers that mention Age", ...], "Pre-existing neurological conditions": [...], ...}'}
'''

token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 7142


In [117]:
result = get_chat_completion(prompt, model='gpt-4', system_prompt='You are an expert in medical field and help users do a variety of tasks based on your expertise.')

In [119]:
result = json.loads(result)

In [121]:
result

{'Age': ['Age-Associated Neurological Complications of COVID-19: A Systematic Review and Meta-Analysis.',
  'Neurological Sequelae of COVID-19.',
  'Neurological Aspects of SARS-CoV-2 Infection: Mechanisms and Manifestations.',
  'Neurological manifestations of COVID-19: A brief review.',
  'Long-Term Effects of SARS-CoV-2 in the Brain: Clinical Consequences and Molecular Mechanisms.'],
 'Pre-existing neurological conditions': ['Neurological manifestations of COVID-19: with emphasis on Iranian patients.',
  'Neuropsychiatric Disorders and COVID-19: What We Know So Far.',
  'Neurological manifestations of COVID-19: A brief review.'],
 'Severity of COVID-19 infection': ['Neurological manifestations of COVID-19: with emphasis on Iranian patients.',
  'Neurological manifestations of COVID-19: A brief review.',
  'Neurological Sequelae of COVID-19.',
  'Neurological Aspects of SARS-CoV-2 Infection: Mechanisms and Manifestations.',
  'Neurological manifestations of COVID-19: A brief review.'

In [120]:
with open('impact_factors_title.json', 'w') as f:
    json.dump(result, f, indent=4)

### Query3: What COVID-19 (coronavirus) subtypes have been found to cause neurological sequelae (manifestations) in existing research?

In [58]:
query3 = "What COVID-19 (coronavirus) subtypes have been found to cause neurological sequelae (manifestations) in existing research?"

In [59]:
query_ = query3
query_num = 3
query_embedding = get_embedding(query_)
top20_related = get_top_related_abstracts(query_embedding, top_n=20)
top20_related_pids = [pid for pid, _ in top20_related]
related_abstracts = make_related_abstracts_text(pids=top20_related_pids)
with open(f'related_abstracts_query{query_num}.txt', 'w') as f:
    f.write(related_abstracts)
prompt = generate_prompt(related_abstracts, query_)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6700


In [61]:
result = get_chat_completion(prompt)

In [63]:
print(result)

Based on the abstracts of the research papers, the COVID-19 subtypes that have been found to cause neurological sequelae (manifestations) in existing research are:

1. Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2) - mentioned in multiple abstracts.
2. Severe Acute Respiratory Syndrome Coronavirus (SARS-CoV-1) - mentioned in one abstract.
3. Middle East Respiratory Syndrome Coronavirus (MERS-CoV) - mentioned in one abstract.

Please note that these are the subtypes mentioned in the abstracts provided, and there may be other subtypes that have been studied in different research papers.


In [65]:
query3_2 = "What COVID-19 (SARS-CoV-2) subtypes have been found to cause neurological sequelae (manifestations) in existing research?"
query_ = query3_2
query_num = "3_2"
query_embedding = get_embedding(query_)
top20_related = get_top_related_abstracts(query_embedding, top_n=20)
top20_related_pids = [pid for pid, _ in top20_related]
related_abstracts = make_related_abstracts_text(pids=top20_related_pids)
with open(f'related_abstracts_query{query_num}.txt', 'w') as f:
    f.write(related_abstracts)
prompt = generate_prompt(related_abstracts, query_)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6661


In [66]:
result = get_chat_completion(prompt)

In [68]:
print(result)

Based on the abstracts of the research papers, the neurological manifestations of COVID-19 have been associated with various subtypes of SARS-CoV-2. These include:

1. Encephalitis
2. Meningitis
3. Acute cerebrovascular disease (including stroke)
4. Guillain-Barré Syndrome (GBS)
5. Hyposmia (reduced sense of smell)
6. Hypogeusia (reduced sense of taste)
7. Anosmia (loss of sense of smell)
8. Headache
9. Dizziness
10. Depression
11. Epileptic seizures
12. Cranial nerve palsies
13. Myalgia (muscle pain)
14. Delirium
15. Dysautonomia
16. Myopathy
17. Neuromuscular disorders
18. Cephalgia (headache)
19. Critical illness polyneuropathy
20. Ophthalmoparesis (weakness or paralysis of eye muscles)
21. Motor peripheral neuropathy

It is important to note that these are the reported neurological manifestations in the existing research, and further studies are needed to fully understand the neurological complications associated with COVID-19.


In [69]:
query3_3 = "What virus subtypes of SARS-CoV-2 have been found to cause neurological sequelae (manifestations) in existing research?"
query_ = query3_3
query_num = "3_3"
query_embedding = get_embedding(query_)
top20_related = get_top_related_abstracts(query_embedding, top_n=20)
top20_related_pids = [pid for pid, _ in top20_related]
related_abstracts = make_related_abstracts_text(pids=top20_related_pids)
with open(f'related_abstracts_query{query_num}.txt', 'w') as f:
    f.write(related_abstracts)
prompt = generate_prompt(related_abstracts, query_)
token_count = count_prompt_tokens(prompt)
print('token count:', token_count)

token count: 6807


In [70]:
result = get_chat_completion(prompt)

In [72]:
print(result)

Based on the abstracts of the research papers, the neurological manifestations of SARS-CoV-2 have been reported in patients infected with the virus causing COVID-19. However, the abstracts do not specifically mention the virus subtypes of SARS-CoV-2 that have been found to cause neurological sequelae. Therefore, based on the given information, it is not possible to determine the specific virus subtypes associated with neurological manifestations.
