# Final Model Testing


In [11]:
import web_crawler_data_set_up as wcd
from updated_rel_BM25 import BM25_updated_rel
from Final_Model import BM25_updated_qe

Gathering initial data and setting up problem

In [2]:
# Evaluation queries in keyword form. Each entry is (query text, label);
# the label is what the evaluation cells print next to each result list.
queries_labeled = [
    ("sudden fever body aches", "Flu"),
    ("difficulty breathing loss smell event", "Covid"),
    ("increased thirst unexpected weight loss", "Diabetes"),
    ("extreme fatigue normal sleep routine", "Addisons"),
    ("persistent sadness low energy", "Depression"),
    ("chest pain heart palpitations", "Cardiac Arrest"),
    ("wheezing exhaling worsened respiratory virus", "Asthma"),
    ("blurred vision blind spots halos around lights", "Glaucoma"),
    ("swollen lymph nodes tiny red spots skin easy bruising", "Leukemia"),
    ("bloody stool feel need pass stools bowels empty", "Crohns Disease"),
]

In [3]:
# Build the initial collection: load the site list, hand it to the scraper
# together with the output path, then read that file back as doc_data.
# NOTE(review): scrape_websites presumably hits the network on every run;
# confirm whether a cached doc_data.json should be reused instead.
doc_data_path = "doc_data.json"

websites = wcd.load_json("websites.json")
wcd.scrape_websites(websites, doc_data_path)
doc_data = wcd.load_json(doc_data_path)

# Annotation output is written to its own file and reloaded by later cells.
wcd.annotate_data(queries_labeled, doc_data, "annotated_data.json")

Model testing on original smaller dataset

In [35]:
# Baseline on the original collection: for every labeled query, fetch the
# top-k documents from the updated-relevance BM25 model and report their DCG.
relevance_data = wcd.load_json("annotated_data.json")
queries = [entry[0] for entry in queries_labeled]
doc_data = wcd.load_json("doc_data.json")
# NOTE(review): relevance_data and queries are not read again in this cell.

model = BM25_updated_rel(doc_data)
k_value = 5

for query, label in queries_labeled:
    ranked = model.top_docs(query, k_value, metric="zero_to_five")
    doc_list = [(doc_id, score) for doc_id, score in ranked]
    dcg = model.dcg(ranked)
    # One print call, three lines; the trailing "\n" leaves a blank line
    # between queries.
    print(
        f"Query: '{query}'",
        f"    DCG: {dcg}",
        f"    {label}: {doc_list}\n",
        sep="\n",
    )


Query: 'sudden fever body aches'
    DCG: 75.22694038245787
    Flu: [('Flu2', 5), ('Flu3', 5), ('Add3', 5), ('Cov1', 4), ('Car2', 3)]

Query: 'difficulty breathing loss smell event'
    DCG: 44.41653439949567
    Covid: [('Cov1', 5), ('Cov5', 4), ('Flu1', 2), ('Flu2', 2), ('Cov3', 2)]

Query: 'increased thirst unexpected weight loss'
    DCG: 36.84537735663818
    Diabetes: [('Dia5', 5), ('Cov5', 2), ('Add1', 2), ('Add3', 2), ('Dep1', 2)]

Query: 'extreme fatigue normal sleep routine'
    DCG: 60.226886783190885
    Addisons: [('Dia4', 5), ('Add2', 4), ('Dep2', 4), ('Dep3', 4), ('Car3', 4)]

Query: 'persistent sadness low energy'
    DCG: 48.139240631789235
    Depression: [('Dep3', 5), ('Dep2', 4), ('Dep4', 3), ('Cro2', 3), ('Dia1', 2)]

Query: 'chest pain heart palpitations'
    DCG: 41.36909637092401
    Cardiac Arrest: [('Car1', 5), ('Car2', 3), ('Car3', 3), ('Flu2', 2), ('Cov1', 2)]

Query: 'wheezing exhaling worsened respiratory virus'
    DCG: 53.6866518607274
    Asthma: [('As

In [36]:
# Query-expansion model with n=1 on the original collection: report NDCG
# for each labeled query together with all documents ordered by score.
relevance_data = wcd.load_json("annotated_data.json")
queries = [query[0] for query in queries_labeled]
doc_data = wcd.load_json("doc_data.json")

model = BM25_updated_qe(doc_data, n=1)
k_value = 5

for (query, label) in queries_labeled:
    docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    sorted_top_docs = sorted(docs, key=lambda x: x[1], reverse=True)

    doc_list = [(doc_id, score) for doc_id, score in sorted_top_docs]

    # NOTE(review): the slice is taken from the unsorted `docs`, not from
    # sorted_top_docs. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order NDCG@k is supposed to evaluate.
    ndcg = model.ndcg(docs[:k_value])
    print(f"Query: '{query}'")
    # The value comes from model.ndcg, so label it NDCG (it was printed as "DCG").
    print(f"    NDCG: {ndcg}")
    print(f"    {label}: {doc_list}\n")


Query: 'sudden fever body aches'
    DCG: 0.7375861965459806
    Flu: [('Flu2', 5), ('Flu3', 5), ('Add3', 5), ('Cov1', 4), ('Car2', 3), ('Flu1', 2), ('Flu4', 2), ('Cov5', 2), ('Dep1', 2), ('Car1', 2), ('Car4', 2), ('Gla3', 2), ('Leu1', 2), ('Cro1', 2), ('Cro2', 2), ('Cro3', 2), ('Cov2', 1), ('Cov4', 1), ('Dia1', 1), ('Dia4', 1), ('Add1', 1), ('Add2', 1), ('Dep2', 1), ('Dep3', 1), ('Dep5', 1), ('Ast1', 1), ('Ast4', 1), ('Ast6', 1), ('Leu2', 1), ('Cro4', 1), ('Flu5', 0), ('Cov3', 0), ('Dia2', 0), ('Dia3', 0), ('Dia5', 0), ('Add4', 0), ('Dep4', 0), ('Car3', 0), ('Car5', 0), ('Ast2', 0), ('Ast3', 0), ('Ast5', 0), ('Gla1', 0), ('Gla2', 0), ('Gla4', 0), ('Gla5', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro5', 0)]

Query: 'difficulty breathing loss smell event'
    DCG: 1.0
    Covid: [('Cov1', 5), ('Cov5', 4), ('Flu1', 2), ('Flu2', 2), ('Cov3', 2), ('Cov4', 2), ('Add2', 2), ('Dep2', 2), ('Dep3', 2), ('Car3', 2), ('Ast1', 2), ('Gla1', 2), ('Flu3', 1), ('Cov2', 1), ('Dia4', 1), ('Dia5', 1)

Using seed websites for web crawling to increase the size of the document collection

In [4]:
# Seed URLs for the crawler, one list per condition.
flu_seeds = [
    "https://www.who.int/news-room/fact-sheets/detail/influenza-(seasonal)",
    "https://www.mayoclinic.org/diseases-conditions/search-results?q=flu",
    "https://my.clevelandclinic.org/health/diseases/4335-influenza-flu",
    "https://www.healthline.com/health/flu-causes",
    "https://www.yalemedicine.org/conditions/flu",
    "https://www.cdc.gov/flu/symptoms/symptoms.htm",
]

covid_seeds = [
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019",
    "https://www.cdc.gov/coronavirus/2019-ncov/index.html",
    "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
]

diabetes_seeds = [
    "https://www.medicalnewstoday.com/info/diabetes",
    "https://www.mayoclinic.org/diseases-conditions/diabetes/symptoms-causes/syc-20371444",
    "https://www.cdc.gov/diabetes/index.html",
]

addisons_seeds = [
    "https://www.niddk.nih.gov/health-information/endocrine-diseases/addisons-disease",
    "https://www.medicalnewstoday.com/articles/164648",
    "https://www.healthline.com/health/addisons-disease",
    "https://rarediseases.org/rare-diseases/addisons-disease/",
    "https://www.mayoclinic.org/diseases-conditions/addisons-disease/symptoms-causes/syc-20350293",
    "https://www.webmd.com/a-to-z-guides/addisons-disease#1",
    "https://rarediseases.info.nih.gov/diseases/5779/addisons-disease",
    "https://www.cedars-sinai.org/health-library/diseases-and-conditions/a/addisons-disease.html",
    "https://www.uptodate.com/contents/addisons-disease-clinical-manifestations-diagnosis-and-treatment",
    "https://patient.info/doctor/addisons-disease",
    "https://emedicine.medscape.com/article/116467-overview",
    "https://www.cdc.gov/genomics/resources/diseases/addisons.htm",
]

depression_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/depression/symptoms-causes/syc-20356007",
    "https://www.webmd.com/depression/default.htm",
    "https://www.psychologytoday.com/us/basics/depression",
    "https://www.nimh.nih.gov/health/topics/depression/index.shtml",
]

cardiac_arrest_seeds = [
    "https://www.heart.org/en/health-topics/heart-attack",
    "https://www.healthline.com/health/heart-attack",
    "https://www.mayoclinic.org/diseases-conditions/sudden-cardiac-arrest/symptoms-causes/syc-20350634",
    "https://www.heart.org/en/health-topics/cardiac-arrest",
    "https://www.nhlbi.nih.gov/health-topics/sudden-cardiac-arrest",
    "https://www.medicinenet.com/sudden_cardiac_arrest/article.htm",
    "https://www.health.harvard.edu/heart-health/sudden-cardiac-arrest-what-you-need-to-know",
    "https://www.nhs.uk/conditions/cardiac-arrest/",
    "https://www.heart.org/en/news/2023/02/09/this-is-what-a-cardiac-arrest-looks-like-and-why-you-need-to-know",
]

asthma_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/asthma/symptoms-causes/syc-20369653",
    "https://www.webmd.com/asthma/default.htm",
    "https://www.lung.org/lung-health-diseases/lung-disease-lookup/asthma",
    "https://www.cdc.gov/asthma/index.html",
    "https://www.nhlbi.nih.gov/health-topics/asthma",
    "https://www.healthline.com/health/asthma",
    "https://www.medicalnewstoday.com/articles/323129",
    "https://www.aaaai.org/conditions-and-treatments/asthma",
]

glaucoma_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/glaucoma/symptoms-causes/syc-20372839",
    "https://www.webmd.com/eye-health/glaucoma/default.htm",
    "https://www.aao.org/eye-health/diseases/what-is-glaucoma",
    "https://www.glaucoma.org/glaucoma/",
]

leukemia_seeds = [
    "https://www.cancer.org/cancer/leukemia.html",
    "https://www.mayoclinic.org/diseases-conditions/leukemia/symptoms-causes/syc-20374373",
    "https://www.webmd.com/cancer/lymphoma/understanding-leukemia-basics",
    "https://www.lls.org/leukemia",
    "https://www.cancer.gov/types/leukemia",
    "https://www.cancer.net/cancer-types/leukemia-acute-lymphoblastic-all/statistics",
    "https://www.medicalnewstoday.com/articles/142595",
    "https://www.healthline.com/health/leukemia",
]

crohns_disease_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/crohns-disease/symptoms-causes/syc-20353304",
    "https://www.webmd.com/ibd-crohns-disease/default.htm",
    "https://www.crohnscolitisfoundation.org/what-is-crohns-disease",
    "https://www.cdc.gov/ibd/data-statistics.htm",
    "https://www.niddk.nih.gov/health-information/digestive-diseases/crohns-disease",
    "https://www.medicalnewstoday.com/articles/151620",
    "https://www.healthline.com/health/crohns-disease",
    "https://www.gastro.org/practice-guidance/gi-patient-center/topic/crohns-disease",
]

# Crawl from each seed list; the second argument is the keyword string
# passed to the crawler for that condition.
flu_websites = wcd.web_crawler(flu_seeds, "flu")
covid_websites = wcd.web_crawler(covid_seeds, "covid")
diabetes_websites = wcd.web_crawler(diabetes_seeds, "diabetes")
addisons_websites = wcd.web_crawler(addisons_seeds, "addisons")
depression_websites = wcd.web_crawler(depression_seeds, "depression")
cardiac_arrest_websites = wcd.web_crawler(cardiac_arrest_seeds, "cardiac")
asthma_websites = wcd.web_crawler(asthma_seeds, "asthma")
glaucoma_websites = wcd.web_crawler(glaucoma_seeds, "glaucoma")
leukemia_websites = wcd.web_crawler(leukemia_seeds, "leukemia")
crohns_disease_websites = wcd.web_crawler(crohns_disease_seeds, "crohns-disease")

# Record every crawl result under its condition name, in the same order as
# before and only after all crawls have finished. Every call receives the
# same `websites` object and the same output path.
updated_websites_path = "updated_websites.json"
for condition, crawled_sites in (
    ("Flu", flu_websites),
    ("Covid", covid_websites),
    ("Diabetes", diabetes_websites),
    ("Addisons Disease", addisons_websites),
    ("Depression", depression_websites),
    ("Cardiac Arrest", cardiac_arrest_websites),
    ("Asthma", asthma_websites),
    ("Glaucoma", glaucoma_websites),
    ("Leukemia", leukemia_websites),
    ("Crohns Disease", crohns_disease_websites),
):
    wcd.update_websites_json(condition, crawled_sites, websites, updated_websites_path)

Get and annotate data for larger model collection

In [5]:
# Rebuild the document data and annotations from the updated site list.
# Every later cell that uses doc_data reads the value assigned here.
updated_doc_data_path = "updated_doc_data.json"
updated_annotations_path = "updated_annotated_data.json"

websites = wcd.load_json("updated_websites.json")
wcd.scrape_websites(websites, updated_doc_data_path)
doc_data = wcd.load_json(updated_doc_data_path)

wcd.annotate_data(queries_labeled, doc_data, updated_annotations_path)
relevance_data = wcd.load_json(updated_annotations_path)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


## Run Larger Model With k = 5, k = 10, k = 20

### With Updated Relevance Scores

In [54]:
# Updated-relevance BM25 on the current doc_data, cutoff k = 5.
model = BM25_updated_rel(doc_data)
k_value = 5

for query, label in queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0.7375861965459806
Flu: [('Flu2', 5), ('Flu3', 5), ('Add3', 5), ('Cov1', 4), ('Car2', 3), ('Flu1', 2), ('Flu4', 2), ('Cov5', 2), ('Dep1', 2), ('Car1', 2), ('Car4', 2), ('Gla3', 2), ('Leu1', 2), ('Cro1', 2), ('Cro2', 2), ('Cro3', 2), ('Cov2', 1), ('Cov4', 1), ('Dia1', 1), ('Dia4', 1), ('Add1', 1), ('Add2', 1), ('Dep2', 1), ('Dep3', 1), ('Dep5', 1), ('Ast1', 1), ('Ast4', 1), ('Ast6', 1), ('Leu2', 1), ('Cro4', 1), ('Flu5', 0), ('Cov3', 0), ('Dia2', 0), ('Dia3', 0), ('Dia5', 0), ('Add4', 0), ('Dep4', 0), ('Car3', 0), ('Car5', 0), ('Ast2', 0), ('Ast3', 0), ('Ast5', 0), ('Gla1', 0), ('Gla2', 0), ('Gla4', 0), ('Gla5', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro5', 0)]

NDCG: 1.0
Covid: [('Cov1', 5), ('Cov5', 4), ('Flu1', 2), ('Flu2', 2), ('Cov3', 2), ('Cov4', 2), ('Add2', 2), ('Dep2', 2), ('Dep3', 2), ('Car3', 2), ('Ast1', 2), ('Gla1', 2), ('Flu3', 1), ('Cov2', 1), ('Dia4', 1), ('Dia5', 1), ('Add1', 1), ('Add3', 1), ('Dep1', 1), ('Dep4', 1), ('Car1', 1), ('Car2', 1), ('Car4', 1), (

In [55]:
# Updated-relevance BM25 on the current doc_data, cutoff k = 10.
model = BM25_updated_rel(doc_data)
k_value = 10

for query, label in queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0.7423491246772616
Flu: [('Flu2', 5), ('Flu3', 5), ('Add3', 5), ('Cov1', 4), ('Car2', 3), ('Flu1', 2), ('Flu4', 2), ('Cov5', 2), ('Dep1', 2), ('Car1', 2), ('Car4', 2), ('Gla3', 2), ('Leu1', 2), ('Cro1', 2), ('Cro2', 2), ('Cro3', 2), ('Cov2', 1), ('Cov4', 1), ('Dia1', 1), ('Dia4', 1), ('Add1', 1), ('Add2', 1), ('Dep2', 1), ('Dep3', 1), ('Dep5', 1), ('Ast1', 1), ('Ast4', 1), ('Ast6', 1), ('Leu2', 1), ('Cro4', 1), ('Flu5', 0), ('Cov3', 0), ('Dia2', 0), ('Dia3', 0), ('Dia5', 0), ('Add4', 0), ('Dep4', 0), ('Car3', 0), ('Car5', 0), ('Ast2', 0), ('Ast3', 0), ('Ast5', 0), ('Gla1', 0), ('Gla2', 0), ('Gla4', 0), ('Gla5', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro5', 0)]

NDCG: 0.49755115985642684
Covid: [('Cov1', 5), ('Cov5', 4), ('Flu1', 2), ('Flu2', 2), ('Cov3', 2), ('Cov4', 2), ('Add2', 2), ('Dep2', 2), ('Dep3', 2), ('Car3', 2), ('Ast1', 2), ('Gla1', 2), ('Flu3', 1), ('Cov2', 1), ('Dia4', 1), ('Dia5', 1), ('Add1', 1), ('Add3', 1), ('Dep1', 1), ('Dep4', 1), ('Car1', 1), ('Car2', 1)

In [56]:
# Updated-relevance BM25 on the current doc_data, cutoff k = 20.
model = BM25_updated_rel(doc_data)
k_value = 20

for query, label in queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0.7044525551454779
Flu: [('Flu2', 5), ('Flu3', 5), ('Add3', 5), ('Cov1', 4), ('Car2', 3), ('Flu1', 2), ('Flu4', 2), ('Cov5', 2), ('Dep1', 2), ('Car1', 2), ('Car4', 2), ('Gla3', 2), ('Leu1', 2), ('Cro1', 2), ('Cro2', 2), ('Cro3', 2), ('Cov2', 1), ('Cov4', 1), ('Dia1', 1), ('Dia4', 1), ('Add1', 1), ('Add2', 1), ('Dep2', 1), ('Dep3', 1), ('Dep5', 1), ('Ast1', 1), ('Ast4', 1), ('Ast6', 1), ('Leu2', 1), ('Cro4', 1), ('Flu5', 0), ('Cov3', 0), ('Dia2', 0), ('Dia3', 0), ('Dia5', 0), ('Add4', 0), ('Dep4', 0), ('Car3', 0), ('Car5', 0), ('Ast2', 0), ('Ast3', 0), ('Ast5', 0), ('Gla1', 0), ('Gla2', 0), ('Gla4', 0), ('Gla5', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro5', 0)]

NDCG: 0.5135760561472501
Covid: [('Cov1', 5), ('Cov5', 4), ('Flu1', 2), ('Flu2', 2), ('Cov3', 2), ('Cov4', 2), ('Add2', 2), ('Dep2', 2), ('Dep3', 2), ('Car3', 2), ('Ast1', 2), ('Gla1', 2), ('Flu3', 1), ('Cov2', 1), ('Dia4', 1), ('Dia5', 1), ('Add1', 1), ('Add3', 1), ('Dep1', 1), ('Dep4', 1), ('Car1', 1), ('Car2', 1),

### With Query Expansion

In [45]:
# Natural-language versions of the evaluation queries, used by the
# query-expansion cells. Each entry is (query text, label), with labels in
# the same order as the keyword queries.
updated_queries_labeled = [
    ("i suddenly got fever and my body aches", "Flu"),
    ("i am having difficulty breathing and lost the sense of smell", "Covid"),
    ("i am experiencing an increased amount of thirst and have had unexpected weight loss", "Diabetes"),
    ("i feel extreme fatigue but I have a normal sleeping routine", "Addisons"),
    ("i persistently feel sadness and have low energy", "Depression"),
    ("i am having a lot of chest pain and am experiencing heart palpitations", "Cardiac Arrest"),
    ("i wheeze when I exhale and feel that it is worsened by a respiratory virus", "Asthma"),
    ("i am experiencing blurred vision, see blind spots, and halos around lights", "Glaucoma"),
    ("i have swollen lymph nodes, tiny red spots on my skin, and I bruise easily", "Leukemia"),
    # "bowels" was misspelled "bowls". The query text is what the model
    # receives, so the typo replaced the symptom term with an unrelated word.
    ("i feel the need to pass stools even though my bowels are empty, and when do pass stool, it is bloody", "Crohns Disease")
    ]

#### With ngrams = 2

In [46]:
# Query-expansion model with n=2 on the natural-language queries, k = 5.
model = BM25_updated_qe(doc_data, n=2)
k_value = 5

for query, label in updated_queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0.5337519330784549
Flu: [('Flu3', 5), ('Cov5', 3), ('Flu2', 2), ('Cov1', 2), ('Flu4', 1), ('Flu1', 0), ('Flu5', 0), ('Cov2', 0), ('Cov3', 0), ('Cov4', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('Dep5', 0), ('Car1', 0), ('Car2', 0), ('Car3', 0), ('Car4', 0), ('Car5', 0), ('Ast1', 0), ('Ast2', 0), ('Ast3', 0), ('Ast4', 0), ('Ast5', 0), ('Ast6', 0), ('Gla1', 0), ('Gla2', 0), ('Gla3', 0), ('Gla4', 0), ('Gla5', 0), ('Leu1', 0), ('Leu2', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro1', 0), ('Cro2', 0), ('Cro3', 0), ('Cro4', 0), ('Cro5', 0)]

NDCG: 1.0
Covid: [('Cov5', 5), ('Dia4', 3), ('Flu1', 2), ('Flu2', 2), ('Cov1', 2), ('Cov4', 2), ('Gla4', 2), ('Cov3', 1), ('Add1', 1), ('Dep4', 1), ('Dep5', 1), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov2', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia5', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), (

In [48]:
# Query-expansion model with n=2 on the natural-language queries, k = 10.
model = BM25_updated_qe(doc_data, n=2)
k_value = 10

for query, label in updated_queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0.5419181087275041
Flu: [('Flu3', 5), ('Cov5', 3), ('Flu2', 2), ('Cov1', 2), ('Flu4', 1), ('Flu1', 0), ('Flu5', 0), ('Cov2', 0), ('Cov3', 0), ('Cov4', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('Dep5', 0), ('Car1', 0), ('Car2', 0), ('Car3', 0), ('Car4', 0), ('Car5', 0), ('Ast1', 0), ('Ast2', 0), ('Ast3', 0), ('Ast4', 0), ('Ast5', 0), ('Ast6', 0), ('Gla1', 0), ('Gla2', 0), ('Gla3', 0), ('Gla4', 0), ('Gla5', 0), ('Leu1', 0), ('Leu2', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro1', 0), ('Cro2', 0), ('Cro3', 0), ('Cro4', 0), ('Cro5', 0)]

NDCG: 0.4338787043516948
Covid: [('Cov5', 5), ('Dia4', 3), ('Flu1', 2), ('Flu2', 2), ('Cov1', 2), ('Cov4', 2), ('Gla4', 2), ('Cov3', 1), ('Add1', 1), ('Dep4', 1), ('Dep5', 1), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov2', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia5', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0),

In [50]:
# Query-expansion model with n=2 on the natural-language queries, k = 20.
model = BM25_updated_qe(doc_data, n=2)
k_value = 20

for query, label in updated_queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0.5419181087275041
Flu: [('Flu3', 5), ('Cov5', 3), ('Flu2', 2), ('Cov1', 2), ('Flu4', 1), ('Flu1', 0), ('Flu5', 0), ('Cov2', 0), ('Cov3', 0), ('Cov4', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('Dep5', 0), ('Car1', 0), ('Car2', 0), ('Car3', 0), ('Car4', 0), ('Car5', 0), ('Ast1', 0), ('Ast2', 0), ('Ast3', 0), ('Ast4', 0), ('Ast5', 0), ('Ast6', 0), ('Gla1', 0), ('Gla2', 0), ('Gla3', 0), ('Gla4', 0), ('Gla5', 0), ('Leu1', 0), ('Leu2', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro1', 0), ('Cro2', 0), ('Cro3', 0), ('Cro4', 0), ('Cro5', 0)]

NDCG: 0.4424159973669237
Covid: [('Cov5', 5), ('Dia4', 3), ('Flu1', 2), ('Flu2', 2), ('Cov1', 2), ('Cov4', 2), ('Gla4', 2), ('Cov3', 1), ('Add1', 1), ('Dep4', 1), ('Dep5', 1), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov2', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia5', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0),

#### With ngrams = 3

In [51]:
# Query-expansion model with n=3 on the natural-language queries, k = 5.
model = BM25_updated_qe(doc_data, n=3)
k_value = 5

for query, label in updated_queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. The recorded output for this cell shows NDCG 0 for
    # Covid even though a score-5 document appears in the printed list, so
    # the slice may not contain the top-ranked documents. Confirm the
    # ordering that top_docs(..., to_sort=False) returns.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0
Flu: [('Flu1', 0), ('Flu2', 0), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov1', 0), ('Cov2', 0), ('Cov3', 0), ('Cov4', 0), ('Cov5', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('Dep5', 0), ('Car1', 0), ('Car2', 0), ('Car3', 0), ('Car4', 0), ('Car5', 0), ('Ast1', 0), ('Ast2', 0), ('Ast3', 0), ('Ast4', 0), ('Ast5', 0), ('Ast6', 0), ('Gla1', 0), ('Gla2', 0), ('Gla3', 0), ('Gla4', 0), ('Gla5', 0), ('Leu1', 0), ('Leu2', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro1', 0), ('Cro2', 0), ('Cro3', 0), ('Cro4', 0), ('Cro5', 0)]

NDCG: 0
Covid: [('Cov5', 5), ('Cov4', 1), ('Flu1', 0), ('Flu2', 0), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov1', 0), ('Cov2', 0), ('Cov3', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('Dep5', 0), ('Car1'

In [52]:
# Query-expansion model with n=3 on the natural-language queries, k = 10.
model = BM25_updated_qe(doc_data, n=3)
k_value = 10

for query, label in updated_queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0
Flu: [('Flu1', 0), ('Flu2', 0), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov1', 0), ('Cov2', 0), ('Cov3', 0), ('Cov4', 0), ('Cov5', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('Dep5', 0), ('Car1', 0), ('Car2', 0), ('Car3', 0), ('Car4', 0), ('Car5', 0), ('Ast1', 0), ('Ast2', 0), ('Ast3', 0), ('Ast4', 0), ('Ast5', 0), ('Ast6', 0), ('Gla1', 0), ('Gla2', 0), ('Gla3', 0), ('Gla4', 0), ('Gla5', 0), ('Leu1', 0), ('Leu2', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro1', 0), ('Cro2', 0), ('Cro3', 0), ('Cro4', 0), ('Cro5', 0)]

NDCG: 0.2928159141598652
Covid: [('Cov5', 5), ('Cov4', 1), ('Flu1', 0), ('Flu2', 0), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov1', 0), ('Cov2', 0), ('Cov3', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('D

In [53]:
# Query-expansion model with n=3 on the natural-language queries, k = 20.
model = BM25_updated_qe(doc_data, n=3)
k_value = 20

for query, label in updated_queries_labeled:
    scored_docs = model.top_docs(query, k_value, metric="zero_to_five", to_sort=False)

    # Display order only: highest second tuple element first.
    doc_list = [
        (doc_id, score)
        for doc_id, score in sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    ]

    # NOTE(review): NDCG is computed on the first k_value entries of the
    # unsorted list. Confirm that top_docs(..., to_sort=False) returns
    # documents in the ranking order this metric should evaluate.
    ndcg = model.ndcg(scored_docs[:k_value])
    print(f"NDCG: {ndcg}", f"{label}: {doc_list}\n", sep="\n")

NDCG: 0
Flu: [('Flu1', 0), ('Flu2', 0), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov1', 0), ('Cov2', 0), ('Cov3', 0), ('Cov4', 0), ('Cov5', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('Dep5', 0), ('Car1', 0), ('Car2', 0), ('Car3', 0), ('Car4', 0), ('Car5', 0), ('Ast1', 0), ('Ast2', 0), ('Ast3', 0), ('Ast4', 0), ('Ast5', 0), ('Ast6', 0), ('Gla1', 0), ('Gla2', 0), ('Gla3', 0), ('Gla4', 0), ('Gla5', 0), ('Leu1', 0), ('Leu2', 0), ('Leu3', 0), ('Leu4', 0), ('Leu5', 0), ('Cro1', 0), ('Cro2', 0), ('Cro3', 0), ('Cro4', 0), ('Cro5', 0)]

NDCG: 0.2928159141598652
Covid: [('Cov5', 5), ('Cov4', 1), ('Flu1', 0), ('Flu2', 0), ('Flu3', 0), ('Flu4', 0), ('Flu5', 0), ('Cov1', 0), ('Cov2', 0), ('Cov3', 0), ('Dia1', 0), ('Dia2', 0), ('Dia3', 0), ('Dia4', 0), ('Dia5', 0), ('Add1', 0), ('Add2', 0), ('Add3', 0), ('Add4', 0), ('Dep1', 0), ('Dep2', 0), ('Dep3', 0), ('Dep4', 0), ('D