In [2]:
import web_crawler_data_set_up as wcd
from BM25 import BM25

Gathering initial data and setting up problem

In [3]:
# Paths for the hand-picked site list and for the documents scraped from it.
seed_sites_path = "websites.json"
scraped_docs_path = "doc_data.json"

# Load the site list, scrape it, then read the scraped documents back in.
# NOTE(review): scrape_websites presumably writes `scraped_docs_path` and hits
# the network on every run of this cell — confirm in web_crawler_data_set_up.
websites = wcd.load_json(seed_sites_path)
wcd.scrape_websites(websites, scraped_docs_path)
doc_data = wcd.load_json(scraped_docs_path)
# Evaluation queries: one symptom-style query per condition, stored as
# (query text, condition label) tuples in a fixed order.
queries_labeled = [
    (symptom_query, condition)
    for condition, symptom_query in (
        ("Flu", "sudden fever body aches"),
        ("Covid", "difficulty breathing loss smell event"),
        ("Diabetes", "increased thirst unexpected weight loss"),
        ("Addisons", "extreme fatigue normal sleep routine"),
        ("Depression", "persistent sadness low energy"),
        ("Cardiac Arrest", "chest pain heart palpitations"),
        ("Asthma", "wheezing exhaling worsened respiratory virus"),
        ("Glaucoma", "blurred vision blind spots halos around lights"),
        ("Leukemia", "swollen lymph nodes tiny red spots skin easy bruising"),
        ("Crohns Disease", "bloody stool feel need pass stools bowels empty"),
    )
]
# Annotate the scraped documents against the labelled queries; the result is
# read back from "annotated_data.json" as the relevance data for evaluation.
# NOTE(review): the annotation rule itself lives in wcd.annotate_data —
# presumably it matches each query's label to the documents; confirm there.
wcd.annotate_data(queries_labeled, doc_data, "annotated_data.json")

Model testing on original smaller dataset

In [17]:
# Evaluate BM25 on the original (small) collection at cutoff k = 5.
relevance_data = wcd.load_json("annotated_data.json")
queries = [query_text for query_text, _label in queries_labeled]
model = BM25(doc_data)
k_value = 5

# Aggregate score over all queries.
print(model.mean_avg_precision(queries, relevance_data, k_value))

# Per-condition listing of the k top-ranked (doc_id, score) pairs.
for query, label in queries_labeled:
    doc_list = [(doc_id, score) for doc_id, score in model.top_docs(query, k_value)]
    print(f"{label}: {doc_list}")


0.8733333333333334
Flu: [('Flu2', 7.282656090558841), ('Add3', 7.2056934598174704), ('Flu3', 6.731536274554822), ('Cov1', 5.330354178526775), ('Car2', 4.637879438524966)]
Covid: [('Cov1', 8.130651554305484), ('Cov5', 6.498813509152596), ('Flu2', 3.9803716719895545), ('Add2', 3.567379264541694), ('Ast1', 3.528635245337877)]
Diabetes: [('Dia5', 9.453971906671857), ('Cro4', 4.170029939563392), ('Dep1', 3.8268137995691553), ('Dep3', 3.7589755425739546), ('Cro1', 3.4617618379816046)]
Addisons: [('Dia4', 6.864605112355704), ('Dep3', 6.01821930390569), ('Car3', 6.009692429000052), ('Dep2', 5.469185544288353), ('Add2', 5.241511302794987)]
Depression: [('Dep3', 9.862588937173076), ('Dep2', 7.266303255431771), ('Dep4', 6.480532759916773), ('Cro2', 5.8055371084289185), ('Dep1', 4.82217258946266)]
Cardiac Arrest: [('Car1', 9.861215155910852), ('Car2', 5.216878300018213), ('Car3', 5.19923753670662), ('Flu2', 4.973921433958047), ('Dia1', 3.662913597616614)]
Asthma: [('Ast1', 11.167145113450816), ('C

Use seed websites for web crawling to grow the document collection

In [3]:
# Seed URLs per condition. Each list is the starting point for one crawl that
# gathers additional documents for the collection.

# Flu
# NOTE(review): the second entry is a search-results URL rather than an
# article page — confirm that is the intended seed.
flu_seeds = [
    "https://www.who.int/news-room/fact-sheets/detail/influenza-(seasonal)",
    "https://www.mayoclinic.org/diseases-conditions/search-results?q=flu",
    "https://my.clevelandclinic.org/health/diseases/4335-influenza-flu",
    "https://www.healthline.com/health/flu-causes",
    "https://www.yalemedicine.org/conditions/flu",
    "https://www.cdc.gov/flu/symptoms/symptoms.htm",
]

# Covid
covid_seeds = [
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019",
    "https://www.cdc.gov/coronavirus/2019-ncov/index.html",
    "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
]

# Diabetes
diabetes_seeds = [
    "https://www.medicalnewstoday.com/info/diabetes",
    "https://www.mayoclinic.org/diseases-conditions/diabetes/symptoms-causes/syc-20371444",
    "https://www.cdc.gov/diabetes/index.html",
]

# Addison's disease
addisons_seeds = [
    "https://www.niddk.nih.gov/health-information/endocrine-diseases/addisons-disease",
    "https://www.medicalnewstoday.com/articles/164648",
    "https://www.healthline.com/health/addisons-disease",
    "https://rarediseases.org/rare-diseases/addisons-disease/",
    "https://www.mayoclinic.org/diseases-conditions/addisons-disease/symptoms-causes/syc-20350293",
    "https://www.webmd.com/a-to-z-guides/addisons-disease#1",
    "https://rarediseases.info.nih.gov/diseases/5779/addisons-disease",
    "https://www.cedars-sinai.org/health-library/diseases-and-conditions/a/addisons-disease.html",
    "https://www.uptodate.com/contents/addisons-disease-clinical-manifestations-diagnosis-and-treatment",
    "https://patient.info/doctor/addisons-disease",
    "https://emedicine.medscape.com/article/116467-overview",
    "https://www.cdc.gov/genomics/resources/diseases/addisons.htm",
]

# Depression
depression_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/depression/symptoms-causes/syc-20356007",
    "https://www.webmd.com/depression/default.htm",
    "https://www.psychologytoday.com/us/basics/depression",
    "https://www.nimh.nih.gov/health/topics/depression/index.shtml",
]

# Cardiac arrest (the list mixes heart-attack and cardiac-arrest pages)
cardiac_arrest_seeds = [
    "https://www.heart.org/en/health-topics/heart-attack",
    "https://www.healthline.com/health/heart-attack",
    "https://www.mayoclinic.org/diseases-conditions/sudden-cardiac-arrest/symptoms-causes/syc-20350634",
    "https://www.heart.org/en/health-topics/cardiac-arrest",
    "https://www.nhlbi.nih.gov/health-topics/sudden-cardiac-arrest",
    "https://www.medicinenet.com/sudden_cardiac_arrest/article.htm",
    "https://www.health.harvard.edu/heart-health/sudden-cardiac-arrest-what-you-need-to-know",
    "https://www.nhs.uk/conditions/cardiac-arrest/",
    "https://www.heart.org/en/news/2023/02/09/this-is-what-a-cardiac-arrest-looks-like-and-why-you-need-to-know",
]

# Asthma
asthma_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/asthma/symptoms-causes/syc-20369653",
    "https://www.webmd.com/asthma/default.htm",
    "https://www.lung.org/lung-health-diseases/lung-disease-lookup/asthma",
    "https://www.cdc.gov/asthma/index.html",
    "https://www.nhlbi.nih.gov/health-topics/asthma",
    "https://www.healthline.com/health/asthma",
    "https://www.medicalnewstoday.com/articles/323129",
    "https://www.aaaai.org/conditions-and-treatments/asthma",
]

# Glaucoma
glaucoma_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/glaucoma/symptoms-causes/syc-20372839",
    "https://www.webmd.com/eye-health/glaucoma/default.htm",
    "https://www.aao.org/eye-health/diseases/what-is-glaucoma",
    "https://www.glaucoma.org/glaucoma/",
]

# Leukemia
leukemia_seeds = [
    "https://www.cancer.org/cancer/leukemia.html",
    "https://www.mayoclinic.org/diseases-conditions/leukemia/symptoms-causes/syc-20374373",
    "https://www.webmd.com/cancer/lymphoma/understanding-leukemia-basics",
    "https://www.lls.org/leukemia",
    "https://www.cancer.gov/types/leukemia",
    "https://www.cancer.net/cancer-types/leukemia-acute-lymphoblastic-all/statistics",
    "https://www.medicalnewstoday.com/articles/142595",
    "https://www.healthline.com/health/leukemia",
]

# Crohn's disease
crohns_disease_seeds = [
    "https://www.mayoclinic.org/diseases-conditions/crohns-disease/symptoms-causes/syc-20353304",
    "https://www.webmd.com/ibd-crohns-disease/default.htm",
    "https://www.crohnscolitisfoundation.org/what-is-crohns-disease",
    "https://www.cdc.gov/ibd/data-statistics.htm",
    "https://www.niddk.nih.gov/health-information/digestive-diseases/crohns-disease",
    "https://www.medicalnewstoday.com/articles/151620",
    "https://www.healthline.com/health/crohns-disease",
    "https://www.gastro.org/practice-guidance/gi-patient-center/topic/crohns-disease",
]

# One crawl per condition: (seed URL list, keyword passed to the crawler).
# NOTE(review): how web_crawler uses the keyword is defined in
# web_crawler_data_set_up — presumably to filter discovered links; confirm.
crawl_jobs = [
    (flu_seeds, "flu"),
    (covid_seeds, "covid"),
    (diabetes_seeds, "diabetes"),
    (addisons_seeds, "addisons"),
    (depression_seeds, "depression"),
    (cardiac_arrest_seeds, "cardiac"),
    (asthma_seeds, "asthma"),
    (glaucoma_seeds, "glaucoma"),
    (leukemia_seeds, "leukemia"),
    (crohns_disease_seeds, "crohns-disease"),
]

# Crawls run sequentially, in the order listed above.
crawl_results = [wcd.web_crawler(seed_urls, keyword) for seed_urls, keyword in crawl_jobs]

# Bind each result to its per-condition name (same order as crawl_jobs).
(
    flu_websites,
    covid_websites,
    diabetes_websites,
    addisons_websites,
    depression_websites,
    cardiac_arrest_websites,
    asthma_websites,
    glaucoma_websites,
    leukemia_websites,
    crohns_disease_websites,
) = crawl_results

# Record the crawled sites for each condition in the updated site list.
# NOTE(review): every call receives the same `websites` object and the same
# output file; whether entries accumulate across calls depends on
# update_websites_json — confirm it does not overwrite earlier conditions.
# NOTE(review): the label here is "Addisons Disease" while queries_labeled
# uses "Addisons" — confirm the difference is intentional.
updated_sites_path = "updated_websites.json"
for condition_label, crawled_sites in [
    ("Flu", flu_websites),
    ("Covid", covid_websites),
    ("Diabetes", diabetes_websites),
    ("Addisons Disease", addisons_websites),
    ("Depression", depression_websites),
    ("Cardiac Arrest", cardiac_arrest_websites),
    ("Asthma", asthma_websites),
    ("Glaucoma", glaucoma_websites),
    ("Leukemia", leukemia_websites),
    ("Crohns Disease", crohns_disease_websites),
]:
    wcd.update_websites_json(condition_label, crawled_sites, websites, updated_sites_path)

Scrape and annotate data for the larger document collection

In [None]:
# Rebuild the corpus from the crawler-expanded site list and annotate it.
# The model itself is run in the cells that follow.
# NOTE: this rebinds `websites` and `doc_data`, so earlier cells that read
# those names will see the larger collection if they are re-run afterwards.
expanded_sites_path = "updated_websites.json"
expanded_docs_path = "updated_doc_data.json"
expanded_annotations_path = "updated_annotated_data.json"

websites = wcd.load_json(expanded_sites_path)
wcd.scrape_websites(websites, expanded_docs_path)
doc_data = wcd.load_json(expanded_docs_path)

# `queries_labeled` comes from the initial set-up cell.
wcd.annotate_data(queries_labeled, doc_data, expanded_annotations_path)
relevance_data = wcd.load_json(expanded_annotations_path)

Run the larger model at k = 5, k = 10, and k = 20

In [19]:
# BM25 over the larger collection, evaluated at cutoff k = 5.
# `queries` is the list of query strings built in the small-collection cell.
model = BM25(doc_data)
k_value = 5
print(model.mean_avg_precision(queries, relevance_data, k_value))

# Show the k top-ranked (doc_id, score) pairs for each labelled query.
for query, label in queries_labeled:
    doc_list = [(doc_id, score) for doc_id, score in model.top_docs(query, k_value)]
    print(f"{label}: {doc_list}")

0.9037500000000002
Flu: [('Flu12', 11.884613106488445), ('Flu25', 9.717042191636448), ('Add11', 9.330352154248287), ('Flu2', 9.072388624049017), ('Flu4', 9.072388624049017)]
Covid: [('Cov10', 12.579916187228012), ('Cov12', 10.626769460697982), ('Ast10', 7.19556454479423), ('Car4', 7.19039769104845), ('Car10', 7.19039769104845)]
Diabetes: [('Dia23', 12.890966210687933), ('Dia41', 12.483063384138411), ('Add23', 9.811580862693923), ('Dia28', 9.373094296056117), ('Car8', 7.318984379107585)]
Addisons: [('Dia18', 11.150247346491817), ('Dep2', 9.638250831453469), ('Add15', 8.682918544256754), ('Leu25', 8.322779864072693), ('Cov30', 8.12514474030873)]
Depression: [('Dep23', 14.640904885620852), ('Dep20', 10.421341404711919), ('Dep10', 9.133429268887346), ('Cro2', 8.94570694727555), ('Dep9', 8.381024611756205)]
Cardiac Arrest: [('Car19', 14.314917625778714), ('Car23', 10.025191135910436), ('Car14', 8.25152719897616), ('Car1', 8.145255880655142), ('Car12', 8.145255880655142)]
Asthma: [('Ast23', 

In [20]:
# Same evaluation on the larger collection, now with cutoff k = 10.
# The model is rebuilt from `doc_data` so this cell does not rely on a
# `model` left over from another cell.
model = BM25(doc_data)
k_value = 10
print(model.mean_avg_precision(queries, relevance_data, k_value))

# Show the k top-ranked (doc_id, score) pairs for each labelled query.
for query, label in queries_labeled:
    doc_list = [(doc_id, score) for doc_id, score in model.top_docs(query, k_value)]
    print(f"{label}: {doc_list}")

0.8794133282942805
Flu: [('Flu12', 11.884613106488445), ('Flu25', 9.717042191636448), ('Add11', 9.330352154248287), ('Flu2', 9.072388624049017), ('Flu4', 9.072388624049017), ('Flu5', 9.072388624049017), ('Flu10', 9.072388624049017), ('Flu11', 9.072388624049017), ('Flu14', 9.072388624049017), ('Flu15', 9.072388624049017)]
Covid: [('Cov10', 12.579916187228012), ('Cov12', 10.626769460697982), ('Ast10', 7.19556454479423), ('Car4', 7.19039769104845), ('Car10', 7.19039769104845), ('Car17', 7.19039769104845), ('Car32', 6.8020878367231), ('Gla6', 5.847050295402177), ('Gla29', 5.78106058122299), ('Flu25', 5.780693156412719)]
Diabetes: [('Dia23', 12.890966210687933), ('Dia41', 12.483063384138411), ('Add23', 9.811580862693923), ('Dia28', 9.373094296056117), ('Car8', 7.318984379107585), ('Dep9', 6.894505030763641), ('Cro18', 6.041141700373215), ('Gla29', 5.921850809137401), ('Dep35', 5.674131064143256), ('Dep21', 5.4869268968409735)]
Addisons: [('Dia18', 11.150247346491817), ('Dep2', 9.63825083145

In [23]:
# Same evaluation on the larger collection, now with cutoff k = 20.
# The model is rebuilt from `doc_data` so this cell does not rely on a
# `model` left over from another cell.
model = BM25(doc_data)
k_value = 20
print(model.mean_avg_precision(queries, relevance_data, k_value))

# Show the k top-ranked (doc_id, score) pairs for each labelled query.
for query, label in queries_labeled:
    doc_list = [(doc_id, score) for doc_id, score in model.top_docs(query, k_value)]
    print(f"{label}: {doc_list}")

0.7782362997300725
Flu: [('Flu12', 11.884613106488445), ('Flu25', 9.717042191636448), ('Add11', 9.330352154248287), ('Flu2', 9.072388624049017), ('Flu4', 9.072388624049017), ('Flu5', 9.072388624049017), ('Flu10', 9.072388624049017), ('Flu11', 9.072388624049017), ('Flu14', 9.072388624049017), ('Flu15', 9.072388624049017), ('Flu24', 9.072388624049017), ('Flu29', 9.072388624049017), ('Car9', 7.514019890974677), ('Cov35', 7.192605043736835), ('Cov10', 6.812826100881686), ('Car8', 6.764884864847776), ('Car1', 6.731475470731808), ('Car12', 6.731475470731808), ('Flu20', 6.709025690775732), ('Car15', 6.396875797764801)]
Covid: [('Cov10', 12.579916187228012), ('Cov12', 10.626769460697982), ('Ast10', 7.19556454479423), ('Car4', 7.19039769104845), ('Car10', 7.19039769104845), ('Car17', 7.19039769104845), ('Car32', 6.8020878367231), ('Gla6', 5.847050295402177), ('Gla29', 5.78106058122299), ('Flu25', 5.780693156412719), ('Car7', 5.421114170613061), ('Ast23', 5.137587340294431), ('Ast8', 5.015591169