In [1]:
import random
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
import re
from tqdm.auto import tqdm
import torch
from sentence_transformers import SentenceTransformer, util
import textwrap
import time
import fitz
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
### Read file and preprocessing
exclude = string.punctuation

### Loại bỏ dấu câu
def remove_punctuation(text) -> str:
    return text.translate(str.maketrans('', '', exclude))

### Loại bỏ các file html
def remove_html_tags(text):
    pattern = re.compile(r"https?:\S|www\.\S")
    return pattern.sub(r'', text)

### Loại bỏ stopwords
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in stopwords.words('english'):
            new_text.append(word)
        else:
            new_text.append('')
    
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

def format_text(text: str) -> str:
    text = remove_html_tags(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

pdf_path = "human-nutrition-text.pdf"

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = format_text(text)
        pages_and_texts.append({"page_number": page_number + 1,
                                "page_char_count": len(text),
                                "page_token_count": len(text) / 4,
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),                          
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path)

1208it [00:15, 78.42it/s]


In [3]:
random.sample(pages_and_texts, k=3)

[{'page_number': 505,
  'page_char_count': 1208,
  'page_token_count': 302.0,
  'page_word_count': 240,
  'page_sentence_count_raw': 1,
  'text': 'Metabolic pathways   cell     metabolically efficient   cell  synthesize fatty acids  break      time Catabolism  food molecules begins  food enters  mouth   enzyme salivary amylase initiates  breakdown   starch  foods The entire process  digestion converts  large polymers  food  monomers    absorbed Starches  broken   monosaccharides lipids  broken   fatty acids  proteins  broken   amino acids These monomers  absorbed   bloodstream either directly    case  monosaccharides  amino acids  repackaged  intestinal cells  transport   indirect route  lymphatic vessels    case   fatty acids   fatsoluble molecules Once absorbed watersoluble nutrients first travel   liver  controls  passage   blood  transports  nutrients  cells throughout  body The fatsoluble nutrients gradually pass   lymphatic vessels  blood flowing  body cells Cells requiring energ

In [4]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_token_count,page_word_count,page_sentence_count_raw,text
0,1,28,7.0,4,1,Human Nutrition 2020 Edition
1,2,0,0.0,1,1,
2,3,295,73.75,42,1,Human Nutrition 2020 Edition UNIVERSITY OF HAW...
3,4,184,46.0,30,1,Human Nutrition 2020 Edition University Hawa...
4,5,697,174.25,116,1,Contents Preface University Hawai‘i Mānoa Fo...


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_token_count,page_word_count,page_sentence_count_raw
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,926.29,231.57,170.35,1.0
std,348.86,451.54,112.88,86.18,0.0
min,1.0,0.0,0.0,1.0,1.0
25%,302.75,613.75,153.44,107.75,1.0
50%,604.5,996.0,249.0,181.0,1.0
75%,906.25,1294.25,323.56,236.0,1.0
max,1208.0,1833.0,458.25,392.0,1.0


In [6]:
### Use nltk to partition sentence from each page

for item in tqdm(pages_and_texts):
    item["sentences"] = list(sent_tokenize(item["text"]))

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:00<00:00, 56997.96it/s]


In [7]:
random.sample(pages_and_texts, k=2)

[{'page_number': 560,
  'page_char_count': 669,
  'page_token_count': 167.25,
  'page_word_count': 134,
  'page_sentence_count_raw': 1,
  'text': 'Image  Allison Calabrese CC BY 40 One major difference  fatsoluble vitamins  water soluble vitamins   way   absorbed   body Vitamins  absorbed primarily   small intestine   bioavailability  dependent   food composition   diet Fatsoluble vitamins  absorbed along  dietary fat Therefore   meal   low  fat  absorption   fatsoluble vitamins   impaired Once fatsoluble vitamins   absorbed   small intestine   packaged  incorporated  chylomicrons along   fatty acids  transported   lymphatic system   liver Watersoluble vitamins    hand  absorbed   small intestine   transported   liver  blood vessels Figure 92 “Absorption  FatSoluble  WaterSoluble Vitamins 518 Introduction',
  'sentences': ['Image  Allison Calabrese CC BY 40 One major difference  fatsoluble vitamins  water soluble vitamins   way   absorbed   body Vitamins  absorbed primarily   small int

In [8]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_token_count,page_word_count,page_sentence_count_raw,text,sentences,page_sentence_count_spacy
0,1,28,7.0,4,1,Human Nutrition 2020 Edition,[Human Nutrition 2020 Edition],1
1,2,0,0.0,1,1,,[],0
2,3,295,73.75,42,1,Human Nutrition 2020 Edition UNIVERSITY OF HAW...,[Human Nutrition 2020 Edition UNIVERSITY OF HA...,1
3,4,184,46.0,30,1,Human Nutrition 2020 Edition University Hawa...,[Human Nutrition 2020 Edition University Haw...,1
4,5,697,174.25,116,1,Contents Preface University Hawai‘i Mānoa Fo...,[Contents Preface University Hawai‘i Mānoa F...,1


In [9]:
num_of_chunk_size = 10

def split_list(input_list: list,
               slice_size: int = num_of_chunk_size) -> list[list[str]]:
    return [input_list[i: i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
print(split_list(test_list))    

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [20, 21, 22, 23, 24]]


In [10]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list = item["sentences"],
                                         slice_size = num_of_chunk_size)

    item["page_sentence_count_split"] = len(item["sentence_chunks"])

random.sample(pages_and_texts, k=2)

100%|██████████| 1208/1208 [00:00<00:00, 895496.51it/s]


[{'page_number': 577,
  'page_char_count': 594,
  'page_token_count': 148.5,
  'page_word_count': 107,
  'page_sentence_count_raw': 1,
  'text': 'Age Group RDA mcgday UL mcgday Infant 0–6 months 10 25 Infants 6–12 months 10 25 Children 1–3 years 15 50 Children 4–8 years 15 50 Children 9–13 years 15 50 Adolescents 14–18 years 15 50 Adults 19–71 years 15 50 Adults 71 years 20 50 denotes Adequate Intake Source Ross A C et al 2011 The 2011 Report  Dietary Reference Intakes  Calcium  Vitamin D   Institute  Medicine What Clinicians Need  Know Journal  Clinical Endocrinology Metabolism 961 53–8 cbinlmnihgovpubmed 21118827 Accessed October 10 2017 Dietary Sources  Vitamin D Table 95 Vitamin D Content  Various Foods FatSoluble Vitamins 535',
  'sentences': ['Age Group RDA mcgday UL mcgday Infant 0–6 months 10 25 Infants 6–12 months 10 25 Children 1–3 years 15 50 Children 4–8 years 15 50 Children 9–13 years 15 50 Adolescents 14–18 years 15 50 Adults 19–71 years 15 50 Adults 71 years 20 50 denote

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_token_count,page_word_count,page_sentence_count_raw,page_sentence_count_spacy,page_sentence_count_split
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,926.29,231.57,170.35,1.0,0.98,0.98
std,348.86,451.54,112.88,86.18,0.0,0.15,0.15
min,1.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,302.75,613.75,153.44,107.75,1.0,1.0,1.0
50%,604.5,996.0,249.0,181.0,1.0,1.0,1.0
75%,906.25,1294.25,323.56,236.0,1.0,1.0,1.0
max,1208.0,1833.0,458.25,392.0,1.0,1.0,1.0


In [12]:
pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r".([A-Z])", r". \1", joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 34102.10it/s]


1179

In [13]:
random.sample(pages_and_chunks, k=2)

[{'page_number': 613,
  'sentence_chunk': 'pantothenic acid listed. Table 919 . Dietary. Reference. Intakes. Pantothenic. Acid “. Table 919. Dietary. Reference. Intakes. Pantothenic. Acid. Age. Group. AI. Males. Females mgday. Infants 0–6 months 17. Infants 7–12 months 18. Children 1–3 years 2. Children 4–8 years 3. Children 9–13 years 4. Adolescents 14–18 years 5. Adults 19 years 5. Micronutrient. Information. Center. Pantothenic. Acid. Oregon. State. University. Linus. Pauling. Institute lpioregonstateedumic vitaminspatothenicacid. Updated. July 2013. Accessed. October 22 2017. Dietary. Sources. Pantothenic. Acid widely distributed  types food   deficiency  nutrient rare. Pantothenic. Acid gets name  greek word “pantothen” means “from everywhere”. For pantothenic acid content various foods see. Table 920. Pantothenic. Acid. Content. Various. Foods”. Table 920. Pantothenic. Acid. Content. Various. Foods. Wate. Soluble. Vitamins 571',
  'chunk_char_count': 903,
  'chunk_token_count': 2

In [14]:
pages_and_chunks_df = pd.DataFrame(pages_and_chunks)
pages_and_chunks_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_token_count,chunk_word_count
0,1,Human. Nutrition 2020. Edition,30,7.5,4
1,3,Human. Nutrition 2020. Edition. U. I. E. S. TY...,422,105.5,129
2,4,Human. Nutrition 2020. Edition. University. Ha...,192,48.0,23
3,5,Contents. Preface. University. Hawai‘i. Mānoa....,751,187.75,89
4,6,Lifestyles. Nutrition. University. Hawai‘i. Mā...,920,230.0,111


In [15]:
min_token_len = 30

for row in pages_and_chunks_df[pages_and_chunks_df["chunk_token_count"] <= 30].sample(5).iterrows():
    print(f"Token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}.")

Token count: 3.75 | Text: Introduction 61.
Token count: 18.5 | Text: view online pressbooksoerhawaiiedu humannutrition2p354. Phytochemicals 605.
Token count: 19.75 | Text: . A. T. X. C. A. T. R 10. M. J. R. M. N. R. LS. Chapter 10. Major. Minerals 607.
Token count: 18.0 | Text: view online pressbooksoerhawaiiedu humannutrition2p130. Introduction 149.
Token count: 13.75 | Text: pressbooksoerhawaiiedu humannutrition2p364 630. Calcium.


In [16]:
pages_and_chunks_df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_token_count,chunk_word_count
count,1179.0,1179.0,1179.0,1179.0
mean,603.27,946.0,236.5,138.98
std,348.9,421.14,105.28,61.78
min,1.0,14.0,3.5,2.0
25%,300.5,642.5,160.62,95.0
50%,603.0,1018.0,254.5,150.0
75%,903.5,1285.0,321.25,187.0
max,1208.0,1786.0,446.5,286.0


In [17]:
min_token_len = 30

pages_and_chunks_over_min_token_len = pages_and_chunks_df[pages_and_chunks_df["chunk_token_count"] > min_token_len].to_dict(orient='records')

In [18]:
random.sample(pages_and_chunks_over_min_token_len, k=2)

[{'page_number': 113,
  'sentence_chunk': 'consensus probiotics ward viralinduced diarrhea reduce symptoms lactose intolerance1. Expert nutritionists agree  health benefits pre probiotics likely reach scientific consensus. As fields pre probiotic manufacturing  clinical study progress information proper dosing  exact strains bacteria potentially “friendly” become available. You may interested trying  foods  diet. A simple food try kefir. Several websites provide good recipes including efirnet recipeshtm. Kefir dairy product fermented probiotic bacteria make pleasant tasting milkshake. Figure 25. The. Human. Digestive. System 1. Farnworth. ER 2008. The. Evidence. Support. Health. Claims. Probiotics. Journal. Nutrition 1386 125. S–. S jnnutritionorgcontent1386 125. Slong. The. Digestive. System 71',
  'chunk_char_count': 763,
  'chunk_token_count': 190.75,
  'chunk_word_count': 101},
 {'page_number': 426,
  'sentence_chunk': 'Collagen. Triple. Helix. Nevit. Dilmen. CC. B. SA 30. More one

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2",
                                      device = device)

cuda


In [20]:
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

random.sample(pages_and_chunks_over_min_token_len, k=2)

100%|██████████| 1134/1134 [00:16<00:00, 67.59it/s]


[{'page_number': 275,
  'sentence_chunk': 'Circles indicate structural differences  three bound together make  sugar found milk. It later freed  digestion process. Fructose also  chemical formula glucose differs  chemical structure. The fructose ring contains 4 carbons  glucose ring contains 5 carbons. Fructose contrast glucose  energy source  cells  body. Mostly found fruits honey sugarcane fructose one  common monosaccharides nature. It also found soft drinks cereals  products sweetened high fructose corn syrup. Figure 42. Structures . Three. Most. Common. Monosaccharides. Glucose. Galactose. Fructose. Pentoses less common monosaccharides  five carbons  six. The pentoses abundant  nucleic acids. R. A. D. A also components fiber. Lastly  sugar alcohols  industrially synthesized derivatives monosaccharides. Some examples sugar alcohols sorbitol xylitol glycerol. Xylitol similar sweetness table sugar. Sugar alcohols often used place table sugar sweeten foods  incompletely digested absor

In [21]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [22]:
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, 
                                               device=device,
                                               convert_to_tensor=True) 

text_chunk_embeddings

tensor([[ 0.0668,  0.0229,  0.0166,  ..., -0.0021, -0.0546, -0.0022],
        [ 0.0484,  0.0214, -0.0070,  ..., -0.0172, -0.0287,  0.0197],
        [ 0.0364, -0.0190, -0.0139,  ...,  0.0118,  0.0028,  0.0305],
        ...,
        [ 0.0344, -0.0114, -0.0124,  ...,  0.0021, -0.0440, -0.0309],
        [ 0.0700,  0.0296, -0.0110,  ..., -0.0247, -0.0540, -0.0272],
        [ 0.0225, -0.0430, -0.0185,  ..., -0.0338, -0.0545, -0.0317]],
       device='cuda:0')

In [24]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [25]:
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")
text_chunks_and_embeddings_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_token_count,chunk_word_count,embedding
0,3,Human. Nutrition 2020. Edition. U. I. E. S. TY...,422,105.5,129,[ 6.68388307e-02 2.29047872e-02 1.65519025e-...
1,4,Human. Nutrition 2020. Edition. University. Ha...,192,48.0,23,[ 4.83906306e-02 2.14019418e-02 -6.97510410e-...
2,5,Contents. Preface. University. Hawai‘i. Mānoa....,751,187.75,89,[ 3.64077948e-02 -1.90267637e-02 -1.39340963e-...
3,6,Lifestyles. Nutrition. University. Hawai‘i. Mā...,920,230.0,111,[ 7.47999698e-02 4.42183064e-03 -5.62939234e-...
4,7,The. Cardiovascular. System. University. Hawai...,997,249.25,120,[ 4.63126265e-02 -2.90485881e-02 1.69641748e-...


In [26]:
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1134, 768])

In [27]:
text_chunks_and_embeddings_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_token_count,chunk_word_count,embedding
0,3,Human. Nutrition 2020. Edition. U. I. E. S. TY...,422,105.5,129,"[0.0668388307, 0.0229047872, 0.0165519025, -0...."
1,4,Human. Nutrition 2020. Edition. University. Ha...,192,48.0,23,"[0.0483906306, 0.0214019418, -0.0069751041, -0..."
2,5,Contents. Preface. University. Hawai‘i. Mānoa....,751,187.75,89,"[0.0364077948, -0.0190267637, -0.0139340963, -..."
3,6,Lifestyles. Nutrition. University. Hawai‘i. Mā...,920,230.0,111,"[0.0747999698, 0.00442183064, -0.00562939234, ..."
4,7,The. Cardiovascular. System. University. Hawai...,997,249.25,120,"[0.0463126265, -0.0290485881, 0.0169641748, -0..."


In [28]:
def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [29]:
### Definition for cosine computation

def dot_product(v1, v2):
    return torch.dot(v1, v2)

def cosine_similarity(v1, v2):
    dot_product = torch.dot(v1, v2)

    norm_v1 = torch.sqrt(torch.sum(v1**2))
    norm_v2 = torch.sqrt(torch.sum(v2**2))

    return dot_product / (norm_v1 * norm_v2)

In [30]:
### Definition to retrieve the answers which has the best score and indicies of those.
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                returned_results: int=5,
                                print_time: bool=True):
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    
    start_time = time.time()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = time.time()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=returned_results)

    return scores, indices

### Print the answers
def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 returned_results: int=5):
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  returned_results=returned_results)
    
    print(f"Query: {query}\n")
    print("Results:")
    
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [31]:
query = "symptoms of pellagra"

scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 1134 embeddings: 0.00025 seconds.


(tensor([0.3905, 0.2788, 0.2767, 0.2642, 0.2573], device='cuda:0'),
 tensor([ 594,  567,  900, 1060,  951], device='cuda:0'))

In [32]:
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 1134 embeddings: 0.00005 seconds.
Query: symptoms of pellagra

Results:
Score: 0.3905
car. Does drive faster  halftank gas  full one. It  matter car drives  fast long
gas. Similarly depletion. B vitamins cause problems energy metabolism   required
run metabolism  speed . Buyers. Bvitamin supplements beware. B vitamins  stored
body  excess  flushed  toilet along  extra money spent. B vitamins naturally
present numerous foods many foods enriched . In. United. States. Bvitamin
deficiencies rare however  nineteenth century vitami. B deficiencies plagued
many people. North. America. Niacin deficiency also known pellagra prominent
poorer. Americans whose main dietary staple refined cornmeal. Its symptoms
severe included diarrhea dermatitis dementia even death. Some  health
consequences pellagra  result niacin  insufficient supply support body’s
metabolic functions. Learning. Activities. Technology. Note. The second edition
. Human. Nutrition. Open. Educatio

In [33]:
query = "symptoms of obesity"

scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 1134 embeddings: 0.00006 seconds.


(tensor([0.5171, 0.4752, 0.4633, 0.4494, 0.4439], device='cuda:0'),
 tensor([504, 505, 503, 881, 481], device='cuda:0'))

In [34]:
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 1134 embeddings: 0.00006 seconds.
Query: symptoms of obesity

Results:
Score: 0.5171
Health. Risks. Being. Overweight. Being. Obese. The health consequences obesity
great contribute  one hundred thousand deaths per year . United. States.
According . C. C . United. States 201320146 • 379 adults age twenty years  obese
• 707 adults age twenty years  overweight including obese • 206 adolescents age
twelve nineteen years obese • 174 children age six eleven years obese • 94
children age two five years obese 6. Obesity. Overweigh. The. Centers. Disease.
Control. Prevention dcgovnchsfastats obesityoverweighthtm. Updated. May 3 2017.
Accessed. June 19 2017. Factors. Affecting. Energy. Expenditure 497
Page number: 539


Score: 0.4752
. S state map obesity prevalence w wwcdcgo vobesity data prevalence mapsht ml.
State. Map . Prevalence. Obesity. America. Visit dcgovobesitydataprevalence
mapshtml see prevalence selfreported obesity among. US adults 20142016. As.

In [35]:
query = "symptoms of cancer"

scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)

scores, indices

[INFO] Time taken to get scores on 1134 embeddings: 0.00006 seconds.


(tensor([0.4014, 0.3835, 0.3648, 0.3511, 0.3112], device='cuda:0'),
 tensor([1077, 1075, 1076, 1078,  530], device='cuda:0'))

In [36]:
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 1134 embeddings: 0.00009 seconds.
Query: symptoms of cancer

Results:
Score: 0.4014
Unmodifiable. Risk. Factors. Modifiable. Risk. Factors •. Age. Most cancers
occur people  age sixtyfive. However people  ages including children get cancer
•. Family history. Certain types cancer  genetic link. However environmental
factors may also play part •. Tobacco. Smoking chewing tobacco greatly increases
risk certain cancers including cancer  lungs bladder cervix kidneys mouth
pancreas •. Alcohol. Drinking alcohol linked cancers  mouth throat esophagus
breast well  cancers  neck head •. Obesity. Linked cancers  colon uterus
pancreas esophagus kidney breast •. Cooking techniques. Grilling smoking
preparing meat high temperatures forms carcinogens •. Red meat. The risk colon
cancer seems increase  consumption red meat processed meat •. Cured meats.
According  recent study  mild risk pancreatic cancer  consumption cured meats
sausage pepperoni bacon ham smoked tur