In [4]:
import time
import os
import csv
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from IPython.display import clear_output

In [5]:

def data_extractor(base_url, retries=3, delay=5):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    diagnosis_treatment_link = ""
    doctors_departments_link = ""

    for attempt in range(retries):
        try:
            response = requests.get(base_url, headers=headers, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract Diagnosis & Treatment link
            content1 = soup.find('a', id="et_genericNavigation_diagnosis-treatment")
            if not content1:
                # fallback: search by link text containing both words
                for a in soup.find_all('a'):
                    link_text = a.get_text(separator=' ').strip().lower()
                    if "diagnosis" in link_text and "treatment" in link_text:
                        content1 = a
                        break
            if content1:
                href1 = content1.get('href')
                diagnosis_treatment_link = f"https://www.mayoclinic.org{href1}" if href1 and href1.startswith("/") else href1

            # Extract Doctors & Departments link
            content2 = soup.find('a', id="et_genericNavigation_doctors-departments")
            if not content2:
                # fallback: search by link text containing both words
                for a in soup.find_all('a'):
                    link_text = a.get_text(separator=' ').strip().lower()
                    if "doctors" in link_text and "departments" in link_text:
                        content2 = a
                        break
            if content2:
                href2 = content2.get('href')
                doctors_departments_link = f"https://www.mayoclinic.org{href2}" if href2 and href2.startswith("/") else href2

            break  # success, exit retry loop

        except requests.exceptions.RequestException as e:
            print(f"[Attempt {attempt + 1}] Error fetching {base_url}: {e}")
            if attempt < retries - 1:
                time.sleep(delay)

    return diagnosis_treatment_link, doctors_departments_link

def web_scraping(base_url):
    # Define the expected headers in order
    expected_headers = ["disease", "main_link", "Diagnosis_treatment_link", "Doctors_departments_link"]
    
    # Check if file exists and read existing headers if it does
    file_exists = os.path.isfile("mayo_diseases.csv")
    existing_headers = []
    
    if file_exists:
        with open("mayo_diseases.csv", "r", encoding="utf-8") as file:
            reader = csv.reader(file)
            existing_headers = next(reader, [])
    
    # Determine if we need to write headers
    write_headers = not file_exists or existing_headers != expected_headers
    
    # Get the webpage content
    response = requests.get(base_url)
    if response.status_code != 200:
        print("Failed to retrieve page")
        exit()

    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.select(".cmp-results-with-primary-name__see-link, .cmp-results-with-primary-name a")

    with open("mayo_diseases.csv", "a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        
        # Write headers if needed
        if write_headers:
            writer.writerow(expected_headers)
        
        for item in tqdm(items, desc="Scraping Diseases"):
            disease_name = item.text.strip()
            main_link = f"https://www.mayoclinic.org{item['href']}" if item['href'].startswith("/") else item['href']

            link1, link2 = data_extractor(main_link)
            
            # Create a row with all expected columns
            row_data = {
                "disease": disease_name,
                "main_link": main_link,
                "Diagnosis_treatment_link": link1,
                "Doctors_departments_link": link2
            }
            
            # If appending to existing file with different headers, align data with existing headers
            if file_exists and existing_headers:
                row = [row_data.get(header, "") for header in existing_headers]
            else:
                row = [row_data[header] for header in expected_headers]
            
            writer.writerow(row)

    print("Scraping Completed! Data Saved")

# Example usage:
# web_scraping("https://www.mayoclinic.org/diseases-conditions")

In [6]:
base_url = f"https://www.mayoclinic.org/diseases-conditions/index?letter=A"
web_scraping(base_url) 


Scraping Diseases: 100%|██████████| 132/132 [00:48<00:00,  2.75it/s]

Scraping Completed! Data Saved





In [20]:
from bs4 import BeautifulSoup


base_url="https://www.mayoclinic.org/diseases-conditions/egg-allergy/symptoms-causes/syc-20372115"
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }
response = requests.get(base_url, headers=headers, timeout=20)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
extraction_list = ["overview","symptoms","when-to-see-a-doctor","causes","complications","prevention","risk-factors"]

for i in extraction_list:
# Find the overview section by its aria-labelledby attribute
   overview_section = soup.find('section', {'aria-labelledby': i})

# Extract all paragraph text from the overview content
   overview_content = overview_section.find('div', class_='cmp-text__rich-content')
   overview_paragraphs = [p.get_text() for p in overview_content.find_all('p')]

# Join paragraphs into a single string
   overview_text = '\n\n'.join(overview_paragraphs)
   print(i)
   print(overview_text)

overview
Eggs are one of the most common allergy-causing foods for children.

Egg allergy symptoms usually occur a few minutes to a few hours after eating eggs or foods containing eggs. Signs and symptoms range from mild to severe and can include skin rashes, hives, nasal congestion, and vomiting or other digestive problems. Rarely, egg allergy can cause anaphylaxis — a life-threatening reaction.

Egg allergy can occur as early as infancy. Most children, but not all, outgrow their egg allergy before adolescence.
symptoms
Egg allergy reactions vary from person to person and usually occur soon after exposure to egg. Egg allergy symptoms can include:
when-to-see-a-doctor
See a doctor if you or your child has signs or symptoms of a food allergy shortly after eating eggs or an egg-containing product. If possible, see the doctor when the allergic reaction is occurring. This may help in making a diagnosis.

If you or your child has signs and symptoms of anaphylaxis, seek immediate emergency t

In [37]:
from bs4 import BeautifulSoup

def extract_overview(html_content):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }
    response = requests.get(base_url, headers=headers, timeout=20)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the Overview section - looking for h2 with text "Overview"
    overview_header = soup.find(lambda tag: tag.name == 'h2' and 'when-to-see-a-doctor"' in tag.text)
    
    if not overview_header:
        return "Overview section not found"
    
    # Initialize overview paragraphs
    overview_paragraphs = []
    
    # Get all siblings after the h2 until the next h2
    for sibling in overview_header.find_next_siblings():
        if sibling.name == 'h2':
            break  # Stop when we reach the next section
        if sibling.name == 'p':
            overview_paragraphs.append(sibling.get_text(strip=True))
    
    if not overview_paragraphs:
        return "No overview content found"
    
    return '\n\n'.join(overview_paragraphs)

# Example usage:
base_url="https://www.mayoclinic.org/diseases-conditions/hyperhidrosis/symptoms-causes/syc-20367152"

html_content = """ (your HTML content here) """
overview_text = extract_overview(base_url)
print(overview_text)

Overview section not found


In [43]:
from bs4 import BeautifulSoup
import requests

def extract_mayo_clinic_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        main_content = soup.find('div', class_='content')
        if not main_content:
            return {"error": "Main content not found"}
        
        sections = {
            "Overview": None,
            "Symptoms": None,
            "When to see a doctor": None,
            "Causes": None,
            "Risk factors": None,
            "Complications": None,
            "Prevention": None
        }
        
        # Find all headings (h2 and h3)
        headings = main_content.find_all(['h2', 'h3'])
        
        for heading in headings:
            heading_text = heading.get_text(strip=True)
            
            # Handle the typo "dotor" in the HTML
            if "When to see a dotor" in heading_text:
                heading_text = "When to see a doctor"
                
            if heading_text in sections:
                content = []
                next_node = heading.next_sibling
                
                # Collect content until next heading
                while next_node and next_node.name not in ['h2', 'h3']:
                    if next_node.name == 'p':
                        content.append(next_node.get_text(strip=True))
                    elif next_node.name in ['ul', 'ol']:
                        items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                        content.extend(items)
                    next_node = next_node.next_sibling
                
                sections[heading_text] = '\n'.join(content) if content else "No content found"
        
        return {k: v for k, v in sections.items() if v is not None}
    
    except requests.exceptions.RequestException as e:
        return {"error": f"Failed to fetch page: {str(e)}"}

# Example usage:
url = "https://www.mayoclinic.org/diseases-conditions/hyperhidrosis/symptoms-causes/syc-20367152"
sections = extract_mayo_clinic_sections(url)

for section_name, content in sections.items():
    print(f"=== {section_name.upper()} ===")
    print(content)
    print("\n" + "="*50 + "\n")

=== OVERVIEW ===
Hyperhidrosis (hi-pur-hi-DROE-sis) is excessive sweating that's not always related to heat or exercise. You may sweat so much that it soaks through your clothes or drips off your hands. Heavy sweating can disrupt your day and cause social anxiety and embarrassment.
Hyperhidrosis treatment usually helps. It often begins with antiperspirants. If these don't help, you may need to try different medications and therapies. In severe cases, your health care provider may suggest surgery to remove the sweat glands or to disconnect the nerves related to producing too much sweat.
Sometimes an underlying condition may be found and treated.


=== SYMPTOMS ===
The main symptom of hyperhidrosis is heavy sweating. This goes beyond the sweating from being in a hot environment, exercising, or feeling anxious or stressed. The type of hyperhidrosis that usually affects the hands, feet, underarms or face causes at least one episode a week when you're awake. And the sweating usually happens

In [3]:
import csv
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

def extract_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Try different possible main content containers
        main_content = soup.find('div', class_='content') or soup.find('article', id='main-content')
        if not main_content:
            return {}  # Don't return error; just skip

        # Section headings you want
        sections = {
            "Overview": None,
            "Symptoms": None,
            "When to see a doctor": None,
            "Causes": None,
            "Risk factors": None,
            "Complications": None,
            "Prevention": None
        }

        # Find all headings inside main content
        headings = main_content.find_all(['h2', 'h3'])
        for idx, heading in enumerate(headings):
            heading_text = heading.get_text(strip=True)
            # Fix potential typo in heading
            if "When to see a dotor" in heading_text:
                heading_text = "When to see a doctor"
            # If it's a section we care about
            if heading_text in sections:
                # Gather all paragraph and list content until next heading
                content = []
                next_node = heading.find_next_sibling()
                while next_node and next_node.name not in ['h2', 'h3']:
                    if next_node.name == 'p':
                        content.append(next_node.get_text(strip=True))
                    elif next_node.name in ['ul', 'ol']:
                        items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                        content.extend(items)
                    next_node = next_node.find_next_sibling()
                if content:
                    sections[heading_text] = '\n'.join(content)
        # Only return sections with content
        return {k: v for k, v in sections.items() if v}
    except Exception as e:
        # Just return empty dict for error
        return {}

def update_csv_with_sections(csv_file):
    # Read existing data and headers
    rows = []
    existing_headers = []
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            existing_headers = reader.fieldnames
            rows = list(reader)

    # List of desired section headers
    section_headers = [
        'Overview', 'Symptoms', 'When to see a doctor',
        'Causes', 'Risk factors', 'Complications', 'Prevention'
    ]
    # Columns from csv plus any missing section headers
    all_headers = existing_headers.copy() if existing_headers else ['disease', 'main_link']
    for header in section_headers:
        if header not in all_headers:
            all_headers.append(header)

    # Process each row and update with extracted sections
    updated_rows = []
    for row in tqdm(rows):
        # Only process if main_link exists
        if 'main_link' in row and row['main_link']:
            clear_output(wait=True)
            print(f"Processing: {row.get('disease', 'Unknown')}")
            sections = extract_sections(row['main_link'])
            # Update row with whatever we could extract
            for section, content in sections.items():
                row[section] = content
        updated_rows.append(row)

    # Write back to CSV with updated headers and rows
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=all_headers)
        writer.writeheader()
        writer.writerows(updated_rows)
    print(f"CSV file updated successfully with {len(updated_rows)} rows")

# Example usage
if __name__ == "__main__":
    csv_file = "mayo_diseases.csv"
    update_csv_with_sections(csv_file)

Processing: Atrioventricular nodal reentry tachycardia (AVNRT)


100%|██████████| 132/132 [01:09<00:00,  1.90it/s]

CSV file updated successfully with 132 rows





In [7]:
import csv
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

# Section slugs for aria-labelledby (new design) and pretty names
SECTION_SLUGS = [
    ("overview", "Overview"),
    ("symptoms", "Symptoms"),
    ("when-to-see-a-doctor", "When to see a doctor"),
    ("causes", "Causes"),
    ("risk-factors", "Risk factors"),
    ("complications", "Complications"),
    ("prevention", "Prevention"),
]

def extract_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        result_sections = {}

        # Try method 1: Old layout (div.content or article#main-content)
        main_content = soup.find('div', class_='content') or soup.find('article', id='main-content')
        if main_content:
            headings = main_content.find_all(['h2', 'h3'])
            for heading in headings:
                heading_text = heading.get_text(strip=True)
                # Fix typo
                if "When to see a dotor" in heading_text:
                    heading_text = "When to see a doctor"
                # If it's a section we care about
                for _, section_name in SECTION_SLUGS:
                    if heading_text == section_name:
                        content = []
                        next_node = heading.find_next_sibling()
                        while next_node and next_node.name not in ['h2', 'h3']:
                            if next_node.name == 'p':
                                content.append(next_node.get_text(strip=True))
                            elif next_node.name in ['ul', 'ol']:
                                items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                                content.extend(items)
                            next_node = next_node.find_next_sibling()
                        if content:
                            result_sections[section_name] = '\n'.join(content)
        # Try method 2: New layout (section[aria-labelledby] + cmp-text__rich-content)
        for slug, section_name in SECTION_SLUGS:
            if section_name in result_sections:
                continue  # Already found by old method
            # Find aria-labelledby section
            section = soup.find('section', {'aria-labelledby': slug})
            if section:
                content_div = section.find('div', class_='cmp-text__rich-content')
                if content_div:
                    paragraphs = [p.get_text(strip=True) for p in content_div.find_all('p')]
                    if paragraphs:
                        result_sections[section_name] = '\n\n'.join(paragraphs)
        return result_sections
    except Exception as e:
        # Skip on error, return empty
        return {}

def update_csv_with_sections(csv_file):
    # Read existing data and headers
    rows = []
    existing_headers = []
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            existing_headers = reader.fieldnames
            rows = list(reader)

    section_headers = [section for slug, section in SECTION_SLUGS]
    all_headers = existing_headers.copy() if existing_headers else ['disease', 'main_link']
    for header in section_headers:
        if header not in all_headers:
            all_headers.append(header)

    # --- SINGLE tqdm bar for ALL rows ---
    updated_rows = []
    with tqdm(total=len(rows), desc="Processing diseases", unit="disease") as pbar:
        for row in rows:
            if 'main_link' in row and row['main_link']:
                clear_output(wait=True)
                # (Optional) tqdm.write() to log messages without disrupting the bar
                tqdm.write(f"Processing: {row.get('disease', 'Unknown')}")
                sections = extract_sections(row['main_link'])
                for section, content in sections.items():
                    row[section] = content
            updated_rows.append(row)
            pbar.update(1)  # Always update once per disease

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=all_headers)
        writer.writeheader()
        writer.writerows(updated_rows)
    print(f"CSV file updated successfully with {len(updated_rows)} rows")

# Example usage
if __name__ == "__main__":
    csv_file = "mayo_diseases.csv"
    update_csv_with_sections(csv_file)

Processing diseases:  99%|█████████▉| 131/132 [00:45<00:00,  3.71disease/s]

Processing: Atrioventricular nodal reentry tachycardia (AVNRT)


Processing diseases: 100%|██████████| 132/132 [00:46<00:00,  2.86disease/s]

CSV file updated successfully with 132 rows





In [8]:
import csv
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

# Section slugs for aria-labelledby (new design) and pretty names
SECTION_SLUGS = [
    ("diagnosis", "Diagnosis"),
    ("treatment", "Treatment"),
    ("coping-and-support", "Coping and support"),
    ("preparing-for-your-appointment", "Preparing for your appointment"),
    ("lifestyle-and-home-remedies", "Lifestyle and home remedies")
]

def extract_sections(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        result_sections = {}

        # Try method 1: Old layout (div.content or article#main-content)
        main_content = soup.find('div', class_='content') or soup.find('article', id='main-content')
        if main_content:
            headings = main_content.find_all(['h2', 'h3'])
            for heading in headings:
                heading_text = heading.get_text(strip=True)
                # Fix typo
                if "When to see a dotor" in heading_text:
                    heading_text = "When to see a doctor"
                # If it's a section we care about
                for _, section_name in SECTION_SLUGS:
                    if heading_text == section_name:
                        content = []
                        next_node = heading.find_next_sibling()
                        while next_node and next_node.name not in ['h2', 'h3']:
                            if next_node.name == 'p':
                                content.append(next_node.get_text(strip=True))
                            elif next_node.name in ['ul', 'ol']:
                                items = [li.get_text(strip=True) for li in next_node.find_all('li')]
                                content.extend(items)
                            next_node = next_node.find_next_sibling()
                        if content:
                            result_sections[section_name] = '\n'.join(content)
        # Try method 2: New layout (section[aria-labelledby] + cmp-text__rich-content)
        for slug, section_name in SECTION_SLUGS:
            if section_name in result_sections:
                continue  # Already found by old method
            # Find aria-labelledby section
            section = soup.find('section', {'aria-labelledby': slug})
            if section:
                content_div = section.find('div', class_='cmp-text__rich-content')
                if content_div:
                    paragraphs = [p.get_text(strip=True) for p in content_div.find_all('p')]
                    if paragraphs:
                        result_sections[section_name] = '\n\n'.join(paragraphs)
        return result_sections
    except Exception as e:
        # Skip on error, return empty
        return {}

def update_csv_with_sections(csv_file):
    # Read existing data and headers
    rows = []
    existing_headers = []
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            existing_headers = reader.fieldnames
            rows = list(reader)

    section_headers = [section for slug, section in SECTION_SLUGS]
    all_headers = existing_headers.copy() if existing_headers else ['disease', 'main_link']
    for header in section_headers:
        if header not in all_headers:
            all_headers.append(header)

    # --- SINGLE tqdm bar for ALL rows ---
    updated_rows = []
    with tqdm(total=len(rows), desc="Processing diseases", unit="disease") as pbar:
        for row in rows:
            if 'main_link' in row and row['Diagnosis_treatment_link']:
                clear_output(wait=True)
                # (Optional) tqdm.write() to log messages without disrupting the bar
                tqdm.write(f"Processing: {row.get('disease', 'Unknown')}")
                sections = extract_sections(row['Diagnosis_treatment_link'])
                for section, content in sections.items():
                    row[section] = content
            updated_rows.append(row)
            pbar.update(1)  # Always update once per disease

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=all_headers)
        writer.writeheader()
        writer.writerows(updated_rows)
    print(f"CSV file updated successfully with {len(updated_rows)} rows")

# Example usage
if __name__ == "__main__":
    csv_file = "mayo_diseases.csv"
    update_csv_with_sections(csv_file)

Processing diseases:  99%|█████████▉| 131/132 [01:15<00:00,  2.60disease/s]

Processing: Atrioventricular nodal reentry tachycardia (AVNRT)


Processing diseases: 100%|██████████| 132/132 [01:16<00:00,  1.74disease/s]

CSV file updated successfully with 132 rows





In [31]:
extraction_list = ["overview","symptoms","when-to-see-a-doctor","causes","complications","prevention","risk-factors"]

In [11]:
import pandas as pd

In [12]:
df = pd.read_csv("mayo_diseases.csv")

In [14]:
df.shape

(132, 16)

In [13]:
df.isnull().sum()

disease                            0
main_link                          0
Diagnosis_treatment_link           1
Doctors_departments_link          25
Overview                           0
Symptoms                           2
When to see a doctor              12
Causes                             2
Risk factors                       2
Complications                     22
Prevention                        52
Diagnosis                          1
Treatment                          1
Coping and support                76
Preparing for your appointment     6
Lifestyle and home remedies       81
dtype: int64

In [15]:
df.head()

Unnamed: 0,disease,main_link,Diagnosis_treatment_link,Doctors_departments_link,Overview,Symptoms,When to see a doctor,Causes,Risk factors,Complications,Prevention,Diagnosis,Treatment,Coping and support,Preparing for your appointment,Lifestyle and home remedies
0,Atrial fibrillation,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Atrial fibrillation (AFib) is an irregular and...,Symptoms ofAFibmay include:\nFeelings of a fas...,"If you have symptoms of atrial fibrillation, m...",To understand the causes of atrial fibrillatio...,Things that can increase the risk of atrial fi...,Blood clots are a dangerous complication of at...,Healthy lifestyle choices can reduce the risk ...,You may not know you have atrial fibrillation ...,The goals of atrial fibrillation treatment are...,,If you have an irregular or pounding heartbeat...,Following a heart-healthy lifestyle can help p...
1,Hyperhidrosis,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Hyperhidrosis (hi-pur-hi-DROE-sis) is excessiv...,The main symptom of hyperhidrosis is heavy swe...,Sometimes excessive sweating is a sign of a se...,Sweating is the body's mechanism to cool itsel...,Risk factors for hyperhidrosis include:\nHavin...,Complications of hyperhidrosis include:\nInfec...,,Diagnosing hyperhidrosis may start with your h...,Treating hyperhidrosis may start with treating...,Hyperhidrosis can be the cause of discomfort a...,You may start by seeing your primary care prov...,The following suggestions may help control swe...
2,Bartholin's cyst,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,The Bartholin's (BAHR-toe-linz) glands are loc...,"If you have a small, noninfected Bartholin's c...",Call your doctor if you have a painful lump ne...,Experts believe that the cause of a Bartholin'...,,A Bartholin's cyst or abscess may recur and ag...,There's no way to prevent a Bartholin's cyst. ...,"To diagnose a Bartholin's cyst, your doctor ma...",Often a Bartholin's cyst requires no treatment...,,Your first appointment will likely be with eit...,
3,Infant reflux,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,,Infant reflux is when a baby spits up liquid o...,"Most of the time, infant reflux isn't a cause ...",See a healthcare professional if a baby:\nIsn'...,"In infants, the ring of muscle between the eso...",Infant reflux is common. But some things make ...,Infant reflux usually gets better on its own. ...,,"To diagnose infant reflux, a healthcare profes...","For most babies, making some changes to feedin...",,You may start by seeing your baby's primary he...,To minimize reflux:\nFeed your baby in an upri...
4,Hidradenitis suppurativa,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Hidradenitis suppurativa (hi-drad-uh-NIE-tis s...,Hidradenitis suppurativa can affect one or sev...,Early diagnosis of hidradenitis suppurativa is...,Hidradenitis suppurativa develops when hair fo...,Factors that increase your chance of developin...,Persistent and severe hidradenitis suppurativa...,,Hidradenitis suppurativa can be mistaken for p...,"Treatment with medicines, surgery or both can ...",Hidradenitis suppurativa can be a challenge to...,You'll likely first see your primary care prov...,Mild hidradenitis suppurativa can sometimes be...


In [22]:
df["Symptoms"][0]

"Symptoms ofAFibmay include:\nFeelings of a fast, fluttering or pounding heartbeat, called palpitations.\nChest pain.\nDizziness.\nFatigue.\nLightheadedness.\nReduced ability to exercise.\nShortness of breath.\nWeakness.\nSome people with atrial fibrillation (AFib) don't notice any symptoms.\nAtrial fibrillation may be:\nOccasional, also called paroxysmal atrial fibrillation.AFibsymptoms come and go. The symptoms usually last for a few minutes to hours. Some people have symptoms for as long as a week. The episodes can happen repeatedly. Symptoms might go away on their own. Some people with occasionalAFibneed treatment.\nPersistent.The irregular heartbeat is constant. The heart rhythm does not reset on its own. If symptoms occur, medical treatment is needed to correct the heart rhythm.\nLong-standing persistent.This type ofAFibis constant and lasts longer than 12 months. Medicines or a procedure are needed to correct the irregular heartbeat.\nPermanent.In this type of atrial fibrillatio

In [20]:
import nltk

In [21]:
symptoms  = df["Symptoms"]

In [None]:
symptoms["symptoms"] = symptoms["symptoms"].

In [23]:
import requests
import json

In [24]:
import os

# Get the API key from the environment variable
api_key = os.getenv("GROQ_API_KEY")

# Optional: Check if it's loaded
if api_key:
    print("API Key Loaded Successfully:", api_key[:10] + "*****")
else:
    print("API Key not found. Check if the environment variable is set.")


API Key Loaded Successfully: gsk_7rEOxb*****


In [25]:
url = "https://api.groq.com/openai/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

In [26]:
text = "Symptoms ofAFibmay include:\nFeelings of a fast, fluttering or pounding heartbeat, called palpitations.\nChest pain.\nDizziness.\nFatigue.\nLightheadedness.\nReduced ability to exercise.\nShortness of breath.\nWeakness.\nSome people with atrial fibrillation (AFib) don't notice any symptoms.\nAtrial fibrillation may be:\nOccasional, also called paroxysmal atrial fibrillation.AFibsymptoms come and go. The symptoms usually last for a few minutes to hours. Some people have symptoms for as long as a week. The episodes can happen repeatedly. Symptoms might go away on their own. Some people with occasionalAFibneed treatment.\nPersistent.The irregular heartbeat is constant. The heart rhythm does not reset on its own. If symptoms occur, medical treatment is needed to correct the heart rhythm.\nLong-standing persistent.This type ofAFibis constant and lasts longer than 12 months. Medicines or a procedure are needed to correct the irregular heartbeat.\nPermanent.In this type of atrial fibrillation, the irregular heart rhythm can't be reset. Medicines are needed to control the heart rate and to prevent blood clots."

data = {
    "model": "llama-3.3-70b-versatile",
    "messages": [
        {
            "role": "user",
            "content": f"""
You are a helpful and expert **medical assistant AI**. You read medical or health-related texts and explain them in clear, friendly, and simplified language for non-medical users.

Here is a health-related text a user might find hard to understand:

\"\"\"{text}\"\"\"

Your job is to extract and present the following:

1. A user-friendly **summary of all symptoms** mentioned in the text (bullet point format).
2. Break down and **explain any types or stages** of the condition (if present), in plain English.
3. Provide a short and clear **educational paragraph** on the condition that includes what it is, its symptoms, and when to seek medical help.
4. Output everything in **structured JSON** format with the following fields:
   - `symptoms`: A list of clear symptoms in bullet point format
   - `types`: A list explaining any forms or categories (like “Occasional”, “Persistent”, etc.)
   - `summary`: A friendly paragraph explaining the condition and what the user should know

Make sure your output is simple, empathetic, and helpful — written in a tone similar to a nurse explaining something gently to a patient.
"""
        }
    ]
}

In [29]:
response = requests.post(url, headers=headers, data=json.dumps(data))

# ✅ Print the AI response
if response.status_code == 200:
    result = response.json()
    print("AI Response:\n")
    print(result['choices'][0]['message']['content'])
else:
    print("Error:", response.status_code)
    print(response.text)

AI Response:

```json
{
  "symptoms": [
    "Feelings of a fast, fluttering or pounding heartbeat (palpitations)",
    "Chest pain",
    "Dizziness",
    "Fatigue",
    "Lightheadedness",
    "Reduced ability to exercise",
    "Shortness of breath",
    "Weakness"
  ],
  "types": [
    "Occasional (paroxysmal): Symptoms come and go, lasting from a few minutes to hours or even a week, and may go away on their own.",
    "Persistent: The irregular heartbeat is constant and doesn't reset on its own, requiring medical treatment.",
    "Long-standing persistent: This type lasts longer than 12 months and requires medicine or a procedure to correct the heartbeat.",
    "Permanent: The irregular heartbeat can't be reset, and medicine is needed to control the heart rate and prevent blood clots."
  ],
  "summary": "Atrial fibrillation, also known as AFib, is a heart condition where the heartbeat becomes irregular. It can cause a range of symptoms, including palpitations, chest pain, dizziness, a

In [2]:
import tensorflow as tf
import numpy as np
import PIL
import keras

print("tensorFlow==", tf.__version__)
print("numpy==", np.__version__)
print("pillow==", PIL.__version__)
print("keras==", keras.__version__)


tensorFlow== 2.19.0
numpy== 2.0.2
pillow== 11.0.0
keras== 3.10.0


In [39]:
import pandas as pd

In [40]:
df = pd.read_csv("mayo_diseases.csv")

In [41]:
df.head()

Unnamed: 0,disease,main_link,Diagnosis_treatment_link,Doctors_departments_link,Overview,Symptoms,When to see a doctor,Causes,Risk factors,Complications,Prevention,Diagnosis,Treatment,Coping and support,Preparing for your appointment,Lifestyle and home remedies
0,Atrial fibrillation,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Atrial fibrillation (AFib) is an irregular and...,Symptoms ofAFibmay include:\nFeelings of a fas...,"If you have symptoms of atrial fibrillation, m...",To understand the causes of atrial fibrillatio...,Things that can increase the risk of atrial fi...,Blood clots are a dangerous complication of at...,Healthy lifestyle choices can reduce the risk ...,You may not know you have atrial fibrillation ...,The goals of atrial fibrillation treatment are...,,If you have an irregular or pounding heartbeat...,Following a heart-healthy lifestyle can help p...
1,Hyperhidrosis,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Hyperhidrosis (hi-pur-hi-DROE-sis) is excessiv...,The main symptom of hyperhidrosis is heavy swe...,Sometimes excessive sweating is a sign of a se...,Sweating is the body's mechanism to cool itsel...,Risk factors for hyperhidrosis include:\nHavin...,Complications of hyperhidrosis include:\nInfec...,,Diagnosing hyperhidrosis may start with your h...,Treating hyperhidrosis may start with treating...,Hyperhidrosis can be the cause of discomfort a...,You may start by seeing your primary care prov...,The following suggestions may help control swe...
2,Bartholin's cyst,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,The Bartholin's (BAHR-toe-linz) glands are loc...,"If you have a small, noninfected Bartholin's c...",Call your doctor if you have a painful lump ne...,Experts believe that the cause of a Bartholin'...,,A Bartholin's cyst or abscess may recur and ag...,There's no way to prevent a Bartholin's cyst. ...,"To diagnose a Bartholin's cyst, your doctor ma...",Often a Bartholin's cyst requires no treatment...,,Your first appointment will likely be with eit...,
3,Infant reflux,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,,Infant reflux is when a baby spits up liquid o...,"Most of the time, infant reflux isn't a cause ...",See a healthcare professional if a baby:\nIsn'...,"In infants, the ring of muscle between the eso...",Infant reflux is common. But some things make ...,Infant reflux usually gets better on its own. ...,,"To diagnose infant reflux, a healthcare profes...","For most babies, making some changes to feedin...",,You may start by seeing your baby's primary he...,To minimize reflux:\nFeed your baby in an upri...
4,Hidradenitis suppurativa,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Hidradenitis suppurativa (hi-drad-uh-NIE-tis s...,Hidradenitis suppurativa can affect one or sev...,Early diagnosis of hidradenitis suppurativa is...,Hidradenitis suppurativa develops when hair fo...,Factors that increase your chance of developin...,Persistent and severe hidradenitis suppurativa...,,Hidradenitis suppurativa can be mistaken for p...,"Treatment with medicines, surgery or both can ...",Hidradenitis suppurativa can be a challenge to...,You'll likely first see your primary care prov...,Mild hidradenitis suppurativa can sometimes be...


In [42]:
symptoms = df["Symptoms"]

In [46]:
sample = symptoms[2]

In [50]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

entities = ner(sample)

for e in entities:
    print(e['word'], e['entity_group'])


Device set to use cpu


non Detailed_description
cy Sign_symptom
##st Coreference
cy Sign_symptom
lump Sign_symptom
mass Sign_symptom
va Biological_structure
cyst Sign_symptom
full Detailed_description
blown Detailed_description
cy Sign_symptom
cy Sign_symptom
cyst Sign_symptom
abscess Sign_symptom


In [47]:
import re

def extract_symptoms(text):
    # Split into lines
    lines = text.split("\n")
    # Keep lines that are short symptom-like phrases
    symptoms = []
    for line in lines:
        line = line.strip()
        if line and len(line.split()) <= 10:  # filter short lines (likely symptoms)
            # Clean extra characters
            line = re.sub(r"^[•\-–\d.\s]*", "", line)  # remove bullets or numbering
            symptoms.append(line)
    return symptoms


In [48]:
extract_symptoms(sample)

[]

In [49]:
sample

"If you have a small, noninfected Bartholin's cyst, you may not notice it. If the cyst grows, you might feel a lump or mass near your vaginal opening. Although a cyst is usually painless, it can be tender.\n\nA full-blown infection of a Bartholin's cyst can occur in a matter of days. If the cyst becomes infected, you may experience:\n\nA Bartholin's cyst or abscess typically occurs on only one side of the vaginal opening."

In [2]:
import os
import sys
import pandas as pd
from tqdm import tqdm

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document


In [10]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from backend.utils.text_cleaning import Text_Preprocessing
from backend.utils.filtering_with_ner import RemoveUselessWords


In [3]:
csv_path = "mayo_diseases.csv"

df = pd.read_csv(csv_path)
print("Loaded CSV")
df.head()


Loaded CSV


Unnamed: 0,disease,main_link,Diagnosis_treatment_link,Doctors_departments_link,Overview,Symptoms,When to see a doctor,Causes,Risk factors,Complications,Prevention,Diagnosis,Treatment,Coping and support,Preparing for your appointment,Lifestyle and home remedies
0,Atrial fibrillation,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Atrial fibrillation (AFib) is an irregular and...,Symptoms ofAFibmay include:\nFeelings of a fas...,"If you have symptoms of atrial fibrillation, m...",To understand the causes of atrial fibrillatio...,Things that can increase the risk of atrial fi...,Blood clots are a dangerous complication of at...,Healthy lifestyle choices can reduce the risk ...,You may not know you have atrial fibrillation ...,The goals of atrial fibrillation treatment are...,,If you have an irregular or pounding heartbeat...,Following a heart-healthy lifestyle can help p...
1,Hyperhidrosis,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Hyperhidrosis (hi-pur-hi-DROE-sis) is excessiv...,The main symptom of hyperhidrosis is heavy swe...,Sometimes excessive sweating is a sign of a se...,Sweating is the body's mechanism to cool itsel...,Risk factors for hyperhidrosis include:\nHavin...,Complications of hyperhidrosis include:\nInfec...,,Diagnosing hyperhidrosis may start with your h...,Treating hyperhidrosis may start with treating...,Hyperhidrosis can be the cause of discomfort a...,You may start by seeing your primary care prov...,The following suggestions may help control swe...
2,Bartholin's cyst,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,The Bartholin's (BAHR-toe-linz) glands are loc...,"If you have a small, noninfected Bartholin's c...",Call your doctor if you have a painful lump ne...,Experts believe that the cause of a Bartholin'...,,A Bartholin's cyst or abscess may recur and ag...,There's no way to prevent a Bartholin's cyst. ...,"To diagnose a Bartholin's cyst, your doctor ma...",Often a Bartholin's cyst requires no treatment...,,Your first appointment will likely be with eit...,
3,Infant reflux,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,,Infant reflux is when a baby spits up liquid o...,"Most of the time, infant reflux isn't a cause ...",See a healthcare professional if a baby:\nIsn'...,"In infants, the ring of muscle between the eso...",Infant reflux is common. But some things make ...,Infant reflux usually gets better on its own. ...,,"To diagnose infant reflux, a healthcare profes...","For most babies, making some changes to feedin...",,You may start by seeing your baby's primary he...,To minimize reflux:\nFeed your baby in an upri...
4,Hidradenitis suppurativa,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,https://www.mayoclinic.org/diseases-conditions...,Hidradenitis suppurativa (hi-drad-uh-NIE-tis s...,Hidradenitis suppurativa can affect one or sev...,Early diagnosis of hidradenitis suppurativa is...,Hidradenitis suppurativa develops when hair fo...,Factors that increase your chance of developin...,Persistent and severe hidradenitis suppurativa...,,Hidradenitis suppurativa can be mistaken for p...,"Treatment with medicines, surgery or both can ...",Hidradenitis suppurativa can be a challenge to...,You'll likely first see your primary care prov...,Mild hidradenitis suppurativa can sometimes be...


In [12]:
from tqdm import tqdm
tqdm.pandas(desc="🧼 Cleaning Symptoms")

# Initialize processing classes
text_cleaner = Text_Preprocessing()
ner_filter = RemoveUselessWords()

# Step 1: Clean the raw symptom text
df["symptoms_cleaned"] = df["Symptoms"].progress_apply(lambda x: text_cleaner.go_on(x))

# Step 2: Apply NER filtering to remove unimportant entities
df["symptoms_filtered"] = df["symptoms_cleaned"].progress_apply(lambda x: ner_filter.process_entities(x))

# Step 3: Convert list of symptoms to a single space-separated string (for embedding)
df["symptoms_main"] = df["symptoms_filtered"].progress_apply(lambda x: " ".join(x))

# Show preview


Device set to use cpu
🧼 Cleaning Symptoms: 100%|██████████| 132/132 [00:02<00:00, 57.72it/s]
🧼 Cleaning Symptoms: 100%|██████████| 132/132 [00:06<00:00, 21.30it/s]
🧼 Cleaning Symptoms: 100%|██████████| 132/132 [00:00<?, ?it/s]


In [15]:
from tqdm import tqdm
from langchain.schema import Document

documents = []

# Outer loop: iterate through each row (disease)
for _, row in tqdm(df.iterrows(), total=len(df), desc="📄 Creating Documents (Rows)"):
    disease = row["disease"]
    words = row["symptoms_main"].split()

    # Inner loop: iterate through each symptom keyword (word)
    for word in tqdm(words, desc=f"🧠 Embedding words for {disease}", leave=False):
        documents.append(Document(page_content=word.strip(), metadata={"disease": disease}))

print(f"✅ Total documents created: {len(documents)}")


📄 Creating Documents (Rows): 100%|██████████| 132/132 [00:00<00:00, 170.06it/s]

✅ Total documents created: 4412





In [16]:
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = FAISS.from_documents(documents, embedder)

save_path = "Vector/symptom_faiss_db"
vectorstore.save_local(save_path)

print(f"✅ Vector store saved at {save_path}")


  embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")



✅ Vector store saved at Vector/symptom_faiss_db


In [6]:
df["Symptoms"][2]

"If you have a small, noninfected Bartholin's cyst, you may not notice it. If the cyst grows, you might feel a lump or mass near your vaginal opening. Although a cyst is usually painless, it can be tender.\n\nA full-blown infection of a Bartholin's cyst can occur in a matter of days. If the cyst becomes infected, you may experience:\n\nA Bartholin's cyst or abscess typically occurs on only one side of the vaginal opening."

In [9]:
df["disease"][1:60]

1                                         Hyperhidrosis
2                                      Bartholin's cyst
3                                         Infant reflux
4                              Hidradenitis suppurativa
5                                              HIV/AIDS
6                            Acute myelogenous leukemia
7                               Guillain-Barre syndrome
8                                   Acute kidney injury
9                            Acute lymphocytic leukemia
10                           Acute lymphocytic leukemia
11                           Acute myelogenous leukemia
12                           Acute myelogenous leukemia
13                           Acute myelogenous leukemia
14                                   Radiation sickness
15                                   Radiation sickness
16                                  Acute kidney injury
17                                                 ARDS
18                                      Acute si

In [66]:
user_input = "I'm vomiting violently with skin burns after radiation therapy"

In [47]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from backend.utils.text_cleaning import Text_Preprocessing
from backend.utils.filtering_with_ner import RemoveUselessWords
text_cleaner = Text_Preprocessing()
ner_filter = RemoveUselessWords()

Device set to use cpu


In [67]:
text = text_cleaner.go_on(user_input)
text = ner_filter.process_entities(text)
text

['vomit', 'skin', 'radiation therapy']

In [68]:
text = " ".join(text)
text

'vomit skin radiation therapy'

In [69]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# 1. Load your saved vector store
save_path = "Vector/symptom_faiss_db"
vectorstore = FAISS.load_local(
    folder_path=save_path,
    embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
    allow_dangerous_deserialization=True  # Required for loading your trusted local file
)

# 2. Your symptom text (replace with actual input)
symptom_text = text # Example input

# 3. Find top 5 matches with scores
similar_docs = vectorstore.similarity_search_with_score(symptom_text, k=15)  # Get top 5 matches

if similar_docs:
    print(f"🔍 Top 5 matches for: '{symptom_text}'\n")
    for rank, (doc, score) in enumerate(similar_docs, 1):
        print(f"🏥 Match #{rank}:")
        print(f"   Disease: {doc.metadata['disease']}")
        print(f"   Confidence: {score:.4f}")  # Higher = more similar
        print(f"   Key Symptom: {doc.page_content}\n")
else:
    print("❌ No matches found")

🔍 Top 5 matches for: 'vomit skin radiation therapy'

🏥 Match #1:
   Disease: Peanut allergy
   Confidence: 0.8991
   Key Symptom: vomiting

🏥 Match #2:
   Disease: Alcohol use disorder
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #3:
   Disease: Mesenteric lymphadenitis
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #4:
   Disease: Alcohol use disorder
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #5:
   Disease: Alcoholic hepatitis
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #6:
   Disease: Food allergy
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #7:
   Disease: Shellfish allergy
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #8:
   Disease: Viral hemorrhagic fevers
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #9:
   Disease: Viral hemorrhagic fevers
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #10:
   Disease: Bird flu (avian influenza)
   Confidence: 0.9163
   Key Symptom: vomit

🏥 Match #11:
   Disease: Radiation sickness

In [1]:
from rapidfuzz import fuzz, process

query = "I need symptoms for cold, how it works?"
columns = ["Overview", "Symptoms", "Causes", "Risk factors", "Treatment"]

best_match = process.extractOne(query, columns, scorer=fuzz.partial_ratio)
print(best_match)
# Example output: ("Symptoms", 86.0)


('Symptoms', 87.5, 1)


In [None]:
pip uninstall spacy scispacy en-ner-bc5cdr-md -y
pip install spacy==3.2.6
pip install scispacy==0.5.0
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz


In [1]:
import spacy

# Load the biomedical NER model
nlp = spacy.load("en_ner_bc5cdr_md")

def extract_symptoms(text):
    doc = nlp(text)
    symptoms = [ent.text for ent in doc.ents if ent.label_ == "DISEASE"]
    return symptoms

# Example usage
user_text = "I have been coughing a lot and feeling shortness of breath."
print(extract_symptoms(user_text))


['shortness of breath']


In [4]:
import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return list(synonyms)


[nltk_data] Downloading package wordnet to /home/ml/nltk_data...


In [10]:
get_synonyms("breath")

['hint',
 'breathing space',
 'breath',
 'breathing place',
 'breathing spell',
 'intimation',
 'breathing time',
 'breather']

In [11]:
A = {"cough", "fever", "fatigue"}
B = {"cough", "sore throat", "fever"}

overlap = A.intersection(B)
print(overlap)          # {'cough', 'fever'}
print(len(overlap))     # 2


{'fever', 'cough'}
2


In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("d4data/biobert_ner")
model = AutoModelForTokenClassification.from_pretrained("d4data/biobert_ner")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

text = "The patient has cough and high fever."
symptoms = [ent['word'] for ent in nlp(text) if ent['entity'].startswith('B-')]


  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/ml/Documents/SmartHealth-LLM/myenv/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/ml/Documents/SmartHealth-LLM/myenv/lib/python3.9/site-packages/traitlets/config/application.py", line 1075

OSError: d4data/biobert_ner is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")


In [2]:
import getpass
import os

if not os.environ.get("GROQ_API_KEY"):
  os.environ["GROQ_API_KEY"] = groq_api_key

from langchain.chat_models import init_chat_model

model = init_chat_model("llama-3.3-70b-versatile", model_provider="groq")

In [None]:
model.invoke("")

AIMessage(content='The name "Harmesh" is of Indian origin, specifically from the Punjabi and Hindi languages. It is a masculine given name that is composed of two words: "Har" and "Mesh."\n\n"Har" is a common prefix in many Indian names, and it is derived from the Sanskrit word "Hara," which means "lord" or "God." In Hindu mythology, "Har" is also another name for Lord Shiva.\n\n"Mesh" is derived from the Sanskrit word "Mesa," which means "ram" or "Aries" (the zodiac sign). In Indian astrology, "Mesh" is the first sign of the zodiac, representing strength, courage, and leadership.\n\nTogether, the name "Harmesh" can be interpreted to mean "Lord of the Ram" or "God of Aries." It is a name that symbolizes strength, courage, and leadership, and is often associated with qualities such as confidence, determination, and adventurousness.\n\nIn Indian culture, the name "Harmesh" is often given to boys born under the sign of Aries, which is considered a strong and energetic sign. The name is al