# Scrape the page

In [None]:
import requests
from bs4 import BeautifulSoup

# Page whose visible text is analysed by the rest of the notebook.
url = "https://www.mayoclinic.org/diseases-conditions/rheumatoid-arthritis/symptoms-causes/syc-20353648"

# Without a timeout, requests waits indefinitely on an unresponsive server,
# which would leave this cell hanging.
response = requests.get(url, timeout=30)

if response.status_code != 200:
    # Raise SystemExit directly instead of calling the interactive `exit()`
    # helper: the helper reports a *successful* exit status when run as a
    # script and is only injected by the `site` module / the notebook shell.
    # Passing the message makes the failure visible and the status non-zero.
    raise SystemExit("Failed to retrieve the webpage")

# Parse the HTML and keep only its text content; markup is discarded.
soup = BeautifulSoup(response.text, "html.parser")
text = soup.get_text()

# Preprocess the text

In [None]:
# Collapse every run of whitespace inside a line to a single space.
# str.split() with no argument already drops empty fields and strips the
# ends, so no extra filtering of the words is needed.
normalized_lines = (" ".join(line.split()) for line in text.splitlines())

# Drop empty lines *after* normalising: a line that held only spaces or tabs
# is non-empty before the collapse but empty afterwards, so filtering first
# would let blank lines slip back into the result.
text = "\n".join(line for line in normalized_lines if line)

# Extract entities

In [None]:
import spacy
from spacy.matcher import Matcher

# Load the `en_core_sci_sm` pipeline, run the cleaned page text through it,
# and create a token matcher that shares the pipeline's vocabulary.
nlp = spacy.load("en_core_sci_sm")
doc = nlp(text)
matcher = Matcher(nlp.vocab)

# Phrases to search for, grouped by the label they are reported under.
# Each phrase is a space-separated sequence of lower-cased tokens.
phrases_by_label = {
    "SYMPTOM": [
        "tender warm swollen joints",
        "joint stiffness",
        "fatigue",
        "fever",
        "loss of appetite",
    ],
    "TREATMENT": [
        "methotrexate",
        "steroids",
    ],
    "DISEASE": [
        "rheumatoid arthritis",
    ],
    "GENE_PROTEIN": [
        "antibodies",
        "synovium",
    ],
    "CAUSE": [
        "rheumatoid arthritis is an autoimmune disease",
        "immune system attacks healthy tissue",
        "genetic component",
        "infection with certain viruses and bacteria",
    ],
    "RISK_FACTOR": [
        "women",
        "middle age",
        "family history",
        "smoking",
        "excess weight",
    ],
    "COMPLICATIONS": [
        "osteoporosis",
        "rheumatoid nodules",
        "dry eyes and mouth",
        "infections",
        "abnormal body composition",
        "carpal tunnel syndrome",
        "heart problems",
        "lung disease",
        "lymphoma",
    ],
}

# Expand each phrase into a Matcher token pattern: one {"LOWER": word} spec
# per word, so matching is case-insensitive and token-by-token.
patterns = {
    label: [[{"LOWER": word} for word in phrase.split()] for phrase in phrases]
    for label, phrases in phrases_by_label.items()
}

# Register every label's token patterns with the matcher.
for label, pattern_list in patterns.items():
    matcher.add(label, pattern_list)

# Each match is a (match_id, start, end) triple of token offsets into `doc`.
matches = matcher(doc)

# Collect the lower-cased matched text per label; a set removes repeats of
# the same phrase as they are found.
found = {label: set() for label in patterns}
for match_id, start, end in matches:
    # match_id is the hash of the label string; look the label back up.
    label = nlp.vocab.strings[match_id]
    found[label].add(doc[start:end].text.lower())

# Sort each group so the output is identical from run to run; plain set
# ordering of strings changes with Python's per-process hash randomisation.
categories = {label: sorted(items) for label, items in found.items()}

print("Diseases:", categories["DISEASE"])
print("Symptoms:", categories["SYMPTOM"])
print("Genes/Proteins:", categories["GENE_PROTEIN"])
print("Treatments:", categories["TREATMENT"])
print("Causes:", categories["CAUSE"])
print("Risk Factors:", categories["RISK_FACTOR"])
print("Complications:", categories["COMPLICATIONS"])

# Display the entities

In [None]:
# Keep only the labels that actually produced at least one match.
summary = {}
for label, items in categories.items():
    if items:
        summary[label] = items

# Print each remaining label as a heading followed by a bulleted list.
for label, items in summary.items():
    print(f"{label}:")
    print("\n".join(f"  - {item}" for item in items))

# Visualize the entities and their relationships

In [None]:
from spacy import displacy

# Draw the dependency parse for the first nine tokens only
# (presumably to keep the diagram small enough to read).
preview_span = doc[:9]
displacy.render(preview_span, style="dep", jupyter=True)

In [None]:
# Highlight the entities of the whole document inline.
# NOTE(review): the Matcher results above are never written to `doc.ents`,
# so this view presumably shows only the pipeline's own entities — confirm
# that is the intended output.
displacy.render(doc, jupyter=True, style="ent")