# Scraping

this code provide how to scrap the [D&D 5e Wiki](https://dnd-5e.fandom.com/wiki/D%26D_5e_Wiki)

In [25]:
import requests
import re
import os
import string
import pandas as pd
import markdownify 

from io import StringIO
from bs4 import BeautifulSoup, Comment
from tqdm import tqdm
from IPython.display import display, clear_output

In [2]:
url = requests.get('https://dnd-5e.fandom.com/wiki/List_of_Spells')
soup = BeautifulSoup(url.content, 'html.parser')

# Spell

filter out `Unearthed Arcana`

In [3]:
data = soup.find_all('span', 'mw-headline')
filtered_data = [item for item in data if item.text != 'Unearthed Arcana' and item.text != 'Spell Navigation']

In [4]:
filtered_data

[<span class="mw-headline" id="Cantrips">Cantrips</span>,
 <span class="mw-headline" id="1st_Level">1st Level</span>,
 <span class="mw-headline" id="2nd_Level">2nd Level</span>,
 <span class="mw-headline" id="3rd_Level">3rd Level</span>,
 <span class="mw-headline" id="4th_Level">4th Level</span>,
 <span class="mw-headline" id="5th_Level">5th Level</span>,
 <span class="mw-headline" id="6th_Level">6th Level</span>,
 <span class="mw-headline" id="7th_Level">7th Level</span>,
 <span class="mw-headline" id="8th_Level">8th Level</span>,
 <span class="mw-headline" id="9th_Level">9th Level</span>]

In [5]:
dfs = []

for table in filtered_data:
    table = table.find_next('table')
    html_table = list()
    
    if table:
        html_string = str(table)
        html_table.append(table)
        df = pd.read_html(StringIO(html_string))[0]
        dfs.append(df)

print(len(dfs))
display(dfs[0].head())
display(dfs[1].head())

10


Unnamed: 0,Name,School,Casting Time,Comps,Save,Conc.,Source
0,Acid Splash,Conjuration,1 action,"V, S",Dexterity,,PHB 211
1,Blade Ward,Abjuration,1 action,"V, S",,,PHB 218
2,Booming Blade,Evocation,1 action,"V, M",Attack roll,,"TCE 143, SCAG 142"
3,Chill Touch,Necromancy,1 action,"V, S",Attack roll,,PHB 221
4,Control Flames,Transmutation,1 action,S,,,"XGE 152, EEPC 16"


Unnamed: 0,Name,School,Casting Time,Comps,Cost,Save,Ritual,Conc.,Source
0,Absorb Elements,Abjuration,1 reaction,S,,,,,"XGE 150, EEPC 15"
1,Alarm,Abjuration,1 minute,"V, S, M",,,Yes,,PHB 211
2,Animal Friendship,Enchantment,1 action,"V, S, M",,Wisdom,,,PHB 212
3,Armor of Agathys,Abjuration,1 action,"V, S, M",,,,,PHB 215
4,Arms of Hadar,Conjuration,1 action,"V, S",,Strength,,,PHB 215


In [6]:
os.makedirs("../datasets/spell/", exist_ok=True)

for title, df in zip(filtered_data, dfs):
    df.to_csv(f"../datasets/spell/{title.text}.csv", index=False)

In [7]:
filtered_data

[<span class="mw-headline" id="Cantrips">Cantrips</span>,
 <span class="mw-headline" id="1st_Level">1st Level</span>,
 <span class="mw-headline" id="2nd_Level">2nd Level</span>,
 <span class="mw-headline" id="3rd_Level">3rd Level</span>,
 <span class="mw-headline" id="4th_Level">4th Level</span>,
 <span class="mw-headline" id="5th_Level">5th Level</span>,
 <span class="mw-headline" id="6th_Level">6th Level</span>,
 <span class="mw-headline" id="7th_Level">7th Level</span>,
 <span class="mw-headline" id="8th_Level">8th Level</span>,
 <span class="mw-headline" id="9th_Level">9th Level</span>]

In [None]:
data = soup.find_all('span', 'mw-headline')
filtered_data = [item for item in data if item.text != 'Unearthed Arcana' and item.text != 'Spell Navigation']  
os.makedirs("../datasets/spell_content", exist_ok=True)

for table in filtered_data:
    file_name = table.text
    table = table.find_next('table')
    data = list()

    if table:
        anchors = table.find_all('a')
        
        for anchor in tqdm(anchors):
            url = requests.get(f'https://dnd-5e.fandom.com{anchor['href']}')
            soup = BeautifulSoup(url.content, 'html.parser')
            name_tag = soup.find('span', class_='mw-page-title-main')

            h2_tags = soup.find_all('h2')

            for h2_tag in h2_tags:
                headline = h2_tag.find('span', class_='mw-headline')
                if headline:
                    new_h2 = soup.new_tag("h2")
                    new_h2.string = headline.text.strip()
                    h2_tag.replace_with(new_h2)

            content_block = soup.find('div', 'mw-content-ltr mw-parser-output')
            content_children = content_block.findChildren()
            cleaned_content = []

            for element in content_children:
                markdown = markdownify.markdownify(str(element), strip=['a', 'span'], heading_style="ATX")
                cleaned_content.append(markdown)

            combined_content = f"# {name_tag.text.strip()}\n## Spell Name\n{name_tag.text.strip()}\n\n" + "\n".join(cleaned_content) + "\n"
            combined_content = re.sub(r'<!--.*?-->', '', combined_content, flags=re.DOTALL)

            data.append(combined_content)

        with open(f"../datasets/spell_content/{file_name}.txt", "w", encoding="utf-8") as f:
            for item in data:
                f.write(f"{item}\n")

  content_children = content_block.findChildren()
100%|██████████| 46/46 [00:22<00:00,  2.08it/s]
100%|██████████| 79/79 [00:32<00:00,  2.46it/s]
100%|██████████| 85/85 [00:39<00:00,  2.14it/s]
100%|██████████| 73/73 [00:33<00:00,  2.21it/s]
100%|██████████| 51/51 [00:19<00:00,  2.60it/s]
100%|██████████| 61/61 [00:26<00:00,  2.29it/s]
100%|██████████| 48/48 [00:21<00:00,  2.25it/s]
100%|██████████| 28/28 [00:11<00:00,  2.41it/s]
100%|██████████| 24/24 [00:11<00:00,  2.14it/s]
100%|██████████| 22/22 [00:11<00:00,  1.95it/s]


In [None]:
url = requests.get('https://dnd-5e.fandom.com/wiki/List_of_Spells')
soup = BeautifulSoup(url.content, 'html.parser')

In [32]:
def content(headers, category_url, base_url, output_dir):
    # Set to store unique URLs
    monster_links = dict()

    # Phase 1: Collect all unique monster links
    print("Collecting all unique monster links...")
    for letter in string.ascii_uppercase:
        print(f"Checking: {letter}")
        monster_set = set()
        url = category_url + letter
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Failed to fetch page for letter {letter}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.select('div.category-page__members a.category-page__member-link')

        for link in tqdm(links):
            href = link['href']
            full_url = f"{base_url}{href}"
            monster_set.add(full_url)
        
        monster_links[letter] = monster_set

    print(f"\nFinish")
    
    clear_output(wait=True)
    # Phase 2: Scrape each unique monster page and save as Markdown

    for letter, monster_set in sorted(monster_links.items()):
        markdown_text = ""

        for monster_url in tqdm(sorted(monster_set)):
            monster_resp = requests.get(monster_url, headers=headers)
            if monster_resp.status_code != 200:
                print(f"Failed to fetch {monster_url}")
                continue

            monster_soup = BeautifulSoup(monster_resp.text, 'html.parser')
            main_content = monster_soup.find('main', class_='page__main')
            if not main_content:
                print(f"Skipped (no main content): {monster_url}")
                continue

            # Remove unwanted elements
            for cls in ['page-side-tools__wrapper', 'page-header__actions', 'license-description', 'page-header__categories']:
                for tag in main_content.find_all('div', class_=cls):
                    tag.decompose()

            # Extract name from the page
            name_tag = monster_soup.select_one("span.mw-page-title-main")
            name = name_tag.text.strip() if name_tag else "Unknown Monster"

            # Convert to markdown
            markdown = markdownify.markdownify(str(main_content), strip=['a', 'span'], heading_style="ATX")
            markdown_text += f"{markdown}\n\n---\n\n"

        # Save grouped file for the letter
        filename = os.path.join(output_dir, f"{letter}.md")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

        print(f"Saved: {filename}")


In [33]:
base_url = "https://dnd-5e.fandom.com"
category_url = f"{base_url}/wiki/Category:Monsters?from="
headers = {"User-Agent": "Mozilla/5.0"}

output_dir = "../datasets/monsters_markdown"
os.makedirs(output_dir, exist_ok=True)

content(headers, category_url, base_url, output_dir)

Collecting all unique monster links...
Checking: A


100%|██████████| 98/98 [00:00<00:00, 1810756.79it/s]


Checking: B


100%|██████████| 60/60 [00:00<00:00, 1525201.45it/s]


Checking: C


100%|██████████| 48/48 [00:00<00:00, 1088251.85it/s]


Checking: D


100%|██████████| 41/41 [00:00<00:00, 1116665.35it/s]


Checking: E


100%|██████████| 17/17 [00:00<00:00, 579700.55it/s]


Checking: F


100%|██████████| 40/40 [00:00<00:00, 1103764.21it/s]


Checking: G


100%|██████████| 70/70 [00:00<00:00, 1613193.85it/s]


Checking: H


100%|██████████| 31/31 [00:00<00:00, 747261.06it/s]


Checking: I


100%|██████████| 14/14 [00:00<00:00, 493447.53it/s]


Checking: J


100%|██████████| 3/3 [00:00<00:00, 119837.26it/s]


Checking: K


100%|██████████| 12/12 [00:00<00:00, 474826.87it/s]


Checking: L


100%|██████████| 13/13 [00:00<00:00, 395115.59it/s]


Checking: M


KeyboardInterrupt: 

In [34]:
base_url = "https://dnd-5e.fandom.com"
category_url = f"{base_url}/wiki/Category:Races?from="
headers = {"User-Agent": "Mozilla/5.0"}

output_dir = "../datasets/races"
os.makedirs(output_dir, exist_ok=True)

content(headers, category_url, base_url, output_dir)

100%|██████████| 6/6 [00:01<00:00,  3.24it/s]


Saved: ../datasets/races/A.md


100%|██████████| 1/1 [00:00<00:00,  3.03it/s]


Saved: ../datasets/races/B.md


100%|██████████| 4/4 [00:01<00:00,  3.85it/s]


Saved: ../datasets/races/C.md


100%|██████████| 5/5 [00:01<00:00,  3.45it/s]


Saved: ../datasets/races/D.md


100%|██████████| 3/3 [00:01<00:00,  2.47it/s]


Saved: ../datasets/races/E.md


100%|██████████| 2/2 [00:00<00:00,  3.45it/s]


Saved: ../datasets/races/F.md


100%|██████████| 11/11 [00:03<00:00,  3.21it/s]


Saved: ../datasets/races/G.md


100%|██████████| 10/10 [00:03<00:00,  2.81it/s]


Saved: ../datasets/races/H.md


100%|██████████| 46/46 [00:13<00:00,  3.42it/s]


Saved: ../datasets/races/I.md


100%|██████████| 46/46 [00:13<00:00,  3.30it/s]


Saved: ../datasets/races/J.md


100%|██████████| 9/9 [00:02<00:00,  3.47it/s]


Saved: ../datasets/races/K.md


100%|██████████| 5/5 [00:01<00:00,  3.19it/s]


Saved: ../datasets/races/L.md


100%|██████████| 4/4 [00:01<00:00,  3.18it/s]


Saved: ../datasets/races/M.md


100%|██████████| 1/1 [00:00<00:00,  3.47it/s]


Saved: ../datasets/races/N.md


100%|██████████| 2/2 [00:00<00:00,  3.64it/s]


Saved: ../datasets/races/O.md


100%|██████████| 1/1 [00:00<00:00,  3.63it/s]


Saved: ../datasets/races/P.md


100%|██████████| 24/24 [00:07<00:00,  3.28it/s]


Saved: ../datasets/races/Q.md


100%|██████████| 3/3 [00:00<00:00,  3.78it/s]


Saved: ../datasets/races/R.md


100%|██████████| 8/8 [00:02<00:00,  3.54it/s]


Saved: ../datasets/races/S.md


100%|██████████| 6/6 [00:01<00:00,  3.33it/s]


Saved: ../datasets/races/T.md


100%|██████████| 7/7 [00:02<00:00,  3.36it/s]


Saved: ../datasets/races/U.md


100%|██████████| 4/4 [00:01<00:00,  3.21it/s]


Saved: ../datasets/races/V.md


100%|██████████| 1/1 [00:00<00:00,  3.57it/s]


Saved: ../datasets/races/W.md


100%|██████████| 2/2 [00:00<00:00,  3.62it/s]


Saved: ../datasets/races/X.md


100%|██████████| 2/2 [00:00<00:00,  2.98it/s]


Saved: ../datasets/races/Y.md


100%|██████████| 88/88 [00:26<00:00,  3.36it/s]

Saved: ../datasets/races/Z.md





In [35]:
base_url = "https://dnd-5e.fandom.com"
category_url = f"{base_url}/wiki/Category:Classes?from="
headers = {"User-Agent": "Mozilla/5.0"}

output_dir = "../datasets/classes"
os.makedirs(output_dir, exist_ok=True)

content(headers, category_url, base_url, output_dir)

100%|██████████| 2/2 [00:00<00:00,  2.52it/s]


Saved: ../datasets/classes/A.md


100%|██████████| 5/5 [00:01<00:00,  3.12it/s]


Saved: ../datasets/classes/B.md


100%|██████████| 2/2 [00:00<00:00,  2.34it/s]


Saved: ../datasets/classes/C.md


100%|██████████| 3/3 [00:01<00:00,  2.89it/s]


Saved: ../datasets/classes/D.md


100%|██████████| 20/20 [00:07<00:00,  2.66it/s]


Saved: ../datasets/classes/E.md


100%|██████████| 2/2 [00:00<00:00,  2.45it/s]


Saved: ../datasets/classes/F.md


100%|██████████| 18/18 [00:06<00:00,  2.59it/s]


Saved: ../datasets/classes/G.md


100%|██████████| 18/18 [00:06<00:00,  2.65it/s]


Saved: ../datasets/classes/H.md


100%|██████████| 18/18 [00:06<00:00,  2.57it/s]


Saved: ../datasets/classes/I.md


100%|██████████| 18/18 [00:07<00:00,  2.46it/s]


Saved: ../datasets/classes/J.md


100%|██████████| 18/18 [00:06<00:00,  2.59it/s]


Saved: ../datasets/classes/K.md


100%|██████████| 18/18 [00:06<00:00,  2.68it/s]


Saved: ../datasets/classes/L.md


100%|██████████| 3/3 [00:01<00:00,  2.61it/s]


Saved: ../datasets/classes/M.md


100%|██████████| 15/15 [00:05<00:00,  2.59it/s]


Saved: ../datasets/classes/N.md


100%|██████████| 15/15 [00:05<00:00,  2.59it/s]


Saved: ../datasets/classes/O.md


100%|██████████| 2/2 [00:00<00:00,  3.15it/s]


Saved: ../datasets/classes/P.md


100%|██████████| 13/13 [00:05<00:00,  2.50it/s]


Saved: ../datasets/classes/Q.md


100%|██████████| 6/6 [00:02<00:00,  2.53it/s]


Saved: ../datasets/classes/R.md


100%|██████████| 3/3 [00:01<00:00,  2.58it/s]


Saved: ../datasets/classes/S.md


100%|██████████| 4/4 [00:01<00:00,  2.40it/s]


Saved: ../datasets/classes/T.md


100%|██████████| 4/4 [00:01<00:00,  2.26it/s]


Saved: ../datasets/classes/U.md


100%|██████████| 4/4 [00:01<00:00,  2.52it/s]


Saved: ../datasets/classes/V.md


100%|██████████| 4/4 [00:01<00:00,  2.25it/s]


Saved: ../datasets/classes/W.md


100%|██████████| 32/32 [00:11<00:00,  2.70it/s]


Saved: ../datasets/classes/X.md


100%|██████████| 32/32 [00:11<00:00,  2.68it/s]


Saved: ../datasets/classes/Y.md


100%|██████████| 32/32 [00:12<00:00,  2.66it/s]

Saved: ../datasets/classes/Z.md





In [36]:
base_url = "https://dnd-5e.fandom.com"
category_url = f"{base_url}/wiki/Category:Items?from="
headers = {"User-Agent": "Mozilla/5.0"}

output_dir = "../datasets/items"
os.makedirs(output_dir, exist_ok=True)

content(headers, category_url, base_url, output_dir)

100%|██████████| 2/2 [00:01<00:00,  1.24it/s]


Saved: ../datasets/items/A.md


100%|██████████| 7/7 [00:05<00:00,  1.32it/s]


Saved: ../datasets/items/B.md


100%|██████████| 7/7 [00:02<00:00,  2.86it/s]


Saved: ../datasets/items/C.md


100%|██████████| 7/7 [00:02<00:00,  3.11it/s]


Saved: ../datasets/items/D.md


100%|██████████| 2/2 [00:00<00:00,  3.62it/s]


Saved: ../datasets/items/E.md


100%|██████████| 5/5 [00:02<00:00,  2.49it/s]


Saved: ../datasets/items/F.md


100%|██████████| 5/5 [00:01<00:00,  2.80it/s]


Saved: ../datasets/items/G.md


100%|██████████| 5/5 [00:01<00:00,  2.84it/s]


Saved: ../datasets/items/H.md


100%|██████████| 5/5 [00:01<00:00,  2.86it/s]


Saved: ../datasets/items/I.md


100%|██████████| 5/5 [00:01<00:00,  2.64it/s]


Saved: ../datasets/items/J.md


100%|██████████| 5/5 [00:01<00:00,  2.95it/s]


Saved: ../datasets/items/K.md


100%|██████████| 5/5 [00:01<00:00,  2.97it/s]


Saved: ../datasets/items/L.md


100%|██████████| 1/1 [00:00<00:00,  3.33it/s]


Saved: ../datasets/items/M.md


100%|██████████| 4/4 [00:01<00:00,  2.81it/s]


Saved: ../datasets/items/N.md


100%|██████████| 4/4 [00:01<00:00,  3.01it/s]


Saved: ../datasets/items/O.md


100%|██████████| 4/4 [00:01<00:00,  2.85it/s]


Saved: ../datasets/items/P.md


100%|██████████| 4/4 [00:01<00:00,  2.83it/s]


Saved: ../datasets/items/Q.md


100%|██████████| 4/4 [00:01<00:00,  2.69it/s]


Saved: ../datasets/items/R.md


100%|██████████| 4/4 [00:01<00:00,  2.63it/s]


Saved: ../datasets/items/S.md


100%|██████████| 3/3 [00:01<00:00,  2.66it/s]


Saved: ../datasets/items/T.md


100%|██████████| 1/1 [00:00<00:00,  3.30it/s]


Saved: ../datasets/items/U.md


100%|██████████| 1/1 [00:00<00:00,  3.57it/s]


Saved: ../datasets/items/V.md


100%|██████████| 1/1 [00:00<00:00,  2.13it/s]


Saved: ../datasets/items/W.md


100%|██████████| 9/9 [00:03<00:00,  2.74it/s]


Saved: ../datasets/items/X.md


100%|██████████| 9/9 [00:03<00:00,  2.95it/s]


Saved: ../datasets/items/Y.md


100%|██████████| 9/9 [00:02<00:00,  3.08it/s]

Saved: ../datasets/items/Z.md



