# Scraping

This notebook shows how to scrape the [D&D 5e Wiki](https://dnd-5e.fandom.com/wiki/D%26D_5e_Wiki).

In [1]:
import requests
import re
import os
import pandas as pd
import markdownify 

from io import StringIO
from bs4 import BeautifulSoup, Comment
from tqdm import tqdm
from IPython.display import display

In [2]:
# Download the spell index page. Despite its name, `url` holds the HTTP
# response object; the name is kept because later cells reuse it.
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast instead of parsing an HTTP error page.
url = requests.get('https://dnd-5e.fandom.com/wiki/List_of_Spells', timeout=30)
url.raise_for_status()
soup = BeautifulSoup(url.content, 'html.parser')

# Spell

Filter out the `Unearthed Arcana` and `Spell Navigation` sections.

In [3]:
# Collect every section headline on the index page, then drop the
# 'Unearthed Arcana' and 'Spell Navigation' sections.
data = soup.find_all('span', 'mw-headline')
filtered_data = [
    headline
    for headline in data
    if headline.text not in ('Unearthed Arcana', 'Spell Navigation')
]

In [4]:
# Show the headline tags that survived the filter.
filtered_data

[<span class="mw-headline" id="Cantrips">Cantrips</span>,
 <span class="mw-headline" id="1st_Level">1st Level</span>,
 <span class="mw-headline" id="2nd_Level">2nd Level</span>,
 <span class="mw-headline" id="3rd_Level">3rd Level</span>,
 <span class="mw-headline" id="4th_Level">4th Level</span>,
 <span class="mw-headline" id="5th_Level">5th Level</span>,
 <span class="mw-headline" id="6th_Level">6th Level</span>,
 <span class="mw-headline" id="7th_Level">7th Level</span>,
 <span class="mw-headline" id="8th_Level">8th Level</span>,
 <span class="mw-headline" id="9th_Level">9th Level</span>]

In [5]:
# Parse the first <table> that follows each section headline into a DataFrame.
# Headlines with no following table are skipped, so `dfs` can end up shorter
# than `filtered_data`.
dfs = []

for headline in filtered_data:
    table = headline.find_next('table')

    if table:
        # read_html expects a file-like object for literal HTML, hence StringIO.
        dfs.append(pd.read_html(StringIO(str(table)))[0])

# Quick sanity check: number of tables parsed and a preview of the first two.
print(len(dfs))
display(dfs[0].head())
display(dfs[1].head())

10


Unnamed: 0,Name,School,Casting Time,Comps,Save,Conc.,Source
0,Acid Splash,Conjuration,1 action,"V, S",Dexterity,,PHB 211
1,Blade Ward,Abjuration,1 action,"V, S",,,PHB 218
2,Booming Blade,Evocation,1 action,"V, M",Attack roll,,"TCE 143, SCAG 142"
3,Chill Touch,Necromancy,1 action,"V, S",Attack roll,,PHB 221
4,Control Flames,Transmutation,1 action,S,,,"XGE 152, EEPC 16"


Unnamed: 0,Name,School,Casting Time,Comps,Cost,Save,Ritual,Conc.,Source
0,Absorb Elements,Abjuration,1 reaction,S,,,,,"XGE 150, EEPC 15"
1,Alarm,Abjuration,1 minute,"V, S, M",,,Yes,,PHB 211
2,Animal Friendship,Enchantment,1 action,"V, S, M",,Wisdom,,,PHB 212
3,Armor of Agathys,Abjuration,1 action,"V, S, M",,,,,PHB 215
4,Arms of Hadar,Conjuration,1 action,"V, S",,Strength,,,PHB 215


In [6]:
# Write one CSV per section, named after the section headline text.
os.makedirs("../datasets/spell/", exist_ok=True)

# `dfs` is built by skipping headlines that have no following table, so it can
# be shorter than `filtered_data`. A plain zip() would then silently pair
# tables with the wrong headline; strict=True raises ValueError instead.
for title, df in zip(filtered_data, dfs, strict=True):
    df.to_csv(f"../datasets/spell/{title.text}.csv", index=False)

In [7]:
# Display the filtered headlines again before scraping the individual spell pages.
filtered_data

[<span class="mw-headline" id="Cantrips">Cantrips</span>,
 <span class="mw-headline" id="1st_Level">1st Level</span>,
 <span class="mw-headline" id="2nd_Level">2nd Level</span>,
 <span class="mw-headline" id="3rd_Level">3rd Level</span>,
 <span class="mw-headline" id="4th_Level">4th Level</span>,
 <span class="mw-headline" id="5th_Level">5th Level</span>,
 <span class="mw-headline" id="6th_Level">6th Level</span>,
 <span class="mw-headline" id="7th_Level">7th Level</span>,
 <span class="mw-headline" id="8th_Level">8th Level</span>,
 <span class="mw-headline" id="9th_Level">9th Level</span>]

In [8]:
def _spell_page_to_markdown(page_soup):
    """Convert one parsed spell page into the markdown text stored in the dataset.

    Args:
        page_soup: BeautifulSoup tree of a single spell page.

    Returns:
        The markdown string for the page, or None when the page has no
        'mw-page-title-main' span or no 'mw-content-ltr mw-parser-output' div.
    """
    name_tag = page_soup.find('span', class_='mw-page-title-main')

    # Replace each <h2> that wraps a 'mw-headline' span with a bare <h2>
    # holding only the headline text, so the heading converts to a clean
    # '## Title' line.
    for h2_tag in page_soup.find_all('h2'):
        headline = h2_tag.find('span', class_='mw-headline')
        if headline is not None:
            new_h2 = page_soup.new_tag("h2")
            new_h2.string = headline.text.strip()
            h2_tag.replace_with(new_h2)

    content_block = page_soup.find('div', 'mw-content-ltr mw-parser-output')
    if name_tag is None or content_block is None:
        return None

    # find_all() with no arguments is the non-deprecated spelling of
    # findChildren() and walks every descendant tag.
    # NOTE(review): because the walk is recursive, nested elements are converted
    # again after their parent, so text can appear more than once in the output.
    # Confirm whether find_all(recursive=False) was intended.
    cleaned_content = [
        markdownify.markdownify(str(element), strip=['a', 'span'], heading_style="ATX")
        for element in content_block.find_all()
    ]

    spell_name = name_tag.text.strip()
    combined_content = f"# {spell_name}\n## Spell Name\n{spell_name}\n\n" + "\n".join(cleaned_content) + "\n"
    # Remove any HTML comments left in the converted text.
    return re.sub(r'<!--.*?-->', '', combined_content, flags=re.DOTALL)


# Rebuild the headline list from the index page. The loop below uses its own
# names (response, page_soup, spell_pages) so the notebook-level `soup`, `url`
# and `data` are not overwritten and this cell can be re-run safely.
data = soup.find_all('span', 'mw-headline')
filtered_data = [item for item in data if item.text != 'Unearthed Arcana' and item.text != 'Spell Navigation']
os.makedirs("../datasets/spell_content", exist_ok=True)

for headline in filtered_data:
    file_name = headline.text
    table = headline.find_next('table')

    if not table:
        continue

    spell_pages = []

    # href=True skips anchors without a link target, which would otherwise
    # raise KeyError on anchor['href'].
    for anchor in tqdm(table.find_all('a', href=True)):
        # Outer double quotes keep this f-string valid on Python < 3.12.
        response = requests.get(f"https://dnd-5e.fandom.com{anchor['href']}", timeout=30)
        if not response.ok:
            tqdm.write(f"Skipping {anchor['href']}: HTTP {response.status_code}")
            continue

        page_soup = BeautifulSoup(response.content, 'html.parser')
        page_markdown = _spell_page_to_markdown(page_soup)
        if page_markdown is None:
            tqdm.write(f"Skipping {anchor['href']}: title or content block not found")
            continue

        spell_pages.append(page_markdown)

    # One text file per section, with each page followed by a newline.
    with open(f"../datasets/spell_content/{file_name}.txt", "w", encoding="utf-8") as f:
        for item in spell_pages:
            f.write(f"{item}\n")

  content_children = content_block.findChildren()
100%|██████████| 46/46 [00:22<00:00,  2.08it/s]
100%|██████████| 79/79 [00:32<00:00,  2.46it/s]
100%|██████████| 85/85 [00:39<00:00,  2.14it/s]
100%|██████████| 73/73 [00:33<00:00,  2.21it/s]
100%|██████████| 51/51 [00:19<00:00,  2.60it/s]
100%|██████████| 61/61 [00:26<00:00,  2.29it/s]
100%|██████████| 48/48 [00:21<00:00,  2.25it/s]
100%|██████████| 28/28 [00:11<00:00,  2.41it/s]
100%|██████████| 24/24 [00:11<00:00,  2.14it/s]
100%|██████████| 22/22 [00:11<00:00,  1.95it/s]
