In [43]:
from bs4 import BeautifulSoup
import requests
import json
import csv
from urllib.request import urlopen as uReq
import pandas as pd
import numpy as np
import re
import urllib.request
import unicodedata
import urllib.parse




1. Scrape persona name, arcana, level
2. Get ability (inherit, weak, resist, null, reflect)
3. Get skills list





In [44]:
# Define link
url = "https://megamitensei.fandom.com/wiki/List_of_Persona_5_Personas"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

In [45]:
# Find all arcana
arc = soup.find_all("span", class_="mw-headline")
print(len(arc))

24


In [46]:
print(arc[:5])

[<span class="mw-headline" id="Fool"><a href="/wiki/Fool_Arcana" title="Fool Arcana">Fool</a></span>, <span class="mw-headline" id="Magician"><a href="/wiki/Magician_Arcana" title="Magician Arcana">Magician</a></span>, <span class="mw-headline" id="Priestess"><a href="/wiki/Priestess_Arcana" title="Priestess Arcana">Priestess</a></span>, <span class="mw-headline" id="Empress"><a href="/wiki/Empress_Arcana" title="Empress Arcana">Empress</a></span>, <span class="mw-headline" id="Emperor"><a href="/wiki/Emperor_Arcana" title="Emperor Arcana">Emperor</a></span>]


In [47]:
# Create personas list
persona_dict = []

for arcana_header in arc:
    arcana_name = arcana_header.get_text(strip=True)

    table = arcana_header.find_next("table", class_="table p5")

    if table:
        personas = []

        # Extract persona names from table rows
        for row in table.find_all("tr")[1:]:
            cells = row.find_all(["th","td"])

            for i in range(0, len(cells)-1,2):
              level = cells[i].get_text(strip=True)
              persona_tag = cells[i+1].find("a")

              if persona_tag:
                persona_name = persona_tag.get_text(strip=True)
                persona_dict.append((arcana_name, level, persona_name))


In [48]:
persona_df = pd.DataFrame(persona_dict, columns=["Arcana", "Level", "Persona"])
persona_df

Unnamed: 0,Arcana,Level,Persona
0,Fool,1*,Arsène
1,Fool,8,Obariyon
2,Fool,16,High Pixie
3,Fool,20↓,Izanagi
4,Fool,23↓,Izanagi Picaro
...,...,...,...
221,Judgement,81↓,Messiah
222,Judgement,82,Shiva
223,Judgement,87,Michael
224,Judgement,90↓,Messiah Picaro


In [49]:
persona_df.shape

(226, 3)

In [50]:
# Retrieve persona's name only from persona_df
persona_namelist = []

arcana = arcana_header('table', {'class':'table p5'})
arcana_l = len(arcana)

for arc in range(arcana_l):
    pname = arcana[arc].find_all('a')
    pname_l = len(pname)
    for pn in range(pname_l):
        name = pname[pn].text
        if pname[pn].previous_element.previous_element.previous_element != '**':
            persona_namelist.append(name)

In [51]:
persona_namelist = persona_df['Persona'].tolist()
persona_namelist[0]

'Arsène'

Get ability

In [52]:
# Rename persona for accessing links
persona_namelist[0] = "Arsene"
persona_namelist[75] = "Kushinada-Hime"

In [53]:
links = []
for p in persona_namelist:
    links.append("https://megamitensei.fandom.com/wiki/" + p.replace(' ', '_'))



*   Picaro personas cannot be processed because there are no main page of them
*   Some links cannot be accessed: OTW
*   Extract data from selected tabber element: OTW



In [54]:
# NO SKILL DETAIL
counter = 0
all_data = []
unaccessible_links = []  # List of unaccessible links

for link in links:
    page_url = link
    safe_url = urllib.parse.quote(page_url, safe=':/')

    try:
        with uReq(safe_url) as uClient:
            page_content = uClient.read().decode("utf-8")

        page_soup = BeautifulSoup(page_content, "html.parser")

        # Extract the Persona name
        persona_span = page_soup.find("span", {"class": "mw-page-title-main"})
        persona_name = persona_span.text.strip() if persona_span else "Unknown"

        target_span = page_soup.find("span", {"id": "Persona_5_.2F_Royal"})

        if target_span:
            persona5_table = target_span.find_next("table", {"class": "customtable"})

            if persona5_table:
                rows = persona5_table.find_all("tr")
                data = {"Persona": persona_name}

                stat_table = rows[0].find("table")  # Rowspans table
                if stat_table:
                    for row in stat_table.find_all("tr"):
                        cols = row.find_all("td")
                        if len(cols) >= 2:
                            stat_name = cols[0].text.strip()
                            stat_value = cols[1].text.strip()
                            data[stat_name] = stat_value

                # ------------------------ Convert to DataFrame ------------------------
                df = pd.DataFrame([data])
                all_data.append(df)

                counter += 1
                print(f"Processed {counter}/{len(links)}: {link}")

            else:
                print(f"No Table found for {link}")
                unaccessible_links.append(link)
        else:
            print(f"No Persona 5 section for {link}")
            unaccessible_links.append(link)

    except Exception as e:
        print(f"Error processing {link}: {e}")
        unaccessible_links.append(link)

# Combine all data into a DataFrame
if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    print("Dataframe completed")
else:
    print("¯\_(ツ)_/¯")

No Table found for https://megamitensei.fandom.com/wiki/Arsene
Processed 1/226: https://megamitensei.fandom.com/wiki/Obariyon
No Persona 5 section for https://megamitensei.fandom.com/wiki/High_Pixie
Processed 2/226: https://megamitensei.fandom.com/wiki/Izanagi
Processed 3/226: https://megamitensei.fandom.com/wiki/Izanagi_Picaro
Processed 4/226: https://megamitensei.fandom.com/wiki/Orpheus
Error processing https://megamitensei.fandom.com/wiki/Orpheus_Picaro: HTTP Error 404: Not Found
Processed 5/226: https://megamitensei.fandom.com/wiki/Decarabia
Processed 6/226: https://megamitensei.fandom.com/wiki/Legion
Processed 7/226: https://megamitensei.fandom.com/wiki/Ose
Processed 8/226: https://megamitensei.fandom.com/wiki/Bugs
No Persona 5 section for https://megamitensei.fandom.com/wiki/Crystal_Skull
Processed 9/226: https://megamitensei.fandom.com/wiki/Dionysus
Processed 10/226: https://megamitensei.fandom.com/wiki/Black_Frost
Processed 11/226: https://megamitensei.fandom.com/wiki/Vishnu
Pr

In [55]:
final_df

Unnamed: 0,Persona,Strength,Magic,Endurance,Agility,Luck,Vitality,Dexterity
0,Obariyon,14,8,11,12,7,,
1,Izanagi,14,13,13,14,13,,
2,Izanagi,14,13,13,14,13,,
3,Orpheus,17,17,17,17,17,,
4,Decarabia,22,32,19,24,20,,
...,...,...,...,...,...,...,...,...
197,Yamata-no-Orochi,99,99,40,48,99,,
198,Abaddon,99,45,5,40,99,,
199,Shiva,55,54,53,53,38,,
200,Michael,68,38,51,49,29,,


In [56]:
persona5_df = persona_df.merge(final_df, on="Persona", how="outer")
persona5_df = persona5_df.drop(columns=['Vitality','Dexterity'])
persona5_df

Unnamed: 0,Arcana,Level,Persona,Strength,Magic,Endurance,Agility,Luck
0,Judgement,74,Abaddon,99,45,5,40,99
1,Chariot,3,Agathion,4,5,9,6,5
2,Death,79★,Alice,43,59,40,57,45
3,Lovers,29,Ame-no-Uzume,15,22,19,20,18
4,Star,43,Ananta,24,30,31,26,25
...,...,...,...,...,...,...,...,...
231,Tower,79,Yoshitsune,58,47,45,53,41
232,Sun,42,Yurlungur,26,29,28,27,23
233,Strength,80★,Zaou-Gongen,57,45,50,56,39
234,Magician,1*,Zorro,2,3,1,3,1


In [57]:
for failed_link in unaccessible_links:
        print(failed_link)

https://megamitensei.fandom.com/wiki/Arsene
https://megamitensei.fandom.com/wiki/High_Pixie
https://megamitensei.fandom.com/wiki/Orpheus_Picaro
https://megamitensei.fandom.com/wiki/Crystal_Skull
https://megamitensei.fandom.com/wiki/Koh-i-Noor
https://megamitensei.fandom.com/wiki/Milady
https://megamitensei.fandom.com/wiki/Queen's_Necklace
https://megamitensei.fandom.com/wiki/Regent
https://megamitensei.fandom.com/wiki/Phoenix
https://megamitensei.fandom.com/wiki/Bishamonten
https://megamitensei.fandom.com/wiki/Loki
https://megamitensei.fandom.com/wiki/Stone_of_Scone
https://megamitensei.fandom.com/wiki/Ariadne_Picaro
https://megamitensei.fandom.com/wiki/Asterius_Picaro
https://megamitensei.fandom.com/wiki/Orlov
https://megamitensei.fandom.com/wiki/Emperor's_Amulet
https://megamitensei.fandom.com/wiki/Hope_Diamond
https://megamitensei.fandom.com/wiki/Thanatos_Picaro
https://megamitensei.fandom.com/wiki/Magatsu-Izanagi_Picaro
https://megamitensei.fandom.com/wiki/Seth
https://megamitensei

In [58]:
# Extract skills
page_url = 'https://megamitensei.fandom.com/wiki/List_of_Persona_5_Skills'
uClient = uReq(page_url)
page_soup = BeautifulSoup(uClient.read(), "html.parser")
uClient.close()

In [None]:
# Extract skills name, effect, cost, category
def extract_skills_from_table(table, category, cost_index):
    skill_data = []
    if table:
        rows = table.find_all("tr")[1:]
        for row in rows:
            cols = row.find_all("td")
            if len(cols) > cost_index:
                skill = cols[0].text.strip()
                effect = cols[1].text.strip()
                cost = cols[cost_index].text.strip()
                skill_data.append([skill, effect, cost, category])
    return skill_data

all_skills_list = []

# Phys Skills
physical_skills_section = page_soup.find("span", {"id": "Physical_Skills"})
if physical_skills_section:
    physical_skills_table = physical_skills_section.find_next("table", {"class": "table p5"})
    all_skills_list.extend(extract_skills_from_table(physical_skills_table, "Physical", cost_index=5))

# Gun Skills
gun_skills_section = page_soup.find("span", {"id": "Gun"})
if physical_skills_section:
    physical_skills_table = physical_skills_section.find_next("table", {"class": "table p5"})
    all_skills_list.extend(extract_skills_from_table(physical_skills_table, "Gun", cost_index=5))

# Magic Skills
magic_categories = ["Fire", "Ice", "Electric", "Wind", "Psychokinesis", "Nuclear", "Bless", "Curse", "Almighty"]
for magic_type in magic_categories:
    magic_skills_section = page_soup.find("span", {"id": magic_type})
    if magic_skills_section:
        magic_skills_table = magic_skills_section.find_next("table", {"class": "table p5"})
        all_skills_list.extend(extract_skills_from_table(magic_skills_table, magic_type, cost_index=4))

# Ailment Skills
ailment_cat = ["Physiological", "Mental"]
for ailment_type in ailment_cat:
    ailment_skills_section = page_soup.find("span", {"id": ailment_type})
    if ailment_skills_section:
        ailment_skills_table = ailment_skills_section.find_next("table", {"class": "table p5"})
        all_skills_list.extend(extract_skills_from_table(ailment_skills_table, ailment_type, cost_index=4))

# Healing Skills
heal_skills_section = page_soup.find("span", {"id": "Healing_Skills"})
if heal_skills_section:
    heal_skills_table = heal_skills_section.find_next("table", {"class": "table p5"})
    all_skills_list.extend(extract_skills_from_table(heal_skills_table, "Healing", cost_index=3))

# Support Skills
supp_skills_section = page_soup.find("span",{"id" : "Support_Skills"})
if supp_skills_section:
  supp_skills_table = supp_skills_section.find_next("table",{"class": "table p5"})
  all_skills_list.extend(extract_skills_from_table(supp_skills_table, "Support", cost_index=2))

# Passive Skills
pass_skills_section = page_soup.find("span",{"id" : "Passive_Skills"})
if pass_skills_section:
  pass_skills_table = pass_skills_section.find_next("table",{"class": "table p5"})
  all_skills_list.extend(extract_skills_from_table(pass_skills_table, "Passive",cost_index=False))

df_skills = pd.DataFrame(all_skills_list, columns=["Skill", "Effect", "Cost", "Category"])
df_skills.loc[df_skills["Category"] == "Passive", "Cost"] = "-"

In [None]:
df_skills

Unnamed: 0,Skill,Effect,Cost,Category
0,Lunge,Light Physical damage to 1 foe.,5% HP,Physical
1,Cleave,Light Physical damage to 1 foe.,6% HP,Physical
2,Giant Slice,Medium Physical damage to 1 foe.,9% HP,Physical
3,Assault Dive,Heavy Physical damage to 1 foe.,13% HP,Physical
4,Megaton Raid,Severe Physical damage to 1 foe.,16% HP,Physical
...,...,...,...,...
189,Marakunda,Decrease all foes' Defense for 3 turns.,24 SP,Support
190,Masukunda,Decrease all foes' Agility for 3 turns.,24 SP,Support
191,Counter,10% chance of reflecting Physical attacks.,-,Passive
192,Counterstrike,15% chance of reflecting Physical attacks.,-,Passive


In [None]:
for failed_link in unaccessible_links:
        print(failed_link)

https://megamitensei.fandom.com/wiki/Arsene
https://megamitensei.fandom.com/wiki/High_Pixie
https://megamitensei.fandom.com/wiki/Orpheus_Picaro
https://megamitensei.fandom.com/wiki/Crystal_Skull
https://megamitensei.fandom.com/wiki/Koh-i-Noor
https://megamitensei.fandom.com/wiki/Milady
https://megamitensei.fandom.com/wiki/Queen's_Necklace
https://megamitensei.fandom.com/wiki/Regent
https://megamitensei.fandom.com/wiki/Phoenix
https://megamitensei.fandom.com/wiki/Bishamonten
https://megamitensei.fandom.com/wiki/Loki
https://megamitensei.fandom.com/wiki/Stone_of_Scone
https://megamitensei.fandom.com/wiki/Ariadne_Picaro
https://megamitensei.fandom.com/wiki/Asterius_Picaro
https://megamitensei.fandom.com/wiki/Orlov
https://megamitensei.fandom.com/wiki/Emperor's_Amulet
https://megamitensei.fandom.com/wiki/Hope_Diamond
https://megamitensei.fandom.com/wiki/Thanatos_Picaro
https://megamitensei.fandom.com/wiki/Andras
https://megamitensei.fandom.com/wiki/Magatsu-Izanagi_Picaro
https://megamitens

In [63]:
import urllib.parse
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup
import pandas as pd

# Define the target URL
page_url = "https://megamitensei.fandom.com/wiki/Rangda"

# Open the page and parse
with uReq(page_url) as uClient:
    page_soup = BeautifulSoup(uClient.read(), "html.parser")

# Extract the Persona name
persona_span = page_soup.find("span", {"class": "mw-page-title-main"})
persona_name = persona_span.text.strip() if persona_span else "Unknown"

# Find Persona 5 Royal section
persona5_section = page_soup.find("span", {"id": "Persona_5_.2F_Royal"})

if persona5_section:
    # Find the tabber div associated with it
    p5r_tabber = persona5_section.find_next("div", {"class": "tabber wds-tabber"})

    if p5r_tabber:
        # Find all tables with class "customtable"
        customtables = p5r_tabber.find_all("table", {"class": "customtable"})

        # Check if a second table exists
        if len(customtables) >= 2:
            second_table = customtables[1]  # The second table

            # Extract table headers
            headers = [th.get_text(strip=True) for th in second_table.find_all("th")]

            # Extract table rows
            rows = []
            for row in second_table.find_all("tr"):
                cells = row.find_all("td")
                row_data = [cell.get_text(strip=True) for cell in cells]
                if row_data:  # Avoid empty rows
                    rows.append(row_data)

            # Convert to DataFrame
            df = pd.DataFrame(rows, columns=headers) if headers else pd.DataFrame(rows)

            # Insert Persona name as the first column
            df.insert(0, "Persona", persona_name)

        else:
            print("Second customtable not found.")
    else:
        print("Persona 5 Royal tabber not found.")
else:
    print("Persona 5 Royal section not found.")


In [64]:
df

Unnamed: 0,Persona,Phys,Gun,Fire,Ice,Elec,Wind,Psy,Nuke,Bless,Curse,Almi
0,Rangda,Repel,Repel,Null,-,Weak,-,-,-,Weak,Null,20


In [72]:
# List to store extracted data
all_data = []

# Iterate through each link
for link in links:
    safe_url = urllib.parse.quote(link, safe=':/')  # Handle special characters in URL

    try:
        with uReq(safe_url) as uClient:
            page_soup = BeautifulSoup(uClient.read(), "html.parser")

        print(f"Processing: {link}")

        # Extract the Persona name
        persona_span = page_soup.find("span", {"class": "mw-page-title-main"})
        persona_name = persona_span.text.strip() if persona_span else "Unknown"

        # Find the Persona 5 Royal section
        persona5_section = page_soup.find("span", {"id": "Persona_5_.2F_Royal"})

        if persona5_section:
            # Find the associated tabber div
            p5r_tabber = persona5_section.find_next("div", {"class": "tabber wds-tabber"})

            if p5r_tabber:
                # Find all tables with class "customtable"
                customtables = p5r_tabber.find_all("table", {"class": "customtable"})

                if len(customtables) >= 2:  # Ensure a second table exists
                    second_table = customtables[1]

                    # Extract table headers
                    headers = [th.get_text(strip=True) for th in second_table.find_all("th")]

                    # Extract table rows
                    rows = []
                    for row in second_table.find_all("tr"):
                        cells = row.find_all("td")
                        row_data = [cell.get_text(strip=True) for cell in cells]
                        if row_data:  # Avoid empty rows
                            rows.append(row_data)

                    # Convert extracted data into a structured format
                    for row in rows:
                        row_dict = {"Persona": persona_name}
                        for i, value in enumerate(row):
                            column_name = headers[i] if i < len(headers) else f"Column_{i+1}"
                            row_dict[column_name] = value
                        all_data.append(row_dict)

                else:
                    print(f"Second customtable not found for {persona_name} ({link})")
            else:
                print(f"Persona 5 Royal tabber not found for {persona_name} ({link})")
        else:
            print(f"No Persona 5 Royal section for {persona_name} ({link})")

    except Exception as e:
        print(f"Error processing {persona_name} ({link}): {e}")

# Convert extracted data into a DataFrame and display it
df = pd.DataFrame(all_data)

Processing: https://megamitensei.fandom.com/wiki/Arsene
Persona 5 Royal tabber not found for Arsène (https://megamitensei.fandom.com/wiki/Arsene)
Processing: https://megamitensei.fandom.com/wiki/Obariyon
Persona 5 Royal tabber not found for Obariyon (https://megamitensei.fandom.com/wiki/Obariyon)
Processing: https://megamitensei.fandom.com/wiki/High_Pixie
No Persona 5 Royal section for High Pixie (https://megamitensei.fandom.com/wiki/High_Pixie)
Processing: https://megamitensei.fandom.com/wiki/Izanagi
Processing: https://megamitensei.fandom.com/wiki/Izanagi_Picaro
Processing: https://megamitensei.fandom.com/wiki/Orpheus
Error processing Orpheus (https://megamitensei.fandom.com/wiki/Orpheus_Picaro): HTTP Error 404: Not Found
Processing: https://megamitensei.fandom.com/wiki/Decarabia
Processing: https://megamitensei.fandom.com/wiki/Legion
Persona 5 Royal tabber not found for Legion (https://megamitensei.fandom.com/wiki/Legion)
Processing: https://megamitensei.fandom.com/wiki/Ose
Persona 

In [73]:
df

Unnamed: 0,Persona,Inherit,Reflects,Absorbs,Block,Resists,Weak,Phys,Gun,Fire,...,Bless,Curse,Almi,Void,Force,Light,Dark,List of Skills,Skill,Cost
0,Izanagi,Almighty,-,-,Curse,Electricity,Wind,,,,...,,,,,,,,,,
1,Izanagi,Almighty,-,-,Curse,Electricity,Wind,,,,...,,,,,,,,,,
2,Orpheus,Almighty,-,-,-,Bless,"Electricity, Curse",,,,...,,,,,,,,,,
3,Decarabia,,,,,,,Weak,-,Repel,...,-,Strong,20,,,,,,,
4,Dionysus,,,,,,,-,-,-,...,Strong,Strong,-,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,Yamata-no-Orochi,,,,,,,-,-,-,...,-,Strong,-,,,,,,,
129,Abaddon,,,,,,,Drain,Drain,-,...,-,Drain,-,,,,,,,
130,Shiva,Psy,Electricity,Psy,"Ice, Bless, Curse",-,Nuclear,,,,...,,,,,,,,,,
131,Michael,,,,,,,-,-,-,...,Block,-,-,,,,,,,


