In [1]:
from bs4 import BeautifulSoup
import requests
import json
import csv
from urllib.request import urlopen as uReq
import pandas as pd
import numpy as np
import re
import urllib.request
import unicodedata
import urllib.parse




1. Scrape persona name, arcana, level
2. Get ability (inherit, weak, resist, null, reflect)
3. Get skills list





In [2]:
# Define link
url = "https://megamitensei.fandom.com/wiki/List_of_Persona_5_Personas"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

In [3]:
# Find all arcana
arc = soup.find_all("span", class_="mw-headline")
print(len(arc))

24


In [4]:
print(arc[:5])

[<span class="mw-headline" id="Fool"><a href="/wiki/Fool_Arcana" title="Fool Arcana">Fool</a></span>, <span class="mw-headline" id="Magician"><a href="/wiki/Magician_Arcana" title="Magician Arcana">Magician</a></span>, <span class="mw-headline" id="Priestess"><a href="/wiki/Priestess_Arcana" title="Priestess Arcana">Priestess</a></span>, <span class="mw-headline" id="Empress"><a href="/wiki/Empress_Arcana" title="Empress Arcana">Empress</a></span>, <span class="mw-headline" id="Emperor"><a href="/wiki/Emperor_Arcana" title="Emperor Arcana">Emperor</a></span>]


In [5]:
# Create personas list
persona_dict = []

for arcana_header in arc:
    arcana_name = arcana_header.get_text(strip=True)

    table = arcana_header.find_next("table", class_="table p5")

    if table:
        personas = []

        # Extract persona names from table rows
        for row in table.find_all("tr")[1:]:
            cells = row.find_all(["th","td"])

            for i in range(0, len(cells)-1,2):
              level = cells[i].get_text(strip=True)
              persona_tag = cells[i+1].find("a")

              if persona_tag:
                persona_name = persona_tag.get_text(strip=True)
                persona_dict.append((arcana_name, level, persona_name))


In [6]:
persona_df = pd.DataFrame(persona_dict, columns=["Arcana", "Level", "Persona"])
persona_df

Unnamed: 0,Arcana,Level,Persona
0,Fool,1*,Arsène
1,Fool,8,Obariyon
2,Fool,16,High Pixie
3,Fool,20↓,Izanagi
4,Fool,23↓,Izanagi Picaro
...,...,...,...
221,Judgement,81↓,Messiah
222,Judgement,82,Shiva
223,Judgement,87,Michael
224,Judgement,90↓,Messiah Picaro


In [7]:
persona_df.shape

(226, 3)

In [8]:
# Retrieve persona's name only from persona_df
persona_namelist = []

arcana = arcana_header('table', {'class':'table p5'})
arcana_l = len(arcana)

for arc in range(arcana_l):
    pname = arcana[arc].find_all('a')
    pname_l = len(pname)
    for pn in range(pname_l):
        name = pname[pn].text
        if pname[pn].previous_element.previous_element.previous_element != '**':
            persona_namelist.append(name)

In [9]:
persona_namelist = persona_df['Persona'].tolist()
persona_namelist[0]

'Arsène'

Get ability

In [10]:
# Rename persona for accessing links
persona_namelist[0] = "Arsene"
persona_namelist[75] = "Kushinada-Hime"

In [11]:
links = []
for p in persona_namelist:
    links.append("https://megamitensei.fandom.com/wiki/" + p.replace(' ', '_'))



*   Picaro personas cannot be processed because there are no main page of them
*   Some links cannot be accessed: OTW
*   Extract data from selected tabber element: OTW



In [12]:
counter = 0
all_data = []
unaccessible_links = []  # List of unaccessible links

for link in links:
    page_url = link
    safe_url = urllib.parse.quote(page_url, safe=':/')

    try:
        with uReq(safe_url) as uClient:
            page_content = uClient.read().decode("utf-8")

        page_soup = BeautifulSoup(page_content, "html.parser")

        # Extract the Persona name
        persona_span = page_soup.find("span", {"class": "mw-page-title-main"})


        target_span = page_soup.find("span", {"id": "Persona_5_.2F_Royal"})

        if target_span:
            persona5_table = target_span.find_next("table", {"class": "customtable"})

            if persona5_table:
                rows = persona5_table.find_all("tr")
                data = {"Persona": persona_name}  # Include Persona name

                # Extract Arcana and Level (first two rows)
                # arcana = rows[1].find_all("td")[0].text.strip()
                # level = rows[1].find_all("td")[1].text.strip()
                # data["Arcana"] = arcana
                # data["Level"] = level

                stat_table = rows[0].find("table")  # Rowspans table
                if stat_table:
                    for row in stat_table.find_all("tr"):
                        cols = row.find_all("td")
                        if len(cols) >= 2:
                            stat_name = cols[0].text.strip()
                            stat_value = cols[1].text.strip()
                            data[stat_name] = stat_value

                # Convert to DataFrame
                df = pd.DataFrame([data])
                all_data.append(df)

                counter += 1
                print(f"Processed {counter}/{len(links)}: {link}")

            else:
                print(f"No Table found for {link}")
                unaccessible_links.append(link)
        else:
            print(f"No Persona 5 section for {link}")
            unaccessible_links.append(link)

    except Exception as e:
        print(f"Error processing {link}: {e}")
        unaccessible_links.append(link)

# Combine all data into a DataFrame
if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    print("Dataframe completed")
else:
    print("¯\_(ツ)_/¯")

No Table found for https://megamitensei.fandom.com/wiki/Arsene
Processed 1/226: https://megamitensei.fandom.com/wiki/Obariyon
No Persona 5 section for https://megamitensei.fandom.com/wiki/High_Pixie
Processed 2/226: https://megamitensei.fandom.com/wiki/Izanagi
Processed 3/226: https://megamitensei.fandom.com/wiki/Izanagi_Picaro
Processed 4/226: https://megamitensei.fandom.com/wiki/Orpheus
Error processing https://megamitensei.fandom.com/wiki/Orpheus_Picaro: HTTP Error 404: Not Found
Processed 5/226: https://megamitensei.fandom.com/wiki/Decarabia
Processed 6/226: https://megamitensei.fandom.com/wiki/Legion
Processed 7/226: https://megamitensei.fandom.com/wiki/Ose
Processed 8/226: https://megamitensei.fandom.com/wiki/Bugs
No Persona 5 section for https://megamitensei.fandom.com/wiki/Crystal_Skull
Processed 9/226: https://megamitensei.fandom.com/wiki/Dionysus
Processed 10/226: https://megamitensei.fandom.com/wiki/Black_Frost
Processed 11/226: https://megamitensei.fandom.com/wiki/Vishnu
Pr

In [None]:
persona5_df = persona_df.merge(final_df, on="Persona", how="outer")
# Display the merged dataframe
persona5_df

In [None]:
final_df

Unnamed: 0,Persona,Arcana,Level,Strength,Magic,Endurance,Agility,Luck,Vitality,Dexterity
0,Obariyon,Strength,14,14,8,11,12,7,,
1,Izanagi,Strength,14,14,13,13,14,13,,
2,Izanagi,Strength,14,14,13,13,14,13,,
3,Orpheus,Strength,17,17,17,17,17,17,,
4,Decarabia,Strength,22,22,32,19,24,20,,
...,...,...,...,...,...,...,...,...,...,...
196,Yamata-no-Orochi,Strength,99,99,99,40,48,99,,
197,Abaddon,Strength,99,99,45,5,40,99,,
198,Shiva,Strength,55,55,54,53,53,38,,
199,Michael,Strength,68,68,38,51,49,29,,


In [None]:
for failed_link in unaccessible_links:
        print(failed_link)

https://megamitensei.fandom.com/wiki/Arsene
https://megamitensei.fandom.com/wiki/High_Pixie
https://megamitensei.fandom.com/wiki/Orpheus_Picaro
https://megamitensei.fandom.com/wiki/Crystal_Skull
https://megamitensei.fandom.com/wiki/Koh-i-Noor
https://megamitensei.fandom.com/wiki/Milady
https://megamitensei.fandom.com/wiki/Queen's_Necklace
https://megamitensei.fandom.com/wiki/Mother_Harlot
https://megamitensei.fandom.com/wiki/Regent
https://megamitensei.fandom.com/wiki/Phoenix
https://megamitensei.fandom.com/wiki/Bishamonten
https://megamitensei.fandom.com/wiki/Kohryu
https://megamitensei.fandom.com/wiki/Loki
https://megamitensei.fandom.com/wiki/Stone_of_Scone
https://megamitensei.fandom.com/wiki/Ariadne_Picaro
https://megamitensei.fandom.com/wiki/Asterius_Picaro
https://megamitensei.fandom.com/wiki/Orlov
https://megamitensei.fandom.com/wiki/Siegfried
https://megamitensei.fandom.com/wiki/Emperor's_Amulet
https://megamitensei.fandom.com/wiki/Hecatoncheires
https://megamitensei.fandom.com

In [None]:
final_df

Unnamed: 0,Persona,Arcana,Level,Strength,Magic,Endurance,Agility,Luck,Vitality,Dexterity
0,Obariyon,Strength,14,14,8,11,12,7,,
1,Izanagi,Strength,14,14,13,13,14,13,,
2,Izanagi,Strength,14,14,13,13,14,13,,
3,Orpheus,Strength,17,17,17,17,17,17,,
4,Decarabia,Strength,22,22,32,19,24,20,,
...,...,...,...,...,...,...,...,...,...,...
196,Yamata-no-Orochi,Strength,99,99,99,40,48,99,,
197,Abaddon,Strength,99,99,45,5,40,99,,
198,Shiva,Strength,55,55,54,53,53,38,,
199,Michael,Strength,68,68,38,51,49,29,,


In [None]:
final_df.sort_values(by=['Persona'])

Unnamed: 0,Persona,Arcana,Level,Strength,Magic,Endurance,Agility,Luck,Vitality,Dexterity
197,Abaddon,Strength,99,99,45,5,40,99,,
72,Agathion,Strength,4,4,5,9,6,5,,
137,Alice,Strength,43,43,59,40,57,45,,
64,Ame-no-Uzume,Strength,15,15,22,19,20,18,,
169,Ananta,Strength,24,24,30,31,26,25,,
...,...,...,...,...,...,...,...,...,...,...
163,Yoshitsune,Strength,58,58,47,45,53,41,,
188,Yurlungur,Strength,26,26,29,28,27,23,,
119,Zaou-Gongen,Strength,57,57,45,50,56,39,,
12,Zorro,Strength,2,2,3,1,3,1,,


In [None]:
# Extract skills
page_url = 'https://megamitensei.fandom.com/wiki/List_of_Persona_5_Skills'
uClient = uReq(page_url)
page_soup = BeautifulSoup(uClient.read(), "html.parser")
uClient.close()

In [None]:
# Extract skills name, effect, cost, category
def extract_skills_from_table(table, category, cost_index):
    skill_data = []
    if table:
        rows = table.find_all("tr")[1:]
        for row in rows:
            cols = row.find_all("td")
            if len(cols) > cost_index:
                skill = cols[0].text.strip()
                effect = cols[1].text.strip()
                cost = cols[cost_index].text.strip()
                skill_data.append([skill, effect, cost, category])
    return skill_data

all_skills_list = []

# Phys Skills
physical_skills_section = page_soup.find("span", {"id": "Physical_Skills"})
if physical_skills_section:
    physical_skills_table = physical_skills_section.find_next("table", {"class": "table p5"})
    all_skills_list.extend(extract_skills_from_table(physical_skills_table, "Physical", cost_index=5))

# Gun Skills
gun_skills_section = page_soup.find("span", {"id": "Gun"})
if physical_skills_section:
    physical_skills_table = physical_skills_section.find_next("table", {"class": "table p5"})
    all_skills_list.extend(extract_skills_from_table(physical_skills_table, "Gun", cost_index=5))

# Magic Skills
magic_categories = ["Fire", "Ice", "Electric", "Wind", "Psychokinesis", "Nuclear", "Bless", "Curse", "Almighty"]
for magic_type in magic_categories:
    magic_skills_section = page_soup.find("span", {"id": magic_type})
    if magic_skills_section:
        magic_skills_table = magic_skills_section.find_next("table", {"class": "table p5"})
        all_skills_list.extend(extract_skills_from_table(magic_skills_table, magic_type, cost_index=4))

# Ailment Skills
ailment_cat = ["Physiological", "Mental"]
for ailment_type in ailment_cat:
    ailment_skills_section = page_soup.find("span", {"id": ailment_type})
    if ailment_skills_section:
        ailment_skills_table = ailment_skills_section.find_next("table", {"class": "table p5"})
        all_skills_list.extend(extract_skills_from_table(ailment_skills_table, ailment_type, cost_index=4))

# Healing Skills
heal_skills_section = page_soup.find("span", {"id": "Healing_Skills"})
if heal_skills_section:
    heal_skills_table = heal_skills_section.find_next("table", {"class": "table p5"})
    all_skills_list.extend(extract_skills_from_table(heal_skills_table, "Healing", cost_index=3))

# Support Skills
supp_skills_section = page_soup.find("span",{"id" : "Support_Skills"})
if supp_skills_section:
  supp_skills_table = supp_skills_section.find_next("table",{"class": "table p5"})
  all_skills_list.extend(extract_skills_from_table(supp_skills_table, "Support", cost_index=2))

# Passive Skills
pass_skills_section = page_soup.find("span",{"id" : "Passive_Skills"})
if pass_skills_section:
  pass_skills_table = pass_skills_section.find_next("table",{"class": "table p5"})
  all_skills_list.extend(extract_skills_from_table(pass_skills_table, "Passive",cost_index=False))

df_skills = pd.DataFrame(all_skills_list, columns=["Skill", "Effect", "Cost", "Category"])
df_skills.loc[df_skills["Category"] == "Passive", "Cost"] = "-"

In [None]:
df_skills

Unnamed: 0,Skill,Effect,Cost,Category
0,Lunge,Light Physical damage to 1 foe.,5% HP,Physical
1,Cleave,Light Physical damage to 1 foe.,6% HP,Physical
2,Giant Slice,Medium Physical damage to 1 foe.,9% HP,Physical
3,Assault Dive,Heavy Physical damage to 1 foe.,13% HP,Physical
4,Megaton Raid,Severe Physical damage to 1 foe.,16% HP,Physical
...,...,...,...,...
189,Marakunda,Decrease all foes' Defense for 3 turns.,24 SP,Support
190,Masukunda,Decrease all foes' Agility for 3 turns.,24 SP,Support
191,Counter,10% chance of reflecting Physical attacks.,0,Passive
192,Counterstrike,15% chance of reflecting Physical attacks.,0,Passive


In [None]:
for failed_link in unaccessible_links:
        print(failed_link)

https://megamitensei.fandom.com/wiki/Arsene
https://megamitensei.fandom.com/wiki/High_Pixie
https://megamitensei.fandom.com/wiki/Orpheus_Picaro
https://megamitensei.fandom.com/wiki/Crystal_Skull
https://megamitensei.fandom.com/wiki/Koh-i-Noor
https://megamitensei.fandom.com/wiki/Milady
https://megamitensei.fandom.com/wiki/Queen's_Necklace
https://megamitensei.fandom.com/wiki/Regent
https://megamitensei.fandom.com/wiki/Phoenix
https://megamitensei.fandom.com/wiki/Bishamonten
https://megamitensei.fandom.com/wiki/Loki
https://megamitensei.fandom.com/wiki/Stone_of_Scone
https://megamitensei.fandom.com/wiki/Ariadne_Picaro
https://megamitensei.fandom.com/wiki/Asterius_Picaro
https://megamitensei.fandom.com/wiki/Orlov
https://megamitensei.fandom.com/wiki/Emperor's_Amulet
https://megamitensei.fandom.com/wiki/Hope_Diamond
https://megamitensei.fandom.com/wiki/Thanatos_Picaro
https://megamitensei.fandom.com/wiki/Andras
https://megamitensei.fandom.com/wiki/Magatsu-Izanagi_Picaro
https://megamitens

In [None]:
# # # Manually add persona stats from unaccessed links -- OTW
# # Arsene
# # URL of the Persona page

# page_url = "https://megamitensei.fandom.com/wiki/High_Pixie#Persona_5_Royal"

# # Open the page and parse
# uClient = uReq(page_url)
# page_soup = BeautifulSoup(uClient.read(), "html.parser")
# uClient.close()

# # Step 1: Find the Persona 5 section
# persona5_section = page_soup.find("span", {"id": "Persona_5_Royal"})

# # Extract the Persona name
# persona_span = page_soup.find("span", {"class": "mw-page-title-main"})
# persona_name = persona_span.text.strip() if persona_span else "Unknown"

# # Find the Arcana row (first <td> after <th> with title="Arcana")
# arcana_row = soup.find("td", style="background:#000;color:#fff")
# if arcana_row:
#   arcana_link = arcana_row.find("a")
#   arcana_value = arcana_link.text.strip() if arcana_link else arcana_row.text.strip()
# else:
#   arcana_value = "Unknown"

# # Step 2: Locate the next "customtable"
# if persona5_section:
#     persona5_table = persona5_section.find_next("table", {"class": "customtable"})

#     if persona5_table:
#         rows = persona5_table.find_all("tr")
#         data = {"Persona" : persona_name}

#         # Step 3: Extract Arcana and Level (first two rows)
#         arcana = rows[1].find_all("td")[0].text.strip()
#         level = rows[1].find_all("td")[1].text.strip()

#         # data["Arcana"] = arcana
#         # data["Level"] = level

#         # Step 4: Extract stats from inner table
#         stat_table = rows[0].find("table")  # The inner table containing stats
#         if stat_table:
#             for row in stat_table.find_all("tr"):
#                 cols = row.find_all("td")
#                 if len(cols) >= 2:
#                     stat_name = cols[0].text.strip()
#                     stat_value = cols[1].text.strip()
#                     data[stat_name] = stat_value

#         # Step 5: Convert to DataFrame
#         df_cek = pd.DataFrame([data])
#         print(df_cek)