In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
def extract_info_fandom(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    infobox = soup.find(class_=lambda class_name: class_name and 'infobox' in class_name)
    return infobox

from bs4 import NavigableString, Tag

from bs4 import NavigableString, Tag

def extract_grouped_text(element):
    if isinstance(element, NavigableString):
        return element.strip()

    if not element.contents:
        return []

    grouped_text = []
    current_text = []

    for child in element.contents:
        if isinstance(child, NavigableString):
            text = child.strip()
            if text:
                current_text.append(text)
        elif isinstance(child, Tag):
            if child.name == 'br':
                pass
                # current_text.append(' ')  # Add space instead of creating a new entry
            else:
                if current_text:
                    grouped_text.append(' '.join(current_text))
                    current_text = []
                child_group = extract_grouped_text(child)
                if child_group:
                    grouped_text.append(child_group)

    if current_text:
        grouped_text.append(' '.join(current_text))

    if len(grouped_text) == 1:
        return grouped_text[0]
    return grouped_text

def strip_list(l):
    for i in range(len(l)):
        while len(l[i]) == 1 and isinstance(l[i], list):
            l[i] = l[i][0]
        if isinstance(l[i], list):
            l[i] = strip_list(l[i])
    return l

# Assume that first element is alway key (string)
# List may be of length 2 or more
# If length 2 than second item is value
# Otherwise we combine everything after 1st element as a list 
# In both cases recursively apply grouped_to_dict to all elements
def grouped_to_dict(clear_groups):
    # print(clear_groups)
    res = {}
    key = clear_groups[0]
    while not isinstance(key, str):
        key = key[0]
    if len(clear_groups) == 2 and isinstance(clear_groups[1], str):
        res[key] = clear_groups[1]
    elif len(clear_groups) == 2:
        res[key] = grouped_to_dict(clear_groups[1])
    elif len(clear_groups) > 2 and isinstance(clear_groups[1], str):
        res[key] = grouped_to_dict(clear_groups[1:])
    else:
        value = []
        for i in range(1, len(clear_groups)):
            value.append(grouped_to_dict(clear_groups[i]))
        # print(key)
        res[key] = value
    return res

In [3]:
def extract_infobox(page_url):
    data_dict = None
    infobox = extract_info_fandom(page_url)
    if infobox:
        grouped_text = extract_grouped_text(infobox)
        if grouped_text:
            clear_groups = strip_list(grouped_text)
            if clear_groups:
                data_dict = grouped_to_dict(clear_groups)
    return data_dict

In [4]:
extract_infobox('https://eldenring.fandom.com//wiki/%22Champion%27s_Song%22_Painting')

{'"Champion\'s Song" Painting': {'絵画「英雄の歌」': {'Basic Information': [{'Type': 'Info Item'},
    {'Category': 'Painting'},
    {'Max. Held': '1'},
    {'Item Effect': 'Reminiscence of the painting "Champion\'s Song"'},
    {'Online Trade': 'No'}]}}}

In [5]:
def extract_data_fandom(game_head, data_collector):
    head_url = game_head
    all_pages_tail = 'wiki/Special:AllPages'
    all_pages_url = head_url + all_pages_tail
    response = requests.get(all_pages_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    all_pages = soup.find('ul', {'class': 'mw-allpages-chunk'})
    pages = all_pages.find_all('li')
    pages_urls = []
    for page in pages:
        tail = page.find('a').get('href')
        url = head_url + tail
        pages_urls.append(url)
    
    for page_url in pages_urls:
        print(page_url)
        data = extract_infobox(page_url)
        if data:
            data_collector.append(data)

In [6]:
data_er = []
extract_data_fandom('https://eldenring.fandom.com/', data_er)

https://eldenring.fandom.com//wiki/%22Champion%27s_Song%22_Painting
https://eldenring.fandom.com//wiki/%22Flightless_Bird%22_Painting
https://eldenring.fandom.com//wiki/%22Homing_Instinct%22_Painting
https://eldenring.fandom.com//wiki/%22Prophecy%22_Painting
https://eldenring.fandom.com//wiki/%22Redmane%22_Painting
https://eldenring.fandom.com//wiki/%22Resurrection%22_Painting
https://eldenring.fandom.com//wiki/%22Sorcerer%22_Painting
https://eldenring.fandom.com//wiki/1.00
https://eldenring.fandom.com//wiki/1.02
https://eldenring.fandom.com//wiki/1.03
https://eldenring.fandom.com//wiki/1.04
https://eldenring.fandom.com//wiki/1.05
https://eldenring.fandom.com//wiki/1.06
https://eldenring.fandom.com//wiki/1.07
https://eldenring.fandom.com//wiki/1.08
https://eldenring.fandom.com//wiki/1.09
https://eldenring.fandom.com//wiki/1.10
https://eldenring.fandom.com//wiki/ALS_(Alabaster_Lord%27s_Sword)
https://eldenring.fandom.com//wiki/AR_(Antspur_Rapier)
https://eldenring.fandom.com//wiki/Aband

In [8]:
import json

with open('eldenring.json', 'w') as json_file:
    json.dump(data_er, json_file)

print("JSON file has been saved successfully.")

JSON file has been saved successfully.


In [14]:
with open('eldenring.json', 'r') as json_file:
    data_loaded = json.load(json_file)

print("Data loaded from JSON file:")
type(data_loaded)

Data loaded from JSON file:


list