In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

In [2]:
def extract_info_fandom(url, timeout=30):
    """Download a wiki page and return its first infobox element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the whole
            scrape indefinitely.

    Returns:
        The first element whose ``class`` attribute contains the substring
        ``'infobox'``, or ``None`` when the page has no such element.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.

    Note:
        The HTTP status code is not checked; whatever body the server sends
        back is parsed.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    # The truthiness guard protects the substring test from a missing
    # (None) class value.
    return soup.find(class_=lambda class_name: class_name and 'infobox' in class_name)

def extract_grouped_text(element):
    """Collect the text under ``element`` as nested groups.

    Adjacent text nodes are stripped and joined with single spaces into one
    string; ``<br>`` tags between them are ignored and do not split the run.
    Any other tag ends the current run and contributes the result of a
    recursive call on itself, unless that result is empty.

    Returns:
        * the stripped text when ``element`` is a ``NavigableString``;
        * ``[]`` when ``element`` has no children;
        * the single group itself when exactly one group was collected;
        * otherwise the list of groups (strings and nested groups).
    """
    if isinstance(element, NavigableString):
        return element.strip()

    children = element.contents
    if not children:
        return []

    groups = []
    pending = []

    def flush():
        # Close the current run of plain text, if any, into one group.
        if pending:
            groups.append(' '.join(pending))
            pending.clear()

    for node in children:
        if isinstance(node, NavigableString):
            stripped = node.strip()
            if stripped:
                pending.append(stripped)
            continue
        # Line breaks (and anything that is not a tag) never split a run.
        if not isinstance(node, Tag) or node.name == 'br':
            continue
        flush()
        nested = extract_grouped_text(node)
        if nested:
            groups.append(nested)

    flush()

    return groups[0] if len(groups) == 1 else groups

def strip_list(l):
    """Unwrap single-element lists nested inside ``l``, in place.

    Every element that is a list of length one is replaced by its sole
    item, repeatedly, until it is no longer such a list. Elements that are
    still lists afterwards are processed recursively. Elements that are not
    lists (strings, numbers, ``None``, ...) are left untouched.

    Args:
        l: The list to simplify. It is modified in place.

    Returns:
        The same ``l`` object, for convenient chaining.
    """
    for i in range(len(l)):
        item = l[i]
        if not isinstance(item, list):
            continue
        # Check the type before the length so that unsized items (ints,
        # None, ...) reached while unwrapping do not raise TypeError.
        while isinstance(item, list) and len(item) == 1:
            item = item[0]
        if isinstance(item, list):
            item = strip_list(item)
        l[i] = item
    return l

def grouped_to_dict(clear_groups):
    """Convert a nested ``[key, value, ...]`` grouping into a nested dict.

    The first element is always taken as the key. If it is itself a nested
    list, the first string found by repeatedly taking element ``0`` is used.
    What follows the key decides the value:

    * ``[key, str]``            -> ``{key: str}``
    * ``[key, list]``           -> ``{key: grouped_to_dict(list)}``
    * ``[key, str, more, ...]`` -> ``{key: grouped_to_dict([str, more, ...])}``
    * anything else             -> ``{key: [...]}`` where each remaining item
      is converted recursively if it is a list and kept as-is if it is a
      string.

    A bare string is treated like a one-element list and yields
    ``{string: []}``.

    Strings are always leaves: they are never recursed into, because indexing
    a string yields its characters and would produce a meaningless
    one-letter-per-level dictionary.

    Args:
        clear_groups: Non-empty nested list of strings (or a single string).

    Returns:
        A dict with exactly one key.

    Raises:
        IndexError: If ``clear_groups`` (or a nested key list) is empty.
    """
    if isinstance(clear_groups, str):
        return {clear_groups: []}

    key = clear_groups[0]
    while not isinstance(key, str):
        key = key[0]

    rest = clear_groups[1:]
    if len(rest) == 1:
        only = rest[0]
        value = only if isinstance(only, str) else grouped_to_dict(only)
    elif len(rest) > 1 and isinstance(rest[0], str):
        # The string right after the key becomes the key of the next level.
        value = grouped_to_dict(rest)
    else:
        value = [
            item if isinstance(item, str) else grouped_to_dict(item)
            for item in rest
        ]
    return {key: value}

In [1]:
import json

# Scratch check: write a small list to a JSON file and read it back.
my_list = list(range(1, 6))
with open('my_list.json', 'w') as f:
    f.write(json.dumps(my_list))

with open('my_list.json', 'r') as f:
    loaded_list = json.loads(f.read())


In [3]:
def extract_infobox(page_url):
    """Fetch a page and turn its infobox into a nested dictionary.

    Runs the pipeline ``extract_info_fandom`` -> ``extract_grouped_text`` ->
    ``strip_list`` -> ``grouped_to_dict`` and stops at the first stage that
    produces an empty (falsy) result.

    Args:
        page_url: Address of the page to scrape.

    Returns:
        The dictionary built from the infobox, or ``None`` when any stage of
        the pipeline came back empty.
    """
    infobox = extract_info_fandom(page_url)
    if not infobox:
        return None

    grouped_text = extract_grouped_text(infobox)
    if not grouped_text:
        return None

    clear_groups = strip_list(grouped_text)
    if not clear_groups:
        return None

    return grouped_to_dict(clear_groups)

In [4]:
def extract_data_fandom(game_head, timeout=30):
    """Scrape the infobox of every page listed on a wiki's AllPages index.

    Args:
        game_head: Base URL of the wiki, e.g.
            ``'https://EldenRing.fandom.com/'``.
        timeout: Seconds ``requests`` waits for the index page before giving
            up, so a stalled connection cannot block forever.

    Returns:
        A list with one entry per page for which ``extract_infobox`` returned
        a non-empty result.

    Raises:
        requests.exceptions.RequestException: If fetching the index fails or
            times out.
        AttributeError: If the index page contains no
            ``<ul class="mw-allpages-chunk">`` element.

    Note:
        Only the one index page is requested. If the wiki spreads the index
        over several pages, the later ones are not visited.
    """
    # urljoin resolves links against the wiki root, so a root-relative href
    # ('/wiki/Foo') does not produce a doubled slash and a base URL without a
    # trailing slash still yields a valid address.
    all_pages_url = urljoin(game_head, 'wiki/Special:AllPages')
    response = requests.get(all_pages_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    all_pages = soup.find('ul', {'class': 'mw-allpages-chunk'})
    pages_urls = []
    for page in all_pages.find_all('li'):
        link = page.find('a')
        href = link.get('href') if link else None
        if not href:
            # A list entry without a usable link cannot be scraped; skip it
            # rather than abort the whole wiki.
            continue
        pages_urls.append(urljoin(game_head, href))

    data = []
    for page_url in pages_urls:
        page_data = extract_infobox(page_url)
        if page_data:
            data.append(page_data)

    return data

In [5]:
import pandas as pd

# Load the table of wikis to scrape; the path is relative to the working directory.
df = pd.read_csv('fandom_en_links.csv')

In [6]:
# Remove the 'Unnamed: 0' column read from the CSV (presumably a saved index
# -- confirm against the file). errors='ignore' keeps this cell re-runnable:
# a second run finds the column already gone instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Cut the last five characters off every URL, leaving the wiki root.
# NOTE: not idempotent -- running this cell again trims five more characters.
df['fandom_url'] = df['fandom_url'].map(lambda link: link[:-5])

In [8]:
import re 

def dataset_name(game_name):
    """Build the JSON file name used to store a game's scraped data.

    Every character other than an ASCII letter or digit is removed, the
    result is cut to at most 63 characters and lower-cased, and ``'.json'``
    is appended.
    """
    cleaned = re.compile('[^0-9a-zA-Z]+').sub('', game_name)
    stem = cleaned[:63].lower()
    return f'{stem}.json'

# Derive the output file name for each game from its title.
df['file_path'] = df['Title'].map(dataset_name)

In [9]:
# Display the prepared frame (wiki root URL, title and output file name per game).
df

Unnamed: 0,fandom_url,Title,file_path
0,https://GrandTheftAutoV.fandom.com/,Grand Theft Auto V,grandtheftautov.json
1,https://BaldursGate3.fandom.com/,Baldur's Gate 3,baldursgate3.json
2,https://EldenRing.fandom.com/,Elden Ring,eldenring.json
3,https://TheLastofUs.fandom.com/,The Last of Us,thelastofus.json
4,https://RedDeadRedemption.fandom.com/,Red Dead Redemption,reddeadredemption.json
...,...,...,...
1585,https://XRebirth.fandom.com/,X Rebirth,xrebirth.json
1586,https://Magus.fandom.com/,Magus,magus.json
1587,https://Ghostbusters.fandom.com/,Ghostbusters,ghostbusters.json
1588,https://WildWestOnline.fandom.com/,Wild West Online,wildwestonline.json


In [10]:
# Plain Python lists of wiki root URLs and matching output paths, in row order.
game_headers = df['fandom_url'].to_list()
game_paths = df['file_path'].to_list()

In [None]:
import json

# Scrape every wiki in turn and store its data as JSON. A failure on one wiki
# is recorded and the loop moves on to the next.
for game_header, file_path in zip(game_headers, game_paths):
    try:
        data = extract_data_fandom(game_head=game_header)
        with open(file_path, 'w') as json_file:
            json.dump(data, json_file)
        print(f"Saved data for {game_header} in path {file_path}")
    except Exception as exc:
        # Catch Exception rather than everything, so that interrupting the
        # kernel (KeyboardInterrupt) really stops this long-running loop
        # instead of being logged as one more failed wiki.
        with open('failed_scraps.txt', 'a') as logs:
            # One URL per line; without the newline the entries run together.
            logs.write(f"{game_header}\n")
        print(f"Failed to collect data for {game_header} in path {file_path}: {exc!r}")

Saved data for https://GrandTheftAutoV.fandom.com/ in path grandtheftautov.json
Saved data for https://BaldursGate3.fandom.com/ in path baldursgate3.json
Saved data for https://EldenRing.fandom.com/ in path eldenring.json
Saved data for https://TheLastofUs.fandom.com/ in path thelastofus.json
Saved data for https://RedDeadRedemption.fandom.com/ in path reddeadredemption.json
Saved data for https://GodofWar.fandom.com/ in path godofwar.json
Saved data for https://AstroBot.fandom.com/ in path astrobot.json
Saved data for https://TheLegendofZeldaSkywardSword.fandom.com/ in path thelegendofzeldaskywardsword.json
Saved data for https://Hades.fandom.com/ in path hades.json
Saved data for https://Uncharted4AThiefsEnd.fandom.com/ in path uncharted4athiefsend.json
Saved data for https://SuperSmashBrosUltimate.fandom.com/ in path supersmashbrosultimate.json
Saved data for https://INSIDE.fandom.com/ in path inside.json
Saved data for https://Uncharted3DrakesDeception.fandom.com/ in path uncharted