In [1]:
from bs4 import BeautifulSoup, NavigableString, Tag
import requests

In [2]:
# Base URL of a single Fandom wiki.
# NOTE(review): no later cell visible here reads this module-level name (the
# functions below use their own local `url`) — presumably left over from
# ad-hoc testing; confirm before removing.
url = 'https://eldenring.fandom.com'

In [3]:
import re

def get_infobox(response):
    """Extract the first infobox of a fetched page as attribute-free HTML.

    The page body is parsed with the ``lxml`` parser and the first element
    whose ``class`` contains the substring ``infobox`` is selected.  Every
    descendant tag that renders no text is removed; every remaining
    descendant has all of its attributes stripped.  The infobox root element
    itself is not visited by the clean-up loop, so it keeps its attributes.

    Args:
        response: Object exposing the raw page body as ``response.content``
            (the callers in this notebook pass ``requests`` responses).

    Returns:
        The cleaned infobox serialised with ``str()``, or ``None`` when the
        page contains no element with an infobox-like class.
    """
    soup = BeautifulSoup(response.content, 'lxml')
    infobox = soup.find(class_=lambda class_name: class_name and 'infobox' in class_name)
    if not infobox:
        return None

    # find_all(True) matches every tag and returns a list built up front, so
    # children of a tag decomposed below are still yielded afterwards.
    for tag in infobox.find_all(True):
        if getattr(tag, 'decomposed', False):
            # Already destroyed together with an empty ancestor.
            continue
        if tag.get_text(strip=True) != "":
            # Replacing the dict drops class/id/style/href and everything else
            # at once; no per-attribute deletion is needed afterwards.
            tag.attrs = {}
        else:
            tag.decompose()

    return str(infobox)

def next_page(nav_bar):
    """Return the href of the "Next page" link of an AllPages navigation bar.

    Args:
        nav_bar: Parsed navigation container (the caller passes the
            ``div.mw-allpages-nav`` element), or ``None`` when the listing
            page has no navigation bar.

    Returns:
        The ``href`` of the first link whose stripped text starts with
        ``"Next"``; ``None`` when ``nav_bar`` is ``None`` or no such link
        is present.
    """
    if nav_bar is None:
        return None

    # Iterate over <a> tags only. Stepping through .next_sibling would also
    # yield the text nodes between the links, which have no .get() and so
    # cannot be treated as links.
    for link in nav_bar.find_all('a'):
        if link.get_text(strip=True).startswith("Next"):
            return link.get('href')
    return None


def page_infoboxes_urls(game_head, page_tail, urls_set: set, page_limit=100, timeout=30):
    """Scrape the infoboxes of the articles listed on one AllPages listing.

    The listing at ``game_head + page_tail`` is fetched, every article linked
    from its ``ul.mw-allpages-chunk`` list is downloaded, and the cleaned
    infobox of each article that has one is collected.

    Args:
        game_head: Base URL of the wiki, without a trailing path.
        page_tail: Path of the listing page, appended to ``game_head``.
        urls_set: Final (post-redirect) URLs already collected.  Mutated in
            place: every newly collected URL is added to it.
        page_limit: Number of infoboxes to collect before stopping.  When the
            counter reaches zero the returned next-page tail is ``None``, so
            a caller that follows the tail stops paginating as well.
        timeout: Seconds passed to ``requests.get`` for every request, so a
            stalled connection raises instead of blocking forever.

    Returns:
        ``((infoboxes, urls, urls_set), next_page_tail)`` where ``infoboxes``
        and ``urls`` are parallel lists for this listing and
        ``next_page_tail`` is the href of the following listing or ``None``.

    Raises:
        requests.RequestException: Propagated from ``requests.get``
            (connection errors, timeouts).
    """
    listing_response = requests.get(game_head + page_tail, timeout=timeout)
    soup = BeautifulSoup(listing_response.content, 'lxml')

    nav_bar = soup.find('div', {'class': 'mw-allpages-nav'})
    next_page_tail = next_page(nav_bar)

    infoboxes = []
    urls = []

    all_pages = soup.find('ul', {'class': 'mw-allpages-chunk'})
    if all_pages is None:
        # The response has no article list; report nothing for this listing
        # instead of failing on a None lookup.
        return (infoboxes, urls, urls_set), next_page_tail

    # Hrefs containing "<digits>.<digits>" are skipped.
    # NOTE(review): presumably these are version / patch-note pages — confirm.
    version = re.compile(r'\d+\.\d+')
    for page in all_pages.find_all('li'):
        link = page.find('a')
        tail = link.get('href') if link is not None else None
        if tail and not version.search(tail):
            response = requests.get(game_head + tail, timeout=timeout)
            # Deduplicate on the final URL so redirects to the same article
            # are only collected once.
            url = response.url
            if url not in urls_set:
                infobox = get_infobox(response=response)
                if infobox:
                    infoboxes.append(infobox)
                    urls.append(url)
                    urls_set.add(url)
                    page_limit -= 1
        if page_limit == 0:
            next_page_tail = None
            break

    return (infoboxes, urls, urls_set), next_page_tail

def process_all_pages(game_head, page_limit_per_page=20, max_pages=5):
    """Crawl a wiki's AllPages listings and gather infoboxes with their URLs.

    Starts at ``/wiki/Special:AllPages`` and keeps following the next-page
    tail reported by ``page_infoboxes_urls`` until there is none left or
    ``max_pages`` listings have been processed.

    Args:
        game_head: Base URL of the wiki.
        page_limit_per_page: Forwarded as ``page_limit`` for every listing.
            ``page_infoboxes_urls`` reports no next page once this limit is
            hit, which also ends the crawl here.
        max_pages: Maximum number of listing pages to visit.

    Returns:
        ``(infoboxes, urls)``: two parallel lists accumulated over all
        visited listings.
    """
    collected_infoboxes = []
    collected_urls = []
    seen_urls = set()

    tail = '/wiki/Special:AllPages'
    remaining = max_pages
    while tail and remaining > 0:
        (page_infoboxes, page_urls, _), tail = page_infoboxes_urls(
            game_head, tail, seen_urls, page_limit=page_limit_per_page
        )
        collected_infoboxes.extend(page_infoboxes)
        collected_urls.extend(page_urls)
        remaining -= 1

    return collected_infoboxes, collected_urls

In [4]:
import pandas as pd

# Load the table of Fandom wikis; later cells read its 'fandom_url' and
# 'Title' columns.
df = pd.read_csv('fandom_en_links.csv')

In [5]:
# Remove the 'Unnamed: 0' column (presumably an index column saved into the
# CSV — confirm). errors='ignore' keeps the cell safe to re-run once the
# column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Trim the last 6 characters from every URL.
# NOTE(review): the removed suffix is not visible here; the displayed result is
# a bare "https://<wiki>.fandom.com" host, so presumably a trailing "/wiki/"
# — confirm against the CSV.
# Not idempotent: re-running this cell strips 6 more characters each time.
df['fandom_url'] = df['fandom_url'].apply(lambda url: url[:-6])

In [7]:
import re 

def dataset_name(game_name):
    """Build the JSON file name used to store a game's scraped data.

    Every character outside ``0-9``, ``a-z`` and ``A-Z`` is removed, the
    remainder is cut to at most 63 characters and lower-cased, and the
    ``.json`` extension is appended.

    Args:
        game_name: Human-readable game title.

    Returns:
        The file name, e.g. ``"Baldur's Gate 3"`` -> ``"baldursgate3.json"``.
        A title with no ASCII letters or digits yields ``".json"``.
    """
    alphanumeric_only = re.sub('[^0-9a-zA-Z]+', '', game_name)
    stem = alphanumeric_only[:63].lower()
    return stem + '.json'

# Derive the output JSON file name of every game from its title.
df['file_path'] = df['Title'].apply(dataset_name)

In [8]:
# Display the frame to check the trimmed URLs and the derived file names.
df

Unnamed: 0,fandom_url,Title,file_path
0,https://GrandTheftAutoV.fandom.com,Grand Theft Auto V,grandtheftautov.json
1,https://BaldursGate3.fandom.com,Baldur's Gate 3,baldursgate3.json
2,https://EldenRing.fandom.com,Elden Ring,eldenring.json
3,https://TheLastofUs.fandom.com,The Last of Us,thelastofus.json
4,https://RedDeadRedemption.fandom.com,Red Dead Redemption,reddeadredemption.json
...,...,...,...
1585,https://XRebirth.fandom.com,X Rebirth,xrebirth.json
1586,https://Magus.fandom.com,Magus,magus.json
1587,https://Ghostbusters.fandom.com,Ghostbusters,ghostbusters.json
1588,https://WildWestOnline.fandom.com,Wild West Online,wildwestonline.json


In [9]:
# Parallel lists: the base URL of each wiki and the JSON file name its scraped
# data is written to.
game_headers = df['fandom_url'].to_list()
game_paths = df['file_path'].to_list()

In [None]:
import json
import os

folder = './fandom_data/'
# Create the output folder up front; otherwise both the data write and the
# failure log inside the except handler would fail on a missing directory.
os.makedirs(folder, exist_ok=True)

# NOTE(review): 710 looks like a manual resume offset from an earlier,
# interrupted run — confirm, and set to 0 for a full crawl.
START_INDEX = 710

for game_header, file_path in zip(game_headers[START_INDEX:], game_paths[START_INDEX:]):
    try:
        data = process_all_pages(game_head=game_header)
        with open(folder + file_path, 'w') as json_file:
            json.dump(data, json_file)
        print(f"Saved data for {game_header} in path {file_path}")
    except Exception:
        # Catch Exception rather than everything, so that interrupting the
        # kernel (KeyboardInterrupt) actually stops the crawl.
        with open(folder + 'failed_scraps.txt', 'a') as logs:
            # One URL per line, so the log can be read back as a list.
            logs.write(game_header + '\n')
        print(f"Failed to collect data for {game_header} in path {file_path}")

Saved data for https://Pikuniku.fandom.com in path pikuniku.json
Saved data for https://SurvivingMars.fandom.com in path survivingmars.json
Saved data for https://Hindsight.fandom.com in path hindsight.json
Saved data for https://FieldofGloryEmpires.fandom.com in path fieldofgloryempires.json
Saved data for https://CrimsonShroud.fandom.com in path crimsonshroud.json
Saved data for https://Gigantic.fandom.com in path gigantic.json
Saved data for https://JYDGE.fandom.com in path jydge.json
Saved data for https://StoryofSeasons.fandom.com in path storyofseasons.json
Saved data for https://EnGarde.fandom.com in path engarde.json
Saved data for https://Evolve.fandom.com in path evolve.json
Saved data for https://CoffeeTalk.fandom.com in path coffeetalk.json
Saved data for https://DinsCurse.fandom.com in path dinscurse.json
Saved data for https://HyruleWarriors.fandom.com in path hyrulewarriors.json
Saved data for https://DISTRAINT.fandom.com in path distraint.json
Saved data for https://Cra