In [1]:
#import and setup
import os
import re
import sys

import pandas as pd
import requests
from bs4 import BeautifulSoup

project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from src.wiki_scraper import HEADERS
from src.theme_classifiers import ThemesClassifier


In [2]:
from urllib.parse import urlparse


def snake_case(name: str) -> str:

    import re

    name = name.lower()
    name = re.sub(r"[^a-z0-9]+", "_", name)
    name = re.sub(r"_+", "_", name)
    return name.strip("_")


def scrape_page(url: str):
   #fetch and parse wiki page
    if not isinstance(url, str) or not url.startswith("http"):
        return "", "", "", ""

    try:
        resp = requests.get(url, headers=HEADERS, timeout=20)
    except Exception:
        return "", "", "", ""

    if resp.status_code != 200:
        return "", "", "", ""

    html = resp.text
    soup = BeautifulSoup(html, "lxml")

    #infobox extraction
    info = {}
    infobox = soup.find("table", class_="infobox")
    if infobox:
        for row in infobox.find_all("tr"):
            if row.th and row.td:
                key = row.th.get_text(" ", strip=True)
                value = row.td.get_text(" ", strip=True)
                info[key] = value

    def get_field(possible_keys):
        for k in possible_keys:
            if k in info:
                return info[k]
        return ""

    developers = get_field(["Developer(s)", "Developer", "Developers"])
    release_date = get_field(["Release date", "Released", "Release"])
    platforms = get_field(["Platform(s)", "Platforms", "Platform"])

    #full text (paragraphs only, no tables/citations)
    for t in soup.find_all("table"):
        t.decompose()
    
    for s in soup.find_all("sup"):
        s.decompose()

    paragraphs = soup.find_all("p")
    full_text = " ".join(p.get_text(" ", strip=True) for p in paragraphs)

    return full_text, developers, release_date, platforms


In [8]:
#import curated CSV
csv_path = "../data/raw/with_female_protagonists_curated.csv"

df = pd.read_csv(csv_path)
print("Loaded rows:", len(df))
df.head()


Loaded rows: 93


Unnamed: 0,title,wiki_page
0,A Plague Tale: Innocence,en.wikipedia.org/wiki/A_Plague_Tale:_Innocence
1,Alice: Madness Returns,https://en.wikipedia.org/wiki/Alice%3A_Madness...
2,Alien: Isolation,https://en.wikipedia.org/wiki/Alien:_Isolation
3,American McGee's Alice,https://en.wikipedia.org/wiki/American_McGee%2...
4,Amnesia: Rebirth,en.wikipedia.org/wiki/Amnesia:_Rebirth


In [4]:
#theme classifier setup
classifier = ThemesClassifier()
THEME_KEYWORDS = classifier.keywords  
theme_names = list(THEME_KEYWORDS.keys())
theme_to_col = {theme: snake_case(theme) for theme in theme_names}




In [5]:
#scraped fields
full_texts = []
developers_list = []
release_dates = []
platforms_list = []


theme_truefalse = {col: [] for col in theme_to_col.values()}
theme_keywords = {col + "_keywords": [] for col in theme_to_col.values()}


for idx, row in df.iterrows():
    url = row['wiki_page']
    
    full_text, devs, rel, plats = scrape_page(url)
    
    full_texts.append(full_text)
    developers_list.append(devs)
    release_dates.append(rel)
    platforms_list.append(plats)

    text_lower = full_text.lower() if isinstance(full_text, str) else ""

    
    for theme, kw_list in THEME_KEYWORDS.items():
        col = theme_to_col[theme]
        col_kw = col + "_keywords"

        found = set()

        for kw in kw_list:
            kw_lower = kw.lower()
            if kw_lower in text_lower:
                found.add(kw_lower)  # keep lowercase

        if found:
            theme_truefalse[col].append(True)
           
            theme_keywords[col_kw].append("; ".join(sorted(found)))
        else:
            theme_truefalse[col].append(False)
            theme_keywords[col_kw].append("")





In [6]:
#add fields to dataframe
df["full_text"] = full_texts
df["developers"] = developers_list
df["release_date"] = release_dates
df["platforms"] = platforms_list

for col, values in theme_truefalse.items():
    df[col] = values

for col, values in theme_keywords.items():
    df[col] = values

df.head()


Unnamed: 0,title,wiki_page,full_text,developers,release_date,platforms,motherhood,domesticity,trauma_and_mental_illness,embodiment,...,motherhood_keywords,domesticity_keywords,trauma_and_mental_illness_keywords,embodiment_keywords,captivity_keywords,violence_keywords,sexualized_violence_keywords,female_monstrosity_keywords,girlhood_horror_keywords,queer_themes_keywords
0,A Plague Tale: Innocence,en.wikipedia.org/wiki/A_Plague_Tale:_Innocence,,,,,False,False,False,False,...,,,,,,,,,,
1,Alice: Madness Returns,https://en.wikipedia.org/wiki/Alice%3A_Madness...,Alice: Madness Returns is a 2011 action-adven...,Spicy Horse,"NA : June 14, 2011 [ 2 ] EU : June 16, 2011 [ ...",Windows PlayStation 3 Xbox 360,False,True,True,True,...,,family,asylum; hallucinate; hallucination; trauma,memories,escape; locked; prison,kill; rape; raped,forced; rape; raped,creature; evil; witch,young,
2,Alien: Isolation,https://en.wikipedia.org/wiki/Alien:_Isolation,Alien: Isolation is a 2014 survival horror ga...,Creative Assembly,"7 October 2014 PS3 , PS4 , Win , X360 , XOne 7...",PlayStation 3 PlayStation 4 Windows Xbox 360 X...,True,True,True,False,...,birth; daughter; mother,wife,trauma,,escape; imprisoned; prison,kill; killed,,creature; monster; witch,girl; teen,
3,American McGee's Alice,https://en.wikipedia.org/wiki/American_McGee%2...,American McGee's Alice is a 2000 third-person...,Rogue Entertainment [ a ],"Windows NA : December 5, 2000 [ 1 ] [ 2 ] Mac ...",Windows Mac OS,True,True,True,True,...,baby; labor,family; father,asylum; hallucinate; trauma,disfigured; flesh,captive,kill; killed; murder; violence; violent,,creature,,
4,Amnesia: Rebirth,en.wikipedia.org/wiki/Amnesia:_Rebirth,,,,,False,False,False,False,...,,,,,,,,,,


In [None]:
#export to CSV
output_path = "../data/raw/feminist_themes_raw.csv"
df.to_csv(output_path, index=False)
output_path


'../data/horror_games_feminist_themes_scraped_raw2.csv'