In [7]:
# Imports and setup
import os
import sys
import pandas as pd


# Make the parent of the current working directory importable so that the
# `src.*` package imports further down resolve.
project_root = os.path.abspath(os.pardir)
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)


from src.wiki_scraper import get_horror_games_with_subgenres, get_full_page_text_from_url, title_to_url
from src.female_protagonists_classifiers import FemaleProtagonistClassifier

# Classifier instance, constructed with no arguments; its .classify(text) result exposes
# has_female_protagonist, evidence_type and evidence_sentence.
classifier = FemaleProtagonistClassifier()

In [8]:
# Scrape Wikipedia for horror video games and their subcategories.
# The result is consumed as a mapping: title -> info, where info["subgenres"] is
# joined with "; " when the DataFrame is built.
games = get_horror_games_with_subgenres()
# Show how many entries were scraped as the cell output.
len(games)




2885

In [9]:
# Build the DataFrame: one record per scraped title, holding the title, its
# Wikipedia URL, and the subgenres joined with "; " (None when there are none).
rows = [
    {
        "title": game_title,
        "wiki_page": title_to_url(game_title),
        "category": "; ".join(meta["subgenres"]) if meta["subgenres"] else None,
    }
    for game_title, meta in games.items()
]

df = pd.DataFrame(rows)
df.head()


Unnamed: 0,title,wiki_page,category
0,Horror game,https://en.wikipedia.org/wiki/Horror_game,
1,List of horror games,https://en.wikipedia.org/wiki/List_of_horror_g...,
2,Aaahh!!! Real Monsters (video game),https://en.wikipedia.org/wiki/Aaahh%21%21%21_R...,1990s horror video games
3,Anatomy (video game),https://en.wikipedia.org/wiki/Anatomy_%28video...,
4,Are You Afraid of the Dark? The Tale of Orpheo...,https://en.wikipedia.org/wiki/Are_You_Afraid_o...,1990s horror video games


In [10]:
# Scrape each game's Wikipedia page and classify it for a possible female protagonist.
# Uses `df`, `classifier` and `get_full_page_text_from_url` from the cells above; the
# import and the classifier construction are not repeated here.

# Pages whose text is shorter than this many characters are recorded as
# has_female_protagonist=False without being classified.
MIN_TEXT_LENGTH = 200

has_fp = []
evidence_type = []
evidence_sentence = []

total = len(df)  # loop-invariant, so computed once instead of on every iteration

# enumerate() provides a 1-based progress counter that stays correct even when
# df's index is not the default 0..n-1 range (e.g. after filtering rows).
for progress_num, (title, url) in enumerate(zip(df["title"], df["wiki_page"]), start=1):
    print(f"\n[{progress_num}/{total}] Scraping: {title}")
    print("URL:", url)

    # Normalise a falsy result to "" *before* calling len(): the original guarded
    # against an empty/None page only after len(full_text) had already run.
    # NOTE(review): whether the scraper can actually return None is not visible here.
    full_text = get_full_page_text_from_url(url) or ""   # or get_full_page_text(title)
    print("Text length:", len(full_text))

    if len(full_text) < MIN_TEXT_LENGTH:
        print("Not enough plot text, skip.")
        has_fp.append(False)
        evidence_type.append(None)
        evidence_sentence.append(None)
        continue

    result = classifier.classify(full_text)

    has_fp.append(result.has_female_protagonist)
    evidence_type.append(result.evidence_type)
    evidence_sentence.append(result.evidence_sentence)

# The three lists are filled in row order, one entry per row of df.
df["has_female_protagonist"] = has_fp
df["evidence_type"] = evidence_type
df["evidence_sentence"] = evidence_sentence



[1/2885] Scraping: Horror game
URL: https://en.wikipedia.org/wiki/Horror_game
Text length: 12048

[2/2885] Scraping: List of horror games
URL: https://en.wikipedia.org/wiki/List_of_horror_games
Text length: 115
⚠️ Not enough plot text—skipping.

[3/2885] Scraping: Aaahh!!! Real Monsters (video game)
URL: https://en.wikipedia.org/wiki/Aaahh%21%21%21_Real_Monsters_%28video_game%29
Text length: 2092

[4/2885] Scraping: Anatomy (video game)
URL: https://en.wikipedia.org/wiki/Anatomy_%28video_game%29
Text length: 758

[5/2885] Scraping: Are You Afraid of the Dark? The Tale of Orpheo's Curse
URL: https://en.wikipedia.org/wiki/Are_You_Afraid_of_the_Dark%3F_The_Tale_of_Orpheo%27s_Curse
Text length: 2226

[6/2885] Scraping: Blood of the Werewolf
URL: https://en.wikipedia.org/wiki/Blood_of_the_Werewolf
Text length: 1045

[7/2885] Scraping: BloodRayne (video game)
URL: https://en.wikipedia.org/wiki/BloodRayne_%28video_game%29
Text length: 5237

[8/2885] Scraping: Bloom (mod)
URL: https://en.wiki

In [None]:
# Persist the annotated DataFrame as a UTF-8 CSV without the index column.
output_path = "../data/raw/with_female_protagonists_raw.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before writing (no-op when it already does).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df.to_csv(output_path, index=False, encoding="utf-8")
print("Saved CSV →", output_path)


Saved CSV → ../data/with_female_protagonists_raw.csv
