In this notebook we convert our XML files into Parquet files so that they are easier to work with.

In [8]:
import ast
import os
from pathlib import Path

import kagglehub
import pandas as pd
import xmltodict
# Parent of the current working directory; the parquet output is written under it.
# NOTE(review): the XML inputs are opened relative to the working directory
# ("raw/..."), not relative to ROOT — confirm that this asymmetry is intended.
ROOT = Path.cwd().parent

In [2]:
def to_int(x):
    """Convert ``x`` to an ``int``, or return ``None`` when that is not possible.

    The value goes through ``float`` first so that strings such as ``"2010.0"``
    are accepted; the fractional part is truncated.

    Returns ``None`` for ``None``, non-numeric text, ``"nan"``, and for infinite
    or overflowing values such as ``"inf"`` or ``"1e400"``.
    """
    try:
        return int(float(x))
    # int(float("inf")) raises OverflowError rather than ValueError, so it has
    # to be listed explicitly or a single bad cell aborts the whole parse.
    except (TypeError, ValueError, OverflowError):
        return None

def to_float(x):
    """Convert ``x`` to a ``float``; yield ``None`` if ``float(x)`` rejects it.

    ``TypeError`` (e.g. ``None``) and ``ValueError`` (e.g. non-numeric text)
    are both mapped to ``None``.
    """
    try:
        number = float(x)
    except (TypeError, ValueError):
        return None
    else:
        return number

def split_pylist_string(s):
    """
    Turn "['Action', 'Shooter', 'RPG']" -> ['Action', 'Shooter', 'RPG'].

    The text is first parsed as a Python list literal, so quoted items that
    contain a comma (e.g. "['Action, Adventure']") stay in one piece. If the
    text is not a valid literal made only of strings (e.g. "[Action, RPG]"),
    it falls back to splitting on commas and stripping quotes.

    Items are whitespace-trimmed, empty items are dropped, and duplicates are
    removed while keeping first-seen order. Non-string input returns [].
    """
    if not isinstance(s, str):
        return []

    text = s.strip()
    items = None
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        # Only trust the literal when it is a plain list of strings; anything
        # else (numbers, nesting, unquoted names) goes through the naive split.
        if isinstance(parsed, list) and all(isinstance(p, str) for p in parsed):
            items = [p.strip() for p in parsed]
        else:
            text = text[1:-1]

    if items is None:
        # Trailing strip() removes whitespace that was inside the quotes.
        items = [p.strip().strip("'").strip('"').strip() for p in text.split(",")]

    # dict.fromkeys dedupes while preserving order.
    return [p for p in dict.fromkeys(items) if p]

def normalize_genres(g):
    """
    Return the genres of one parsed game record as a flat list of strings.

    Accepts both layouts:
      - <genres><genre>['Action','Shooter']</genre></genres>
        (a stringified Python list inside a single element)
      - <genres><genre>Action</genre><genre>Shooter</genre></genres>

    Values are whitespace-trimmed, blanks and non-string entries are skipped,
    and duplicates are removed while keeping first-seen order. A missing or
    empty <genres> node yields [].
    """
    container = g.get("genres")
    if not container:
        return []

    raw = container.get("genre", [])
    # force_list normally makes 'genre' a list already; wrap it otherwise
    entries = raw if isinstance(raw, list) else [raw]

    collected = []
    for entry in entries:
        # rare: the element was parsed as a dict like {'#text': 'Action'}
        text = entry.get("#text", "") if isinstance(entry, dict) else entry
        if not isinstance(text, str):
            continue
        cleaned = text.strip()
        if cleaned.startswith("["):
            collected.extend(split_pylist_string(text))
        elif cleaned:
            collected.append(cleaned)

    # dict.fromkeys drops repeats but keeps the original ordering
    return list(dict.fromkeys(collected))

In [None]:
def _read_games(xml_path):
    """Parse one XML export and return its list of <game> records.

    force_list makes xmltodict return lists for 'game' and 'genre' even when
    an element occurs only once.
    """
    # Binary mode hands raw bytes to the XML parser, so decoding follows the
    # file's own XML declaration instead of the platform's default text encoding.
    with open(xml_path, "rb") as fh:
        doc = xmltodict.parse(fh, force_list=("game", "genre"))
    return doc["video_games"]["game"]


# --- Metacritic ---------------------------------------------------------
df_metacritic_xml = pd.DataFrame([
    {
        "title":        game.get("title"),
        "platform":     game.get("platform"),
        "release_year": to_int(game.get("release_year")),
        "developer":    game.get("developer"),
        "genres":       normalize_genres(game),
        "critic_score": to_int(game.get("critic_score")),
        "user_score":   to_float(game.get("user_score")),
        "esrb_rating":  game.get("esrb_rating"),
    }
    for game in _read_games("raw/df_metacritic_updated.xml")
])

# --- Video game sales ---------------------------------------------------
df_videogamesales_xml = pd.DataFrame([
    {
        "title":        game.get("title"),
        "platform":     game.get("platform"),
        "release_year": to_int(game.get("release_year")),
        "publisher":    game.get("publisher"),

        # Same normalisation as the other datasets. An empty <genres/> element
        # is parsed as None, which the previous chained .get() could not handle.
        "genres":       normalize_genres(game),

        # regional/global sales (millions)
        "na_sales_mil":     to_float(game.get("na_sales_mil")),
        "eu_sales_mil":     to_float(game.get("eu_sales_mil")),
        "jp_sales_mil":     to_float(game.get("jp_sales_mil")),
        "other_sales_mil":  to_float(game.get("other_sales_mil")),
        "global_sales_mil": to_float(game.get("global_sales_mil")),
    }
    for game in _read_games("raw/df_videogamesales_latest.xml")
])

# --- Playtime -----------------------------------------------------------
df_playtime_xml = pd.DataFrame([
    {
        # Common metadata
        "title":        game.get("title"),
        "platform":     game.get("platform"),
        "release_year": to_int(game.get("release_year")),
        "developer":    game.get("developer"),
        "publisher":    game.get("publisher"),

        # Genres as list[str]
        "genres":       normalize_genres(game),

        # Time-to-beat (hours)
        "main_story_hour":      to_float(game.get("main_story_hour")),
        "main_plus_sides_hour": to_float(game.get("main_plus_sides_hour")),
        "completionist_hour":   to_float(game.get("completionist_hour")),
    }
    for game in _read_games("raw/df_playtime.xml")
])



In [12]:
# Write each DataFrame to <ROOT>/parquet/<name>.parquet.
parquet_dir = ROOT / "parquet"
# exist_ok=True makes re-running the cell safe without a separate
# "does it exist?" check before creating the directory.
parquet_dir.mkdir(exist_ok=True)

df_metacritic_xml.to_parquet(parquet_dir / "df_metacritic.parquet")
df_videogamesales_xml.to_parquet(parquet_dir / "df_videogamesales.parquet")
df_playtime_xml.to_parquet(parquet_dir / "df_playtime.parquet")