## Rock Music Network


#### *Step 1: Get a clean list of page titles*

In [4]:
import re
from pathlib import Path

PDF_PATH = "rock_artists.pdf"
TXT_EXPORT = "rock_artists_raw.txt"
OUT_LIST = "rock_artists.txt"

def pdf_to_text(pdf_path, txt_path):
    import PyPDF2
    with open(pdf_path, "rb") as f, open(txt_path, "w", encoding="utf-8") as out:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            out.write(page.extract_text() or "")
            out.write("\n")

def parse_titles(text):
    titles = set()

    # Preferred: wiki-style [[Title]] or [[Title|display]]
    for raw in re.findall(r"\[\[([^\[\]]+)\]\]", text):
        left = raw.split("|", 1)[0].strip()
        if not left or ":" in left:  # skip namespaces like Category:
            continue
        titles.add(left)

    if not titles:
        # Fallback: lines that look like Proper Noun titles (very heuristic)
        for line in text.splitlines():
            line = line.strip()
            if 2 <= len(line) <= 120 and line[0].isupper() and not line.endswith(":"):
                # crude filter to avoid headers; adjust as needed
                titles.add(line)

    # Normalize: keep canonical version with spaces; downloader will add underscores
    clean = []
    seen = set()
    for t in titles:
        t = re.sub(r"\s+", " ", t).strip()
        if t and t not in seen:
            seen.add(t)
            clean.append(t)
    clean.sort()
    return clean

def main():
    if not Path(PDF_PATH).exists():
        raise SystemExit(f"PDF not found: {PDF_PATH}. Put your PDF next to this script.")

    print("Extracting text from PDF...")
    pdf_to_text(PDF_PATH, TXT_EXPORT)
    text = Path(TXT_EXPORT).read_text(encoding="utf-8", errors="ignore")

    print("Parsing titles...")
    titles = parse_titles(text)
    if not titles:
        raise SystemExit("Found 0 titles. Open rock_artists_raw.txt and check the formatting.")

    Path(OUT_LIST).write_text("\n".join(titles), encoding="utf-8")
    print(f"Saved {len(titles)} titles to {OUT_LIST}")

if __name__ == "__main__":
    main()


Extracting text from PDF...
Parsing titles...
Saved 489 titles to rock_artists.txt


#### *Step 2: Fetch raw Wikipedia wikitext for each title*

In [5]:
import os, time, json, pathlib, requests
from urllib.parse import quote

INPUT_LIST = "rock_artists.txt"      # one title per line
OUT_DIR     = "pages_raw_wikitext"   # output folder
SLEEP_SEC   = 0.2                    # gentle rate limit

API = "https://en.wikipedia.org/w/api.php"

# MediaWiki lets up to ~50 titles per request for 'query'
BATCH_SIZE = 50

def safe_filename(name: str, maxlen: int = 150) -> str:
    """
    Make a Windows-safe filename from a page title.
    Replaces / \ : * ? " < > | with underscores, trims spaces/dots.
    """
    name = re.sub(r'[\\/:*?"<>|]+', "_", name)
    name = name.strip(" .")
    if len(name) > maxlen:
        name = name[:maxlen]
    return name

def to_api_title(title):
    return title.strip().replace(" ", "_")

def load_titles(path):
    with open(path, encoding="utf-8") as f:
        titles = [line.strip() for line in f if line.strip()]
    # Deduplicate but preserve order
    seen, clean = set(), []
    for t in titles:
        norm = to_api_title(t)
        if norm not in seen:
            clean.append(norm)
            seen.add(norm)
    return clean

def batched(iterable, n):
    for i in range(0, len(iterable), n):
        yield iterable[i:i+n]

def fetch_wikitext_batch(titles):
    """
    Fetch wikitext via action=query&prop=revisions&rvslots=main&rvprop=content
    Handles redirects. Returns dict: {normalized_title: {"title":..., "pageid":..., "wikitext":...}}
    """
    params = {
        "action": "query",
        "format": "json",
        "redirects": 1,
        "prop": "revisions",
        "rvslots": "main",
        "rvprop": "content",
        "formatversion": 2,
        "titles": "|".join(titles),
    }
    headers = {
        "User-Agent": "RockNetworkBot/0.1 (https://github.com/yourusername; your_email@example.com)"
    }
    r = requests.get(API, params=params, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()
    ...


    # Build normalization/redirect mapping from response (if present)
    title_map = {t: t for t in titles}
    for norm in data.get("query", {}).get("normalized", []) or []:
        title_map[norm["from"].replace(" ", "_")] = norm["to"].replace(" ", "_")
    for redir in data.get("query", {}).get("redirects", []) or []:
        title_map[redir["from"].replace(" ", "_")] = redir["to"].replace(" ", "_")

    out = {}
    for page in data.get("query", {}).get("pages", []):
        title = page.get("title", "").replace(" ", "_")
        pageid = page.get("pageid", None)
        revs = page.get("revisions", [])
        wikitext = ""
        if revs and "slots" in revs[0] and "main" in revs[0]["slots"]:
            wikitext = revs[0]["slots"]["main"].get("content", "")

        out[title] = {
            "title": title,
            "pageid": pageid,
            "wikitext": wikitext,
            "missing": page.get("missing", False)
        }
    return out

def main():
    titles = load_titles(INPUT_LIST)
    os.makedirs(OUT_DIR, exist_ok=True)

    for chunk in batched(titles, BATCH_SIZE):
        try:
            data = fetch_wikitext_batch(chunk)
        except Exception as e:
            print("Batch failed:", e)
            time.sleep(2)
            continue

        for title in chunk:
            # Resolve final title if API redirected/normalized it
            # We’ll choose the returned key if present
            # Fall back to the original requested title
            saved = None
            # Prefer exact returned match if found
            for k in (title,) + tuple(data.keys()):
                if k in data:
                    saved = data[k]
                    break
            if not saved:
                # Nothing came back—skip
                print(f"[WARN] No data for {title}")
                continue

            final_title = saved["title"] or title
            safe_title  = safe_filename(final_title)   # sanitize it
            fname = f"{safe_title}.txt"
            fpath = pathlib.Path(OUT_DIR) / fname

            # Save raw wikitext; also drop a tiny JSON sidecar with metadata
            with open(fpath, "w", encoding="utf-8") as f:
                f.write(saved.get("wikitext", ""))

            meta = {
                "requested_title": title,
                "final_title": final_title,
                "pageid": saved.get("pageid"),
                "missing": saved.get("missing", False),
                "bytes": len(saved.get("wikitext", "")),
            }
            with open(str(fpath) + ".json", "w", encoding="utf-8") as jf:
                json.dump(meta, jf, ensure_ascii=False, indent=2)

            print(f"Saved: {fname} ({meta['bytes']} bytes)")
        time.sleep(SLEEP_SEC)

if __name__ == "__main__":
    main()


Saved: 10_Years_(band).txt (28465 bytes)
Saved: 10cc.txt (67283 bytes)
Saved: 3_Doors_Down.txt (46731 bytes)
Saved: 311_(band).txt (55903 bytes)
Saved: 38_Special_(band).txt (29277 bytes)
Saved: A_Perfect_Circle.txt (89175 bytes)
Saved: ABBA.txt (152299 bytes)
Saved: AC_DC.txt (184056 bytes)
Saved: AFI_(band).txt (61575 bytes)
Saved: Accept_(band).txt (47788 bytes)
Saved: Adam_Ant.txt (94230 bytes)
Saved: Aerosmith.txt (192561 bytes)
Saved: Air_Supply.txt (52266 bytes)
Saved: Alanis_Morissette.txt (122824 bytes)
Saved: Alice_Cooper.txt (181022 bytes)
Saved: Alice_Cooper_(band).txt (39506 bytes)
Saved: Alice_in_Chains.txt (205052 bytes)
Saved: AllMusic.txt (13197 bytes)
Saved: Alter_Bridge.txt (78955 bytes)
Saved: Ambrosia_(band).txt (30410 bytes)
Saved: America_(band).txt (52835 bytes)
Saved: Anthrax_(American_band).txt (93547 bytes)
Saved: April_Wine.txt (37365 bytes)
Saved: Arcade_Fire.txt (106135 bytes)
Saved: Arctic_Monkeys.txt (159436 bytes)
Saved: Asia_(band).txt (59659 bytes)
Sa

### *Step 3: Build Rock Network*

In [None]:
import re
import json
import pathlib
import networkx as nx

PAGES_DIR = pathlib.Path("pages_raw_wikitext")
ARTISTS_TXT = "rock_artists.txt"

# -------------------------
# Helpers
# -------------------------

# Capture [[Target]] and [[Target|display]], ignoring section anchors (after #)
LINK_RE = re.compile(r"\[\[([^\[\]\|#]+)(?:\|[^\[\]]+)?\]\]")

WORD_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ']+")

# NEW/CHANGED: normalization regexes
DASH_RE = re.compile(r"\s*[\u2010\u2011\u2012\u2013\u2014\u2015-]\s*")  # all common dash chars
APOST_RE = re.compile(r"[’‘`´]")  # curly/smart apostrophes & accents -> '
AND_TOKEN_RE = re.compile(r"\s+(?:&|and)\s+", flags=re.IGNORECASE)  # unify &/and during variants

def normalize_basic(s: str) -> str:
    """Normalize punctuation and spacing so variant lookups become reliable."""
    if not s:
        return s
    s = s.strip()
    s = re.sub(r"\s+", " ", s)   # collapse whitespace
    s = APOST_RE.sub("'", s)     # unify apostrophes
    s = DASH_RE.sub("-", s)      # unify dashes and trim spaces around them
    return s

def norm_key(s: str) -> str:
    """Key used for the variant index: normalized + lowercased."""
    return normalize_basic(s).lower() if s else s

def extract_links_from_wikitext(text: str):
    """Return list of raw wiki targets e.g. 'Black Sabbath' or 'AC/DC' (spaces preserved here)."""
    return [m.group(1).strip() for m in LINK_RE.finditer(text)]

def canon_title(title: str) -> str:
    """
    Canonical node label:
    - Normalize punctuation/spacing
    - Replace spaces with underscores
    - KEEP slashes (e.g., AC/DC stays AC/DC)
    """
    t = normalize_basic(title)  # NEW/CHANGED
    return t.replace("  ", " ").replace(" ", "_")

# NEW/CHANGED: helper to generate &/and forms
def amp_variants(s: str):
    base = normalize_basic(s)
    with_amp = AND_TOKEN_RE.sub(" & ", base)
    with_and = AND_TOKEN_RE.sub(" and ", base)
    return {base, with_amp, with_and}

def variants_for_lookup(title: str):
    """
    Variants to help map filenames/links to canonical performer titles.
    We include: spaces, underscores, slashes, &/and forms, and lowercase.
    (We still store them via norm_key, so case differences are collapsed.)
    """
    variants = set()
    for seed in amp_variants(title):
        # start from normalized seed
        seed = normalize_basic(seed)
        t_space = seed.replace("_", " ")
        t_under = seed.replace(" ", "_")
        # swap in slashes from both space and underscore forms
        t_slash_from_under = t_under.replace("_", "/")
        t_slash_from_space = t_space.replace(" ", "/")

        variants.update({
            seed, t_space, t_under, t_slash_from_under, t_slash_from_space,
            seed.lower(), t_space.lower(), t_under.lower(),
            t_slash_from_under.lower(), t_slash_from_space.lower(),
        })

        # also include minor hyphen/space variants (after dash normalization this is small)
        variants.add(seed.replace("-", " - "))
        variants.add(seed.replace(" - ", "-"))

    # final pass: normalize again so weird combos collapse
    return {normalize_basic(v) for v in variants}

def build_performer_index(performers_set):
    """
    Build a mapping from many variants -> canonical performer title (canonical uses underscores, keeps slashes).
    Uses norm_key() so lookups are consistent.
    """
    idx = {}
    for p in performers_set:
        for v in variants_for_lookup(p):
            idx[norm_key(v)] = p  # NEW/CHANGED
    return idx

def count_words(text: str) -> int:
    return len(WORD_RE.findall(text))

# -------------------------
# Load performers
# -------------------------
with open(ARTISTS_TXT, encoding="utf-8") as f:
    # Canonicalize with normalization: replace spaces with underscores; keep slashes in names like AC/DC
    performers = {canon_title(line) for line in f if line.strip()}  # canon_title() now normalizes

# Build a variant index for robust matching
index = build_performer_index(performers)

# -------------------------
# Build the directed graph
# -------------------------
G = nx.DiGraph()
for p in performers:
    # initialize nodes with a placeholder word count
    G.add_node(p, words=0, title=p.replace("_", " "))  # store a human-readable title as well

# Iterate saved pages (*.txt)
for txt_path in PAGES_DIR.glob("*.txt"):
    # 1) Determine the true wiki title for this file (prefer sidecar .json if exists)
    meta_path = txt_path.with_suffix(".txt.json")
    if meta_path.exists():
        meta = json.loads(meta_path.read_text(encoding="utf-8", errors="ignore"))
        final_title = meta.get("final_title") or txt_path.stem
    else:
        final_title = txt_path.stem

    # Canonicalize the source node label (normalize + spaces->underscores, keep slashes)
    src_canon = canon_title(final_title)

    # If not directly known, try variant mapping using filename stem as well
    if src_canon not in performers:
        # try a few variants from stem and final_title to find a performer
        candidates = set()
        for raw in {txt_path.stem, final_title, src_canon}:
            candidates |= variants_for_lookup(raw)
            candidates |= variants_for_lookup(canon_title(raw))
        # NEW/CHANGED: index lookup via norm_key
        src_mapped = next((index.get(norm_key(v)) for v in candidates if norm_key(v) in index), None)
        if not src_mapped:
            # couldn't map this file to a performer in your list
            continue
        src = src_mapped
    else:
        src = src_canon

    # 2) Read wikitext and set node word count
    text = txt_path.read_text(encoding="utf-8", errors="ignore")
    G.nodes[src]["words"] = count_words(text)

    # 3) Extract outgoing link targets from wikitext
    raw_links = extract_links_from_wikitext(text)

    # 4) Normalize and add edges only if target is another performer
    for tgt_raw in raw_links:
        # Prefer canonicalization path first
        tgt_canon_guess = canon_title(tgt_raw)  # canon_title() includes normalization now

        if tgt_canon_guess in performers:
            tgt = tgt_canon_guess
        else:
            # NEW/CHANGED: single consistent lookup key
            tgt = index.get(norm_key(tgt_raw))

        if tgt and tgt != src and tgt in performers:
            G.add_edge(src, tgt)

# -------------------------

# Prune isolates and keep largest weakly connected component
# -------------------------
isolates = list(nx.isolates(G))
print("Isolates:", isolates)
G.remove_nodes_from(isolates)

if G.number_of_nodes() > 0:
    lcc_nodes = max(nx.weakly_connected_components(G), key=len)
    G_lcc = G.subgraph(lcc_nodes).copy()
else:
    G_lcc = G

# -------------------------
# Inspect and save
# -------------------------
print("Nodes:", G_lcc.number_of_nodes())
print("Edges:", G_lcc.number_of_edges())

Isolates: ['Van_Zant_(band)', 'Dr._Hook_&_the_Medicine_Show', 'Jet_(Australian_band)']
Nodes: 486
Edges: 8378
AC/DC in-degree: 41
AC/DC out-degree: 34
Black_Sabbath in-degree: 74
Black_Sabbath out-degree: 64


### *Step 4: Generate GEXF, GraphML and json files*

In [6]:
import networkx as nx
from networkx.readwrite import json_graph
import json

# GEXF (good for Gephi, Cytoscape, and reloading into NetworkX)
nx.write_gexf(G_lcc, "rock_network.gexf")

# GraphML (also widely supported)
nx.write_graphml(G_lcc, "rock_network.graphml")

# Or JSON (node-link format)
with open("rock_network.json", "w", encoding="utf-8") as f:
    json.dump(json_graph.node_link_data(G_lcc), f, ensure_ascii=False)


The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.
