In [1]:
import pandas as pd
import re

In [2]:
def normalize_tail(tail):
    """Normalize a link target to the canonical ``/wiki/<Title>`` form.

    Parameters
    ----------
    tail : object
        Link target taken from an edge row. Strings are normalized; any
        other value (for example a missing value) is returned untouched.

    Returns
    -------
    object
        * ``tail`` unchanged when it is not a ``str``;
        * ``None`` when the link contains ``redlink=1`` or is empty once
          whitespace and URL decoration are removed -- the caller is
          expected to drop such rows;
        * otherwise a string starting with ``/wiki/`` in which spaces are
          replaced by underscores.
    """
    if not isinstance(tail, str):
        return tail

    tail = tail.strip()

    # 1. Red links are signalled with None so the caller can drop the row.
    if "redlink=1" in tail:
        return None

    # 2. Unwrap the "/w/index.php?title=<Title>&<params>" form.
    index_prefix = "/w/index.php?title="
    if index_prefix in tail:
        tail = tail.replace(index_prefix, "")
        # Query parameters (&action=edit, ...) only occur in this URL form.
        # Stripping "&..." from every input would truncate plain paths whose
        # title contains a literal ampersand, e.g. "/wiki/AT&T" -> "/wiki/AT".
        tail = re.sub(r"&.*", "", tail)

    # 3. Nothing left to name a page: without this guard an empty target
    #    would be turned into the bogus node "/wiki/".
    if not tail:
        return None

    # 4. Bare titles get the /wiki/ prefix.
    if not tail.startswith("/wiki/"):
        tail = "/wiki/" + tail

    # 5. Final clean: replace spaces with underscores.
    return tail.replace(" ", "_")



In [3]:
def clean_edges_and_expand_nodes(edges_path, nodes_path, out_edges, out_nodes):
    """Normalize edge tails and add a node for every tail not yet known.

    Reads the edge and node CSV files, rewrites each edge's tail with
    ``normalize_tail``, drops edges whose tail is unusable, appends one node
    labelled ``"UNKNOWN"`` for every remaining tail that is absent from the
    node table, and writes both tables back out as CSV. Progress and a short
    summary are printed to stdout.

    Parameters
    ----------
    edges_path : str or path-like
        Edge CSV with exactly three columns, read positionally as
        head, relation, tail (the header names in the file are overwritten).
    nodes_path : str or path-like
        Node CSV; must contain a ``name`` column.
    out_edges : str or path-like
        Destination for the cleaned edge CSV.
    out_nodes : str or path-like
        Destination for the extended node CSV.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the edge CSV does not have exactly three columns.
    KeyError
        If the node CSV has no ``name`` column.
    """
    print("Loading...")
    edges = pd.read_csv(edges_path)
    nodes = pd.read_csv(nodes_path)

    # Convert column names consistently; fail with a readable message rather
    # than pandas' generic "Length mismatch" when the file has another shape.
    if edges.shape[1] != 3:
        raise ValueError(
            f"expected 3 columns (head, relation, tail) in {edges_path}, "
            f"found {edges.shape[1]}: {list(edges.columns)}"
        )
    edges.columns = ["head", "relation", "tail"]

    print("Cleaning tail...")
    norm_tails = edges["tail"].map(normalize_tail)

    # normalize_tail returns None for rows that must be dropped and passes
    # missing values through unchanged. Both are filtered out here: an edge
    # without a tail is not usable, and NaN never compares equal to anything
    # in the known-name set, so it would be appended as a nameless node.
    usable = norm_tails.notna()
    edges_cleaned = (
        edges.loc[usable]
        .assign(tail=norm_tails[usable])
        .reset_index(drop=True)
    )

    print("Generating new nodes from tail...")
    known_names = set(nodes["name"])
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    new_nodes = [
        {"name": name, "label": "UNKNOWN"}
        for name in dict.fromkeys(edges_cleaned["tail"])
        if name not in known_names
    ]

    if new_nodes:
        nodes_extended = pd.concat(
            [nodes, pd.DataFrame(new_nodes)], ignore_index=True
        )
    else:
        # Nothing to add: avoid concatenating an empty, column-less frame.
        nodes_extended = nodes.copy()

    edges_cleaned.to_csv(out_edges, index=False, encoding="utf8")
    nodes_extended.to_csv(out_nodes, index=False, encoding="utf8")

    print("DONE")
    print(f"edges_cleaned → {out_edges}")
    print(f"nodes_extended → {out_nodes}")
    print(f"Added {len(new_nodes)} new nodes")


In [4]:
# Run the cleaning step: the raw edge list and the current node table are
# read, and the cleaned edges plus the extended node table are written.
pipeline_paths = {
    "edges_path": r"C:\Users\ADMIN\Downloads\professors\data\cleaned\edges.csv",
    "nodes_path": r"C:\Users\ADMIN\Downloads\professors\data\final\nodes.csv",
    "out_edges": r"C:\Users\ADMIN\Downloads\professors\data\final\edges_cleaned.csv",
    "out_nodes": r"C:\Users\ADMIN\Downloads\professors\data\final\nodes_extended.csv",
}
clean_edges_and_expand_nodes(**pipeline_paths)

Loading...
Cleaning tail...
Generating new nodes from tail...
DONE
edges_cleaned → C:\Users\ADMIN\Downloads\professors\data\final\edges_cleaned.csv
nodes_extended → C:\Users\ADMIN\Downloads\professors\data\final\nodes_extended.csv
Added 2058 new nodes
