In [44]:
import requests
import difflib
import csv
import re
from collections import defaultdict

# --- Configuration -----------------------------------------------------------

# MediaWiki Action API endpoint for English Wikipedia.
URL = "https://en.wikipedia.org/w/api.php"

# Sent with every request so the API operator can identify this client.
HEADERS = {"User-Agent": "WikiEditStudy/1.0 (student project)"}

# Query describing which revisions to download: the content of the "main" slot
# for revisions of one article between rvstart and rvend. With rvdir="older"
# the MediaWiki Revisions API lists results newest-first, which is why rvstart
# is the later of the two timestamps.
PARAMS = dict(
    action="query",
    format="json",
    prop="revisions",
    titles="Legal status of transgender people",
    rvprop="content",
    rvslots="main",
    rvstart="2023-12-31T23:59:59Z",
    rvend="2022-01-01T00:00:00Z",
    rvlimit="max",
    rvdir="older",
)


Saved wiki_word_summary.csv


In [None]:
# tokenizer

def tokenize(text):
    """Return the lowercase ASCII-letter words found in ``text``, in order.

    The input is lowercased first. A word is a run of the letters a-z that is
    delimited by word boundaries on both sides, so a letter run that touches a
    digit or an underscore (e.g. ``abc1`` or ``foo_bar``) produces no token.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b[a-zA-Z]+\b", lowered)]

In [None]:
# Revision fetcher: downloads the article's revision texts from the MediaWiki API

def get_revisions():
    """Download the text of every revision selected by ``PARAMS``.

    Requests are repeated, merging in the API's ``continue`` token each time,
    until the response no longer carries one.

    Returns:
        list[str]: The non-empty revision texts, in the order the API
        returned them. Revisions whose main slot has no text are skipped.
        An empty list is returned when the page has no matching revisions.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not respond in time.
        RuntimeError: If the JSON payload contains an API-level ``error``.
    """
    revisions = []
    cont = {}

    while True:
        params = PARAMS.copy()
        params.update(cont)

        # Without a timeout a stalled connection would hang the notebook
        # indefinitely; 30 seconds is generous for a single API page.
        r = requests.get(URL, params=params, headers=HEADERS, timeout=30)
        r.raise_for_status()
        data = r.json()

        # The API reports problems (bad parameters, rate limiting, ...) inside
        # a 200 response, so surface them instead of failing on a missing key.
        if "error" in data:
            raise RuntimeError(f"MediaWiki API error: {data['error']}")

        # Only one title is requested, so only the first page entry is read.
        pages = data.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), {})

        # A missing page, or one with no revisions in range, has no
        # "revisions" key at all.
        for rev in page.get("revisions", []):
            slot = rev.get("slots", {}).get("main", {})
            # The text is looked up under "*" first, then under "content".
            text = slot.get("*", slot.get("content", ""))
            if text:
                revisions.append(text)

        if "continue" in data:
            cont = data["continue"]
        else:
            break

    return revisions

In [None]:
def main():
    """Count word-level additions and removals across revisions and save a CSV.

    Fetches the revision texts with ``get_revisions()``, diffs the token list
    of each revision against that of the revision that followed it in time,
    and tallies how often each word shows up as added or removed. The result
    is written to ``wiki_word_summary.csv`` with the columns ``word``,
    ``times_added``, ``times_removed`` and ``net_change``, sorted by
    ``times_added`` descending (ties broken alphabetically by word).
    """
    revisions = get_revisions()

    # With rvdir="older" (the API default) the list is newest-first. Walk it
    # oldest -> newest so that "+" lines in the diff really are words the
    # later edit introduced and "-" lines are words it took out.
    newest_first = PARAMS.get("rvdir", "older") != "newer"
    chronological = revisions[::-1] if newest_first else revisions

    added_counts = defaultdict(int)
    removed_counts = defaultdict(int)

    # Tokenise each revision once and carry it forward as the next pair's
    # "old" side.
    old_words = None
    for text in chronological:
        new_words = tokenize(text)
        if old_words is not None:
            for d in difflib.ndiff(old_words, new_words):
                # ndiff prefixes every entry with a two-character code;
                # "  " (unchanged) and "? " (hint) entries are ignored.
                word = d[2:]
                if d.startswith("+ "):
                    added_counts[word] += 1
                elif d.startswith("- "):
                    removed_counts[word] += 1
        old_words = new_words

    # Merge counts and compute net
    all_words = set(added_counts) | set(removed_counts)
    results = []

    for word in all_words:
        added = added_counts.get(word, 0)
        removed = removed_counts.get(word, 0)
        results.append({
            "word": word,
            "times_added": added,
            "times_removed": removed,
            "net_change": added - removed
        })

    # Sort by times_added descending; the word is a tie-breaker so the file is
    # identical from run to run instead of depending on set iteration order.
    results.sort(key=lambda x: (-x["times_added"], x["word"]))

    # Write CSV
    with open("wiki_word_summary.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["word", "times_added", "times_removed", "net_change"])
        writer.writeheader()
        writer.writerows(results)

    print("Saved wiki_word_summary.csv")


if __name__ == "__main__":
    main()
