# Exploring Wikipedia content moderation

The goal is to develop a blocklist of NSFW articles on English Wikipedia that we can apply to the Dynamic Wikipedia search index.
This notebook explores some of the Wikipedia resources available that we can use to accomplish this.

In [1]:
import json
from pprint import pprint
import random
import gzip
import codecs
import re
from time import sleep
from collections import defaultdict
from io import StringIO
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import requests
# import plotnine as gg
from tqdm import tqdm

pd.set_option("display.max_columns", None)
pd.set_option("display.show_dimensions", True)

tqdm.pandas()

_ = gg.theme_set(gg.theme_light())
# Better quality plots
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")


def display_pd(x, rows=None, cols=None, colwidth=None, seqitems=None, full=True):
    """Display pandas object with different display settings."""
    opts = []
    if full:
        rows = rows or -1
        cols = cols or -1
        colwidth = colwidth or -1
        seqitems = seqitems or -1
    if rows:
        rows = None if rows < 0 else rows
        opts.append("display.max_rows")
        opts.append(rows)
    if cols:
        cols = None if cols < 0 else cols
        opts.append("display.max_columns")
        opts.append(cols)
    if colwidth:
        colwidth = None if colwidth < 0 else colwidth
        if colwidth is None:
            # Adjust max seq items as well to make display unlimited
            seqitems = -1
        opts.append("display.max_colwidth")
        opts.append(colwidth)
    if seqitems:
        seqitems = None if seqitems < 0 else seqitems
        opts.append("display.max_seq_items")
        opts.append(seqitems)
    if opts:
        with pd.option_context(*opts):
            display(x)
    else:
        display(x)

We will make use of some assets availabe in the Wikimedia dumps. These should be downloaded ahead of time.

- The full ElasticSearch index for English Wikipedia:
    * <https://dumps.wikimedia.org/other/cirrussearch/20230123/enwiki-20230130-cirrussearch-content.json.gz>
    * __Note: this is a huge file, 35GB compressed__
- The list of English Wikipedia categories and their linkages in RDF (Turtle) format:
    * <https://dumps.wikimedia.org/other/categoriesrdf/20230128/enwiki-20230128-categories.ttl.gz>
    * 82 MB
    * Uses ontology defined at <https://www.mediawiki.org/ontology/ontology.owl>

In [2]:
ASSETS_DIR = Path("/Users/dzeber/data")
OUTPUT_DIR = Path("data")

FULL_INDEX = ASSETS_DIR / "enwiki-20230130-cirrussearch-content.json.gz"
CAT_FILE = ASSETS_DIR / "enwiki-20230128-categories.ttl.gz"

## Reading through the full index

The full index file is gzipped 35 GB.
In the ElasticSearch index format, entries are composed of two lines
```
{"index": {...}}
{field1: val1, ...}
```
The page information we are interested in is on the second line of each entry.


First try reading through it to get an idea of timing.
- It took 7 min 30 sec to run through the full file, with 13.2 M lines.
- There are 6.6 M articles in this index.

In [795]:
with gzip.open(FULL_INDEX, "rt") as f:
    for _ in range(4):
        print(f.readline()[:100])

{"index":{"_type":"_doc","_id":"1000"}}

{"template":["Template:Short description","Template:Pagetype","Template:Main other","Template:Short 
{"index":{"_type":"_doc","_id":"10000"}}

{"template":["Template:Wiktionary","Template:Sister project","Template:Side box","Template:Plainlist


In [None]:
%%time

nl = 0
with gzip.open(FULL_INDEX, "rt") as f:
    for line in f:
        nl += 1

In [None]:
nl

In [585]:
class IndexStream:
    """Stream the full ElasticSearch index and process each record."""
    def _apply_full_index(self, max_records=None):
        """Read through the full index line by line and apply processing to each article listing.
    
        Index lines are skipped. Optionally stop after `max_records` article records.
        """
        # If processing the whole file, use a ballpark for total lines
        # so as to get reasonable progress monitoring.
        progress_total = 2 * max_records if max_records else 13_250_000
        with gzip.open(FULL_INDEX, "rt") as fr:
            for i, line in tqdm(enumerate(fr), total=progress_total):
                if max_records and i >= max_records * 2:
                    break
                # Skip index lines
                if i % 2 == 0:
                    continue
                self._process_record(line, i)
    
    def _process_record(self, line, i):
        # What to do for each line (JSON record)
        # Call self._write_to_output(line) here if using output file
        raise NotImplementedError
    
    def _write_to_output(self, line):
        self.n_kept += 1
        self.fw.write(line)
    
    def run_with_output_file(self, output_file, max_records=None):
        self.n_kept = 0
        self.fw = gzip.open(output_file, "wt")
        try:
            self._apply_full_index(max_records=max_records)
        finally:
            self.fw.close()

In [581]:
# class TestIndexStream(IndexStream):
#     def _process_record(self, line, i):
#         self._write_to_output(line)

# tis = TestIndexStream()
# tis.run_with_output_file(OUTPUT_DIR / "test.json.gz", max_records=100)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 2858.12it/s]


## Category listing

The majority of Wikipedia pages belong to 1 or more categories. Wikipedia categories are a complex taxonomy of labels used to organize content by topic and also to flag page characteristcs, eg. those in need of maintenance.

- Pages are assigned categories by editors adding the category labels in the page source.
- Each category may contain pages or other subcategories. These can be viewed on the category's page, eg. <https://en.wikipedia.org/wiki/Category:Coffee>.
- Parent/child category relationships define a directed graph. As categories are assigned to pages by manual labeling, this graph can't be assumed to be acyclic.
- Some categories are defined to be "hidden" - these are generally related to maintenance (eg. "pages with missing information") rather than topic. Visible categories that a page belongs to are listed at the bottom of the page. Both hidden and visible categories are included in the search index (and are not distinguished).

Among the Wikipedia dumps is an RDF-formatted list of all categories. This includes counts of pages and subcategories, an indicator for hidden categories, and a listing of categories containing each one as a subcategory (ie a list of parents).

In [30]:
ALL_CAT_JSON = OUTPUT_DIR / "categories.json.gz"
CAT_LINKS_JSON = OUTPUT_DIR / "category_links.json.gz"

CAT_DF_PKL = OUTPUT_DIR / "category_data.pkl"

The RDF file is 82 MB compressed and contains 21.5M lines.

In [811]:
num_cat_lines = ! zgrep -c '^' $CAT_FILE
num_cat_lines = int(num_cat_lines[0])
print(f"{num_cat_lines:,}")

21,451,551


It contains two types of records with the properties defined in [this ontology](view-source:https://www.mediawiki.org/ontology/ontology.owl):
- category definitions with label (human-readable name) and counts of pages & subcategories

```
<https://en.wikipedia.org/wiki/Category:Coffee_preparation> a mediawiki:Category ;
    rdfs:label "Coffee preparation" ;
    mediawiki:pages "54"^^xsd:integer ;
    mediawiki:subcategories "3"^^xsd:integer .
```

- category linkages with a list of parents

```
<https://en.wikipedia.org/wiki/Category:Coffee_preparation> mediawiki:isInCategory <https://en.wikipedia.org/wiki/Category:Coffee>,
    <https://en.wikipedia.org/wiki/Category:Commons_category_link_is_on_Wikidata>,
    <https://en.wikipedia.org/wiki/Category:Food_and_drink_preparation> .
```

We use some light parsing to convert the dump to a more manageable pandas-friendly format. Because of the large size, this is more manageable than parsing as a graph using RDF libraries. Records are written to separate DFs:
- category list with page URI (canonical form), name (plain text), # pages, # subcategories, whether it is hidden
- linkage list with page URI, list of page URIs for parent categories

In [7]:
def parse_cat_index(cat_index_rdf, cat_list_json, cat_link_json):
    def parse_key(k):
        # Pull out canonical page URIs from full URLs
        # eg. '<https://en.wikipedia.org/wiki/Category:Songs>' -> 'Songs'
        return k.strip("<>").split("/Category:")[-1]

    def handle_entry(x):
        row = {}
        x = x.rstrip(" .")
        key, info = x.split(maxsplit=1)
        row["key"] = parse_key(key)
        if info.startswith("a mediawiki:Category"):
            # category property listing
            row["hidden"] = "HiddenCategory" in info
            props = info.split(" ;")
            for p in props:
                if p.startswith("rdfs:label"):
                    row["name"] = p[12:-1]
                elif p.startswith("mediawiki:pages"):
                    row["num_pages"] = int(p[17:-14])
                elif p.startswith("mediawiki:subcategories"):
                    row["num_subcats"] = int(p[25:-14])
        elif info.startswith("mediawiki:isInCategory"):
            # category membership listing
            info = info.split(maxsplit=1)[-1]
            links = info.split(">,<")
            row["memberof"] = [parse_key(x) for x in links]
        else:
            row = None

        return row

    with (
        gzip.open(cat_list_json, "wt") as fwc,
        gzip.open(cat_link_json, "wt") as fwcl,
        gzip.open(cat_index_rdf, "rt") as fr
    ):
        entry = StringIO()
        for line in tqdm(fr, total=21_500_000):
            entry.write(line.strip())
            # RDF entries span multiple lines and terminate with '.'
            if entry.getvalue().endswith("."):
                row = handle_entry(entry.getvalue())
                if row:
                    rowstr = json.dumps(row) + "\n"
                    if "memberof" in row:
                        fwcl.write(rowstr)
                    else:
                        fwc.write(rowstr)
                entry.close()
                entry = StringIO()
        entry.close()  

Takes ~1 min.

In [8]:
%%time

parse_cat_index(cat_index_rdf=CAT_FILE, cat_list_json=ALL_CAT_JSON, cat_link_json=CAT_LINKS_JSON)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████▊| 21451551/21500000 [00:49<00:00, 431383.02it/s]

CPU times: user 48.7 s, sys: 646 ms, total: 49.4 s
Wall time: 49.7 s





### Category list

In [9]:
df_cat_info = pd.read_json(ALL_CAT_JSON, lines=True).set_index("key")

In [10]:
df_cat_info.sample(5)

Unnamed: 0_level_0,hidden,name,num_pages,num_subcats
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Vanuatu_sidebar_templates,False,Vanuatu sidebar templates,2,0
Military_of_Baoding,False,Military of Baoding,1,0
1796_by_continent,False,1796 by continent,0,9
1979_establishments_in_the_Balearic_Islands,False,1979 establishments in the Balearic Islands,1,0
MacRobertson_Miller_Airlines,False,MacRobertson Miller Airlines,1,1


Listed categories should be unique.

In [11]:
assert df_cat_info.index.value_counts().unique() == 1
assert df_cat_info["name"].value_counts().unique() == 1

Total number of categories:

In [12]:
print(f"{len(df_cat_info):,}")

2,229,569


Hidden categories:

In [13]:
df_cat_info["hidden"].value_counts()

False    2196512
True       33057
Name: hidden, Length: 2, dtype: int64

Top visible & hidden categories:

In [14]:
df_cat_info.query("~hidden").sort_values("num_pages", ascending=False).head()

Unnamed: 0_level_0,hidden,name,num_pages,num_subcats
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WikiProject_Biography_articles,False,WikiProject Biography articles,1931437,9
Biography_articles_of_living_people,False,Biography articles of living people,1105332,1
Living_people,False,Living people,1056003,2
Stub-Class_biography_articles,False,Stub-Class biography articles,1035629,10
Start-Class_biography_articles,False,Start-Class biography articles,681653,10


In [15]:
df_cat_info.query("hidden").sort_values("num_pages", ascending=False).head()

Unnamed: 0_level_0,hidden,name,num_pages,num_subcats
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Articles_with_short_description,True,Articles with short description,4788961,4
Short_description_is_different_from_Wikidata,True,Short description is different from Wikidata,3386770,0
All_stub_articles,True,All stub articles,2340146,0
Noindexed_pages,True,Noindexed pages,2291847,51390
Redirects_from_moves,True,Redirects from moves,2137842,2


There are some empty categories included in the list.

In [16]:
df_cat_info.query("num_pages + num_subcats == 0")["hidden"].value_counts()

False    122586
True       3095
Name: hidden, Length: 2, dtype: int64

### Category linkage

This gives a list of parent keys for each category.

In [17]:
df_cat_links = pd.read_json(CAT_LINKS_JSON, lines=True).set_index("key")

In [18]:
display_pd(df_cat_links.sample(5))

Unnamed: 0_level_0,memberof
key,Unnamed: 1_level_1
Heinrich_C._Berann_National_Park_Service_panoramas_(featured_picture_set),"[Featured_picture_sets, Featured_picture_sets_of_the_United_States, Featured_pictures_of_Austria]"
Railway_tunnels_in_Europe_by_country,"[Container_categories, Rail_infrastructure_in_Europe_by_country, Railway_tunnels_in_Europe, Tunnels_in_Europe_by_country]"
Macedonian_female_canoeists,"[Female_canoeists, Macedonian_canoeists, Macedonian_sportswomen]"
FM-Class_Washington_articles,"[CatAutoTOC_generates_no_TOC, FM-Class_articles, Template_Category_class_with_class_parameter_matching_title, Washington_articles_by_quality]"
WikiProject_Tonga_templates,"[Template:Template_category_with_no_topic_or_description, Tonga_templates, WikiProject_Tonga, WikiProject_templates, Wikipedia_template_categories]"


A few categories are missing here. We assume these not have parents (ie. they are roots).

In [19]:
print(f"{len(df_cat_links):,}")

2,229,448


### Combined dataset

Join together the two datasets, and add in lists of visible parents and subcategories. In the process a few unknown parent categories are found. These are ignored.

In [20]:
def _fill_missing_list(x):
    if not isinstance(x, list):
        return []
    return x


def get_combined_cat_df(df_info, df_links):
    df_combined = pd.merge(df_info, df_links, left_index=True, right_index=True, how="outer")
    df_combined = df_combined.rename(columns={"memberof": "parents"})
    # `parents` column should contain lists. Fill missing values.
    df_combined["parents"] = df_combined["parents"].map(_fill_missing_list)
    
    # For each category, find the subsets of parents and children which are visible.
    # These are appended to df_combined.
    parents_flat = pd.merge(
        df_combined[["name", "hidden"]],
        df_combined["parents"].explode(),
        left_index=True,
        right_index=True
    )

    # Ignore categories with no parents
    parents_flat = parents_flat.query("parents.notna()")

    # Look up display name & hidden indicator for parents
    n_parents_before = len(parents_flat)
    parents_flat = pd.merge(
        parents_flat,
        df_combined[["hidden", "name"]],
        left_on="parents",
        right_index=True,
        suffixes=("_cat", "_parent"),
        how="inner"
    )
    n_parents_after = len(parents_flat)
    # There are a few parents not found in the main category listing.
    # These are dropped silently by the use of the inner join.
    print(f"Unknown parents: {n_parents_before - n_parents_after:,}")

    # Pull out lists of visible parents.
    # These lists contain display names, not keys.
    visible_parents = (
        parents_flat
        .query("~hidden_parent")
        .groupby("key")
        .agg({"name_parent": lambda s: s.to_list()})
    )
    # Pull out lists of visible subcategories.
    # These lists contain display names, not keys.
    visible_subcats = (
        parents_flat
        .query("~hidden_cat")
        # `parents` contains keys
        .groupby("parents")
        .agg({"name_cat": lambda s: s.to_list()})
    )

    df_combined["parents_visible"] = visible_parents["name_parent"]
    df_combined["subcats_visible"] = visible_subcats["name_cat"]
    # After left-joining in these columns, there may be some missing values introduced.
    # Replace with empty lists.
    df_combined["parents_visible"] = df_combined["parents_visible"].map(_fill_missing_list)
    df_combined["subcats_visible"] = df_combined["subcats_visible"].map(_fill_missing_list)

    return df_combined

Takes ~1 min.

In [21]:
%%time

df_cats = get_combined_cat_df(df_cat_info, df_cat_links)

Unknown parents: 30
CPU times: user 44.2 s, sys: 2.3 s, total: 46.5 s
Wall time: 47.3 s


The combined dataset should contain exactly the categories in the flat listing.

In [22]:
assert len(df_cats) == len(df_cat_info)
assert df_cats.isna().sum().sum() == 0

In [25]:
display_pd(df_cats.sample(5))

Unnamed: 0_level_0,hidden,name,num_pages,num_subcats,parents,parents_visible,subcats_visible
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Roman_legionary_fortresses_in_Spain,False,Roman legionary fortresses in Spain,3,0,"[Roman_fortifications_in_Spain, Roman_legionary_fortresses_by_country]","[Roman legionary fortresses by country, Roman fortifications in Spain]",[]
1985_NCAA_Missouri_Valley_Conference_football_season,False,1985 NCAA Missouri Valley Conference football season,0,0,[Wikipedia_soft_redirected_categories],[],[]
1973_disestablishments_in_North_Carolina,False,1973 disestablishments in North Carolina,5,0,"[1970s_disestablishments_in_North_Carolina, 1973_disestablishments_in_the_United_States, 1973_in_North_Carolina, CatAutoTOC_generates_no_TOC, Disestablishments_in_North_Carolina_by_year, Navseasoncats_year_and_decade]","[Disestablishments in North Carolina by year, 1970s disestablishments in North Carolina, 1973 disestablishments in the United States, 1973 in North Carolina]",[]
2007%E2%80%9308_in_Oceanian_basketball,False,2007–08 in Oceanian basketball,2,3,"[2007_in_Oceanian_sport, 2007_in_basketball, 2008_in_Oceanian_sport, 2008_in_basketball, Seasons_in_Oceanian_basketball]","[Seasons in Oceanian basketball, 2007 in basketball, 2007 in Oceanian sport, 2008 in basketball, 2008 in Oceanian sport]","[2007–08 in Australian basketball, 2007 in New Zealand basketball, 2008 in New Zealand basketball]"
1985_in_Sudan,False,1985 in Sudan,3,1,"[1980s_in_Sudan, 1985_by_country, 1985_in_Africa, CatAutoTOC_generates_no_TOC, Navseasoncats_year_and_decade, Years_of_the_20th_century_in_Sudan]","[Years of the 20th century in Sudan, 1980s in Sudan, 1985 by country, 1985 in Africa]",[1985 establishments in Sudan]


The number of subcategories listed in the main dataset matches the linked subcategories almost always for visible categories.

In [26]:
len(df_cats[df_cats["subcats_visible"].map(len) != df_cats["num_subcats"]].query("~hidden"))

4268

How many visible categories have no visible parents or subcategories?

In [27]:
print(f"No visible parents: {(df_cats.query('~hidden')['parents_visible'].map(len) == 0).sum():,}")
print(f"No visible subcategories: {(df_cats.query('~hidden')['subcats_visible'].map(len) == 0).sum():,}")

No visible parents: 109,118
No visible subcategories: 1,196,459


There are some singleton categories with no nonhidden linkage.
Looking at their (hidden) parents suggests the majority of these are redirects.

In [29]:
(
    df_cats[
        (df_cats["parents_visible"].map(len) == 0)
        & (df_cats["subcats_visible"].map(len) == 0)
    ]
    .query("~hidden")
    ["parents"].explode()
    .value_counts()
)

Wikipedia_soft_redirected_categories                                103577
Disambiguation_categories                                             2299
CatAutoTOC_generates_no_TOC                                            252
Commons_category_link_is_on_Wikidata                                   209
All_redirect_categories                                                130
                                                                     ...  
Wikipedia_categories_named_after_American_artists                        1
Wikipedia_categories_named_after_political_parties_in_Bangladesh         1
Wikipedia_categories_named_after_Swiss_people                            1
Wikipedia_categories_named_after_American_football_people                1
Wikipedia_categories_named_after_translators                             1
Name: parents, Length: 327, dtype: int64

Save to pickle (~350 MB).

In [31]:
df_cats.to_pickle(CAT_DF_PKL)

### Searching categories

We build tools for searching the full list of categories that will be used later on for building lists of related categories and subcategories.

This will search through the full list of categories for specific entries or those matching a regex.
It will also indicate which results are themselves subcategories of another result, for summarization purposes.
Results which are _not_ subcategories of another result are the highest-level categories on the topic of interest.

In [471]:
def find_matching_categories(regex=None, catlist=None, nonempty=False, cat_df=df_cats, return_parents=False):
    """Find a list of visible categories matching a list or regex.
    
    regex: a regex to match category names
    catlist: a list of exact category names
    nonempty: should empty categories (no pages or subcategories) be removed?
    cat_df: the combined category table
    return_parents: should the exploded table of parents also be returned?
    
    Returns a subset of the combined category table corresponding to the search terms
    with additional columns `parent_matches` indicating whether each category has a parent
    that is also in the result, and `seed` giving the portion of the category name that was matched.
    
    If `return_parents` is `True`, returns that and the exploded table of parents that was joined in.
    """
    # First find the matching rows in the category info table
    cat_df_visible = cat_df.query("~hidden")
    search_results = []
    if catlist is not None:
        search_results.append(
            cat_df_visible[cat_df_visible["name"].isin(catlist)]
            .assign(seed=lambda d: d["name"])
        )
    if regex is not None:
        re_matches = cat_df_visible["name"].str.extract(f"({regex})", flags=re.IGNORECASE)[0]
        search_results.append(cat_df_visible[re_matches.notna()].assign(seed=re_matches))
    matching_cats = pd.concat(search_results, ignore_index=False)
    matching_cats = matching_cats[~matching_cats.index.duplicated()]
    
    # A number of category labels have no member pages or subcategories
    # Prune empty categories if requested
    if nonempty:
        matching_cats = matching_cats.query("num_pages + num_subcats > 0")
    
    # For each category in the list, check whether one of its parents is also in the list
    matching_cats_parents = (
        matching_cats["parents_visible"].explode().reset_index()
        .merge(matching_cats[["name"]], left_on="parents_visible", right_on="name", how="left")
        .assign(parent_matches=lambda d: d["name"].notna())
    )
    has_matching_parent = (
        matching_cats_parents
        .groupby("key")
        .agg({"parent_matches": "any"})
    )
    matching_cats = matching_cats.join(has_matching_parent)
    
    if return_parents:
        return matching_cats, matching_cats_parents
    return matching_cats

In [None]:
find_matching_categories("^coffee\\b", ["Alcoholic coffee drinks"]).query("~parent_matches")

This starts from a set of seed categories and searches through all successive levels of subcategories.

In [534]:
def category_bfs(seed_cats=None, seed_re=None, exclude_cats=None, exclude_re=None, max_level=None, cat_df=df_cats, top_level_seeds=True):
    """Build a category list from seeds and walk the graph of category->subcategory links.
    
    seed_cats: a list of exact category names
    seed_re: a regex to match category names
    exclude_cats: a list of exact category names to exclude
    exclude_re: a regex to exclude
    max_level: if supplied, stop after this level
    cat_df: the combined category table
    top_level_seeds: if `True`, do not include seed categories which are subcategories of
        another seed category.
    
    Returns a DF with columns:
    - name: the category name discovered
    - seed: the original seed which generated the top-level category containing this one
    - parent: the immediate parent that led to this category
    - level: the number of steps from the seed category to this one
    """
    seed_results = find_matching_categories(regex=seed_re, catlist=seed_cats, cat_df=cat_df)
    if top_level_seeds:
        seed_results = seed_results.query("~parent_matches")
    curr_cat_links = seed_results

    # Apply exclusions
    if exclude_cats:
        curr_cat_links = curr_cat_links[~curr_cat_links["name"].isin(exclude_cats)]
    if exclude_re:
        with warnings.catch_warnings():
            # Ignore warning about matching groups.
            warnings.simplefilter("ignore", UserWarning)
            curr_cat_links = curr_cat_links[~curr_cat_links["name"].str.contains(exclude_re, case=False)]
    bl_rows = curr_cat_links[["name", "seed"]].assign(parent=None, level=0).reset_index(drop=True)
    i = 0

    while len(curr_cat_links) > 0:
        i += 1
        # display(bl_rows)
        if max_level and i > max_level:
            break
        print(f"Level: {i}", end="\r")
        # Next level of categories are visible subcategories of current list
        new_cats = (
            curr_cat_links["subcats_visible"].explode().to_frame()
            .join(curr_cat_links[["name", "seed"]])
            .reset_index(drop=True)
            .query("subcats_visible.notna()")
        )
        # Apply exclusions
        if exclude_cats:
            new_cats = new_cats[~new_cats["subcats_visible"].isin(exclude_cats)]
        if exclude_re:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", UserWarning)
                new_cats = new_cats[~new_cats["subcats_visible"].str.contains(exclude_re, case=False)]
        
        # Drop previously seen categories.
        bl_new = new_cats.rename(columns={"subcats_visible": "name", "name": "parent"}).assign(level=i)
        bl_rows = pd.concat([bl_rows, bl_new], ignore_index=True).drop_duplicates(subset="name")
        curr_cat_list = bl_rows.query(f"level == {i}")[["name", "seed"]]
        curr_cat_links = curr_cat_list.merge(cat_df, on="name", how="left")
    
    return bl_rows

In [None]:
category_bfs(
    ["Coffee with milk politics"],
    "^coffee\\b",
    exclude_re="county|house press|stain studio|templates",
)

## Download Wikipedia pages to build filters from

Wikipedia offers a REST API which can be used to programmatically query various types of page content, such as all subcategories and pages belonging to a category, or the content of a page.
Set up some tooling to query the API.

In [132]:
class WikiQuery:
    """Query Wikipedia's API."""
    # Documentation: https://www.mediawiki.org/wiki/API:Main_page
    WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"
    should_continue = True
    
    def _query(self, params):
        """Issue a query specified by the parameters to the API.
        
        `self._handle_response_page()` is called on the JSON result.
        If the result set spans multiple pages, this iterates through the pages
        and calls `self._handle_response_page()` on each page.
        """
        while True:
            rj = requests.get(self.WIKIPEDIA_API_URL, params=params).json()
            self._handle_response_page(rj, params)
            
            if self.should_continue and ("continue" in rj):
                params.update(rj["continue"])
                # Simple rate limiting
                sleep(0.5)
            else:
                break
    
    def _handle_response_page(self, response_json, params):
        """Action to take for each page of JSON results.
        
        Eg. accumulate results into a collection member variable.
        """
        pass
    

class CatQuery(WikiQuery):
    """Query Wikipedia's API for category members, including subcategories."""
    # Documentation: https://www.mediawiki.org/wiki/API:Categorymembers
    BASE_PARAMS = {
        "action": "query",
        "list": "categorymembers",
        "cmlimit": 500,
        "format": "json",
        "cmprop": "title|ids|type",
    }
    
    def __init__(self):
        # Mapping of pageid to page info dict
        self.pages = {}
        self.categories = {}
        # List of remaining category names to query
        self.subcat_queue = []
        
    def _handle_response_page(self, response_json, params):
        for x in response_json["query"]["categorymembers"]:
            i = x["pageid"]
            if (x["type"] == "subcat") and (i not in self.categories):
                # New subcategory. Add to list of known cats and query queue
                self.categories[i] = x
                self.subcat_queue.append(x["title"])
            elif i in self.pages:
                # Previously seen page, also belongs to subcategory.
                # Record subcategory.
                self.pages[i]["categories"].append(params["cmtitle"])
            else:
                # New page
                x["categories"] = [params["cmtitle"]]
                self.pages[i] = x
    
    def get_members(self, category):
        """Get a list of pages included in a category and its subcategories."""
        self.categories[-1] = {"title": category}
        self.subcat_queue.append(category)
        
        while len(self.subcat_queue) > 0:
            cat = self.subcat_queue.pop(0)
            params = dict(self.BASE_PARAMS)
            params["cmtitle"] = cat
            
            # print("cat", params)
            self._query(params)


class TemplateQuery(WikiQuery):
    """Query Wikipedia's API for pages containing a template."""
    # Documentation: https://www.mediawiki.org/wiki/API:Transcludedin
    BASE_PARAMS = {
        "action": "query",
        "prop": "transcludedin",
        "tilimit": 500,
        "format": "json"
    }
    
    def __init__(self):
        # Mapping of pageid to page info dict
        self.pages = []

    def _handle_response_page(self, response_json, params):
        _, results = response_json["query"]["pages"].popitem()
        self.pages.extend(results["transcludedin"])
    
    def get_pages(self, template_title):
        """Get a list of pages using the given template."""
        params = dict(self.BASE_PARAMS)
        params["titles"] = template_title

        self._query(params)


class LinksParse(WikiQuery):
    """Query Wikipedia's API for links on a page."""
    # Documentation: https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
    BASE_PARAMS = {
        "action": "parse",
        "prop": "links",
        "format": "json"
    }
    
    def __init__(self):
        # List of {"page": ..., "ns": ...}
        self.links = []
    
    def _handle_response_page(self, response_json, params):
        r = response_json["parse"]["links"]
        self.links = [{"page": x["*"], "ns": x["ns"]} for x in r]
    
    def get_page(self, page):
        params = dict(self.BASE_PARAMS)
        params["page"] = page

        self._query(params)


class Namespaces(WikiQuery):
    """Query Wikipedia's API for namespace names & IDs."""
    # Documentation: https://www.mediawiki.org/wiki/API:Siteinfo
    BASE_PARAMS = {
        "action": "query",
        "meta": "siteinfo",
        "siprop": "namespaces",
        "format": "json",
        "formatversion": "2",
    }
    
    def __init__(self):
        # Mapping of <id>: <name>
        self.ns = {}
    
    def _handle_response_page(self, response_json, params):
        r = response_json["query"]["namespaces"]
        self.ns = {int(k): v["name"] for k, v in r.items()}
    
    def get_ns(self):
        params = dict(self.BASE_PARAMS)
        self._query(params)
        # Add label for main pages.
        self.ns[0] = "Article"

### Namespaces

Pull the mapping of [namespace](https://en.wikipedia.org/wiki/Wikipedia:Namespace) ID to namespace name.

In [121]:
nslist = Namespaces()
nslist.get_ns()

In [123]:
NAMESPACES = nslist.ns

### Pull the list of bad images

MediaWiki hosts a [list of offensive images](https://en.wikipedia.org/wiki/MediaWiki:Bad_image_list) that are used across the project. We will use them for filtering Wikipedia pages.

Download the list of image links using the API.
This gives a list of filenames of the form `"File:<filename>"`, which we store as a JSON list.

In [62]:
BAD_IMG_JSON = OUTPUT_DIR / "bad_image_list.json"

In [35]:
%%time

lp = LinksParse()
lp.get_page("MediaWiki:Bad_image_list")

CPU times: user 30.1 ms, sys: 12.3 ms, total: 42.3 ms
Wall time: 263 ms


In [38]:
badimg = [x["page"] for x in lp.links if x["ns"] == 6]

In [63]:
len(badimg)

936

In [64]:
with open(BAD_IMG_JSON, "w") as f:
    json.dump(badimg, f)

In [None]:
# with open(BAD_IMG_JSON) as f:
#     badimg = json.load(f)

### List of pages & subcategories related to controversial topics

The Wikipedia category
[Wikipedia controversial topics](https://en.wikipedia.org/wiki/Category:Wikipedia_controversial_topics)
contains a categorization of pages that are controversial.
These include:

- [__Controversial topics__](https://en.wikipedia.org/wiki/Wikipedia:List_of_controversial_issues): topics which are disputed, see a lot of circular editing, or are subject to bias.
    * identified by templates like [Template:Controversial](https://en.wikipedia.org/wiki/Template:Controversial) on their Talk page
    * listed in [Category:Wikipedia controversial topics](https://en.wikipedia.org/wiki/Category:Wikipedia_controversial_topics).
- [__Contentious topics__](https://en.wikipedia.org/wiki/Wikipedia:Contentious_topics): specially-designated topics that have attracted persistent disruptive editing. Administrators are allowed to impose additional editing restrictions on these pages.
    * identified by templates or editnotices such as those belonging to [Category:Standardised Wikipedia arbitration enforcement templates](https://en.wikipedia.org/wiki/Category:Standardised_Wikipedia_arbitration_enforcement_templates)
    * listed in [Category:Wikipedia pages about contentious topics](https://en.wikipedia.org/wiki/Category:Wikipedia_pages_about_contentious_topics), a subcategory of Controversial topics.
- __Objectionable content__: content that may be graphically sexual or otherwise objectionable.
    * identified by the use of templates like [Template:Censor](https://en.wikipedia.org/wiki/Template:Censor) on their Talk page
    * listed in [Category:Wikipedia objectionable content](https://en.wikipedia.org/wiki/Category:Wikipedia_objectionable_content), a subcategory of Controversial topics.

These pages contain examples of content that we may wish to block or downweight.

We use the API to pull a full list of pages included in here or any subcategory. For many of these pages, it is the Talk page that belongs to the category rather than the article page, so searching the category list in the ES index will not surface these.

In [66]:
CONTROVERSIAL_SUBCATS_PKL = OUTPUT_DIR / "controversial_subcats.pkl"
CONTROVERSIAL_PAGES_PKL = OUTPUT_DIR / "controversial_pages.pkl"

Took 7 min.

In [133]:
%%time

cq = CatQuery()
cq.get_members("Category:Wikipedia controversial topics")

CPU times: user 1min 14s, sys: 5.48 s, total: 1min 19s
Wall time: 6min 47s


#### Subcategories

Pull the list of subcategories that were discovered nested under the top-level category.

In [134]:
cont_subcats = pd.DataFrame(cq.categories.values())

In [135]:
len(cont_subcats)

1375

In [136]:
display_pd(cont_subcats.sample(5))

Unnamed: 0,title,pageid,ns,type
527,Category:Autobiographical articles from April 2013,38968382.0,14.0,subcat
393,Category:Articles with minor POV problems from October 2015,47971935.0,14.0,subcat
765,Category:Wikipedia articles with possible conflicts of interest from July 2019,61175827.0,14.0,subcat
290,Category:Articles with a promotional tone from May 2019,60621791.0,14.0,subcat
1087,Category:Articles with weasel words from June 2018,57552494.0,14.0,subcat


In [137]:
# All are categories
assert cont_subcats["ns"].dropna().unique() == 14
assert cont_subcats["type"].dropna().unique() == "subcat"

Many of the subcategories relate to specific dates. What are the general category areas?

In [138]:
display_pd(cont_subcats["title"].str.replace(" from \w+ \d{4}", "").drop_duplicates().sort_values())



5                      Category:All Wikipedia neutral point of view disputes
336                            Category:All articles with a promotional tone
470                            Category:All articles with minor POV problems
968                                 Category:All articles with peacock terms
1306    Category:All articles with specifically marked weasel-worded phrases
163                                Category:Articles with a promotional tone
164                                Category:Articles with minor POV problems
471                                     Category:Articles with peacock terms
1143        Category:Articles with specifically marked weasel-worded phrases
472                                      Category:Articles with weasel words
473                                       Category:Articles with wikipuffery
165                                       Category:Autobiographical articles
167       Category:Pseudoscience articles under contentious topics procedure

Map page titles to canonical URI form (eg. spaces converted to `_`).

In [None]:
# cont_subcats["name"] = cont_subcats["title"].str.replace("^Category:", "", regex=True).str.replace("_", " ")

# cont_subcats = pd.merge(cont_subcats, all_category_info[["name", "key"]], how="left", on="name")

# # Handle the root category explicitly (different format)
# cont_subcats.iloc[0, -1] = cont_subcats.iloc[0]["title"]

In [139]:
cont_subcats.to_pickle(CONTROVERSIAL_SUBCATS_PKL)

#### Pages

Pull the list of pages belonging to any subcategory under the root.

In [140]:
cont_pages = pd.DataFrame(cq.pages.values())

In [141]:
len(cont_pages)

87694

In [142]:
display_pd(cont_pages.sample(5))

Unnamed: 0,pageid,ns,title,type,categories
4256,5243787,11,Template talk:Politics of Syria,page,[Category:Wikipedia articles under general sanctions]
24161,38742369,0,Lisa Giobbi,page,"[Category:Articles with a promotional tone from March 2013, Category:All articles with a promotional tone]"
58071,53747273,0,Shrirang Godbole,page,[Category:Wikipedia articles with possible conflicts of interest from September 2021]
39421,32049685,0,Cold Rock Ice Creamery,page,"[Category:Articles with a promotional tone from April 2021, Category:All articles with a promotional tone]"
10259,68563686,1,Talk:List of anti-vaccination groups,page,[Category:Wikipedia pages about contentious topics]


The pages belong to multiple [namespaces](https://en.wikipedia.org/wiki/Wikipedia:Namespace).
In many cases, the category is applied to the Talk page (odd numbered namespace) rather than the main article page (even numbered namespace).

- The majority of these are articles or article Talk pages.
- There are some templates & categories that are also included.

In [143]:
cont_pages["namespace"] = cont_pages["ns"].map(NAMESPACES)

In [170]:
cont_pages["namespace"].value_counts()

Article           74086
Talk              13138
Template            128
Template talk       104
Category talk        76
Wikipedia talk       36
User talk            36
Wikipedia            24
Draft talk           17
Module               16
User                 14
Module talk           8
File talk             6
Portal talk           4
Help talk             1
Name: namespace, Length: 15, dtype: int64

Deduce the main page title from the corresponding Talk page title.

In [163]:
cont_pages["main_title"] = (
    cont_pages["title"]
    .str.replace("^([^:]+) talk:", "\\1:", n=1, regex=True)
    .str.removeprefix("Talk:")
)

In [171]:
cont_pages.to_pickle(CONTROVERSIAL_PAGES_PKL)

In [None]:
# cont_pages = pd.read_pickle(CONT_PAGES_PKL)

In [173]:
cont_page_catcounts = cont_pages["categories"].explode().value_counts()

In [174]:
cont_page_catcounts.head(10)

Category:All articles with a promotional tone                                         25206
Category:All articles with specifically marked weasel-worded phrases                  17770
Category:Wikipedia pages about contentious topics                                      9326
Category:All Wikipedia neutral point of view disputes                                  7656
Category:All articles with peacock terms                                               3575
Category:Wikipedia controversial topics                                                3561
Category:All articles with minor POV problems                                          1022
Category:Wikipedia articles under general sanctions                                    1010
Category:Wikipedia objectionable content                                                615
Category:Articles with specifically marked weasel-worded phrases from January 2023      465
Name: categories, Length: 10, dtype: int64

In [175]:
cont_page_catcounts.loc[["Category:Wikipedia objectionable content", "Category:Wikipedia pages about contentious topics"]]

Category:Wikipedia objectionable content              615
Category:Wikipedia pages about contentious topics    9326
Name: categories, Length: 2, dtype: int64

## Pull records for potentially controversial pages from search index

We run through the full index and pull out records for pages:

- belonging to one of the controversial categories
- containing a bad image

In [176]:
# ES index listing for potentially controversial pages
CONTROVERSIAL_INDEX = OUTPUT_DIR / "cirrussearch-controversial-content.json.gz"

In [183]:
CONTROVERSIAL_TITLES = set(cont_pages.query("namespace in ('Article', 'Talk')")["main_title"])
BAD_IMAGES = set([x.removeprefix("File:") for x in badimg])

In [184]:
len(CONTROVERSIAL_TITLES), len(BAD_IMAGES)

(86252, 936)

In [303]:
class ControversialIndexing(IndexStream):
    def _is_controversial_record(self, r):
        j = json.loads(r)
        if j["title"] in CONTROVERSIAL_TITLES:
            return True
        for img in BAD_IMAGES:
            if img in j["source_text"]:
                return True
        return False

    def _process_record(self, line, i):
        if self._is_controversial_record(line):
            self._write_to_output(line)

In [186]:
contind = ControversialIndexing()

Took 4 hours 30 min. Wrote a gzipped JSON file of 1.3 GB.

In [187]:
%%time

contind.run_with_output_file(CONTROVERSIAL_INDEX)

13220896it [4:19:10, 850.16it/s]                                                                                                                         

CPU times: user 4h 15min 50s, sys: 1min 41s, total: 4h 17min 31s
Wall time: 4h 19min 11s





In [188]:
print(f"Records kept: {contind.n_kept:,}")

Records kept: 85,740


## Explore potentially controversial pages

Our goal is to develop a strategy for recognizing pages we may want to block or downweight.
We look into options for accomplishing this by exploring the record for the subset of potentially controversial pages pulled above.

First, trim down the records by removing long or irrelevant fields to facilitate loading into memory.

In [189]:
CONTROVERSIAL_INDEX_SHORT = OUTPUT_DIR / "cirrussearch-controversial-content_reduced.json.gz"
CONTROVERSIAL_DF_PKL = OUTPUT_DIR / "controversial_records.pkl"

In [209]:
OBJECTIONABLE_TITLES = (
    cont_pages[
        cont_pages["categories"].map(lambda x: "Category:Wikipedia objectionable content" in x)
    ]["main_title"].to_list()
)
CONTENTIOUS_TITLES = (
    cont_pages[
        cont_pages["categories"].map(lambda x: "Category:Wikipedia pages about contentious topics" in x)
    ]["main_title"].to_list()
)
SANCTIONS_TITLES = (
    cont_pages[
        cont_pages["categories"].map(lambda x: "Category:Wikipedia articles under general sanctions" in x)
    ]["main_title"].to_list()
)

In [210]:
len(OBJECTIONABLE_TITLES), len(CONTENTIOUS_TITLES), len(SANCTIONS_TITLES)

(615, 9326, 1010)

In [211]:
def reduce_record(r):
    FIELDS_KEPT = ["title", "opening_text", "auxiliary_text", "category", "page_id"]

    j = json.loads(r)
    result = {k: j.get(k, "") for k in FIELDS_KEPT}
    # Remove modules (Lua snippets)
    result["template"] = [x for x in j["template"] if x.startswith("Template:")]
    result["bad_img"] = False
    for img in BAD_IMAGES:
        if img in j["source_text"]:
            result["bad_img"] = True
            break
    result["controversial"] = j["title"] in CONTROVERSIAL_TITLES
    result["contentious"] = j["title"] in CONTENTIOUS_TITLES
    result["sanctions"] = j["title"] in SANCTIONS_TITLES
    result["objectionable"] = j["title"] in OBJECTIONABLE_TITLES
    
    return json.dumps(result)


def process_controversial_records(full_index, short_index):
    with gzip.open(short_index, "wt") as fw:
        with gzip.open(full_index, "rt") as fr:
            for i, line in tqdm(enumerate(fr), total=86_000):
                fw.write(reduce_record(line) + "\n")

Took 10 min. Wrote a gzipped JSON file of 125 MB.

In [212]:
%%time

process_controversial_records(CONTROVERSIAL_INDEX, CONTROVERSIAL_INDEX_SHORT)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 85740/86000 [10:02<00:01, 142.29it/s]

CPU times: user 9min 48s, sys: 7.7 s, total: 9min 56s
Wall time: 10min 2s





In [213]:
df_cont = pd.read_json(CONTROVERSIAL_INDEX_SHORT, lines=True)

In [217]:
df_cont.sample(5)

Unnamed: 0,title,opening_text,auxiliary_text,category,page_id,template,bad_img,controversial,contentious,sanctions,objectionable
65671,Barkan Industrial Park,The Barkan Industrial Park (Hebrew: איזור התעש...,[This article may be unbalanced towards certai...,"[All articles with bare URLs for citations, Ar...",5604590,"[Template:Short description, Template:Pagetype...",False,True,True,False,False
5028,Trans Misja,Trans Misja is the fourth studio album by Poli...,[This article does not cite any sources. Pleas...,"[Articles lacking sources from August 2010, Al...",12795102,"[Template:Unreferenced, Template:Ambox, Templa...",False,True,False,False,False
62669,East West (band),East West was an American Christian rock band ...,[This article includes a list of general refer...,"[Articles with short description, Short descri...",5336083,"[Template:Short description, Template:Pagetype...",False,True,False,False,False
34627,Sokikom,Sokikom (so-kee-kom) is a math program where e...,[This article has multiple issues. Please help...,[Articles with a promotional tone from June 20...,31626507,"[Template:Multiple issues, Template:Ambox, Tem...",False,True,False,False,False
71253,Ray Grainger,"Raymond ""Ray"" Grainger is the co-founder and C...",[This article may contain wording that promote...,"[CS1 maint: url-status, Articles with wikipuff...",61814373,"[Template:Puffery, Template:Ambox, Template:In...",False,True,False,False,False


In [224]:
(
    df_cont
    .groupby(["controversial", "contentious", "sanctions", "objectionable", "bad_img"])
    .size()
    .to_frame(name="count")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count
controversial,contentious,sanctions,objectionable,bad_img,Unnamed: 5_level_1
False,False,False,False,True,196
True,False,False,False,False,75530
True,False,False,False,True,7
True,False,False,True,False,465
True,False,False,True,True,76
True,False,True,False,False,790
True,False,True,True,False,20
True,True,False,False,False,8546
True,True,False,False,True,1
True,True,False,True,False,21


In [226]:
for c in ["controversial", "contentious", "sanctions", "objectionable", "bad_img"]:
    print(f"{c} count: {df_cont[c].sum():,}")

controversial count: 85,544
contentious count: 8,656
sanctions count: 897
objectionable count: 584
bad_img count: 282


Writes ~410 MB.

In [227]:
df_cont.to_pickle(CONTROVERSIAL_DF_PKL)

### Pages about objectionable topics or containing a bad image

These pages are candidates for being blocked outright, as many contain graphic sexual or violent content.

In [228]:
df_obj = df_cont.query("objectionable or bad_img")

In [229]:
len(df_obj)

789

Take a look at the top (visible) categories these pages belong to.

- Majority are related to sexuality

In [254]:
obj_cats = (
    df_obj["category"].explode()
    .value_counts()
    .reset_index(name="count")
    .rename(columns={"index": "name"})
    .assign(prop=lambda d: d["count"] / len(df_obj))
    .merge(df_cats, on="name", how="left")
)

In [256]:
# All categories are in the full category list
assert obj_cats["hidden"].isna().sum() == 0

How many unique visible categories are there?

In [269]:
len(obj_cats)

6157

In [310]:
display_pd(obj_cats.query("~hidden")[["name", "count", "prop"]].head(20))

Unnamed: 0,name,count,prop
35,Sexual acts,40,0.050697
56,Living people,25,0.031686
77,Sex positions,20,0.025349
93,English profanity,16,0.020279
102,English words,14,0.017744
105,Penis,13,0.016477
107,Sexual fetishism,13,0.016477
111,Sexual slang,12,0.015209
118,Pornography terminology,12,0.015209
126,Human sexuality,11,0.013942


For all these categories, find the visible parent categories they belong to.
Look at that distribution for a higher-level view.

- Along with sexuality, we see some categories related to political ideology and violence.

In [276]:
display_pd(
    obj_cats["parents_visible"].explode()
    .value_counts()
    .reset_index(name="count")
    .rename(columns={"index": "name"})
    .assign(prop=lambda d: d["count"] / len(df_obj))
    .head(30)
)

Unnamed: 0,name,count,prop
0,Articles with authority control information,62,0.07858
1,Births by year,58,0.073511
2,Stub categories,44,0.055767
3,Deaths by year,42,0.053232
4,Human sexuality,26,0.032953
5,Organizations designated as terrorist by designator,24,0.030418
6,Sexuality and society,23,0.029151
7,Films by year,22,0.027883
8,Songs by songwriter,21,0.026616
9,Songs by artist,17,0.021546


What does the distribution of templates look like?

- Looking across the full list of templates, these look less informative in helping us identify objectionable content.

In [312]:
df_obj["template"].explode().nunique()

3399

In [311]:
display_pd(
    df_obj["template"].explode()
    .value_counts()
    .reset_index(name="count")
    .rename(columns={"index": "name"})
    .assign(prop=lambda d: d["count"] / len(df_obj))
    .head(20)
)

Unnamed: 0,name,count,prop
0,Template:Main other,788,0.998733
1,Template:Reflist/styles.css,762,0.965779
2,Template:Reflist,762,0.965779
3,Template:Short description,715,0.90621
4,Template:Short description/lowercasecheck,715,0.90621
5,Template:SDcat,715,0.90621
6,Template:Pagetype,710,0.899873
7,Template:Cite web,675,0.855513
8,Template:Hlist/styles.css,658,0.833967
9,Template:Navbox,571,0.723701


### Pages about contentious topics

These pages are subject to disruptive editing and stricter editorial rules or restrictions.

- The set of topics is broad, making it difficult to select representative categories.

In [262]:
df_contentious = df_cont.query("contentious")

In [263]:
len(df_contentious)

8656

Take a look at the top (visible) categories these pages belong to.

In [264]:
contentious_cats = (
    df_contentious["category"].explode()
    .value_counts()
    .reset_index(name="count")
    .rename(columns={"index": "name"})
    .assign(prop=lambda d: d["count"] / len(df_obj))
    .merge(df_cats, on="name", how="left")
)

In [270]:
len(contentious_cats)

41064

There are a few categories not appearing in the full category list. Just consider these to be visible categories.

In [271]:
print(f"Unknown category: {contentious_cats['hidden'].isna().sum()}")

Unknown category: 15


In [None]:
# contentious_cats.query("hidden.isna()")[["name", "count"]]

In [272]:
contentious_cats["hidden"] = contentious_cats["hidden"].fillna(False).astype(bool)

Topics that emerge are:
    
- Conflicts, especially in the Middle East
- COVID-19
- Influential figures, such as politicians & writers
- Members of the LGBTQ community

In [314]:
display_pd(contentious_cats.query("~hidden")[["name", "count", "prop"]].head(20))

Unnamed: 0,name,count,prop
9,Living people,1616,2.048162
45,Municipalities of the State of Palestine,377,0.47782
47,Arab villages depopulated during the 1948 Arab–Israeli War,342,0.43346
56,Villages in the West Bank,278,0.352345
71,COVID-19 pandemic by country,213,0.269962
72,Transgender women,213,0.269962
78,21st-century American politicians,197,0.249683
93,21st-century LGBT people,164,0.207858
126,Israeli settlements in the West Bank,100,0.126743
138,American conspiracy theorists,95,0.120406


For all these categories, find the visible parent categories they belong to.
Look at that distribution for a higher-level view.

In [278]:
display_pd(
    contentious_cats["parents_visible"].explode()
    .value_counts()
    .reset_index(name="count")
    .rename(columns={"index": "name"})
    .assign(prop=lambda d: d["count"] / len(df_obj))
    .head(30)
)

Unnamed: 0,name,count,prop
0,Stub categories,267,0.338403
1,2020 by country,242,0.306717
2,2021 by country,235,0.297845
3,Treaties by country,199,0.252218
4,Births by year,194,0.245881
5,COVID-19 pandemic by country,189,0.239544
6,Disease outbreaks by country,160,0.202788
7,Deaths by year,149,0.188847
8,Wars by country,88,0.111534
9,2022 by country,88,0.111534


Similarly, the distribution of templates is not very informative for our purposes.

In [315]:
df_contentious["template"].explode().nunique()

13698

In [281]:
display_pd(df_contentious["template"].explode().value_counts()[:20])

Template:Main other                          8639
Template:Reflist                             8422
Template:Reflist/styles.css                  8422
Template:Short description                   7703
Template:Short description/lowercasecheck    7703
Template:SDcat                               7703
Template:Pagetype                            7652
Template:Cite web                            7463
Template:Hlist/styles.css                    7409
Template:Cite news                           6430
Template:Navbox                              4961
Template:Ns has subpages                     4923
Template:FULLROOTPAGENAME                    4923
Template:Dated maintenance category          4923
Template:DMCA                                4917
Template:Cite book                           4689
Template:Yesno                               4579
Template:Template other                      4112
Template:Category handler                    4072
Template:Plainlist/styles.css                4048


Take a look at top-level categories in the list.

In [381]:
contentious_cats_info = find_matching_categories(catlist=contentious_cats["name"], nonempty=True)

In [391]:
display_pd(
    contentious_cats_info.query("~parent_matches").sample(20)
    .reset_index(drop=True)[["name", "subcats_visible"]]
)

Unnamed: 0,name,subcats_visible
0,Cuyahoga Community College alumni,[Tri-C Triceratops baseball players]
1,Nicki Minaj,"[Nicki Minaj album covers, Nicki Minaj albums, Nicki Minaj audio samples, Nicki Minaj concert tours, Nicki Minaj songs, Songs written by Nicki Minaj]"
2,Universiade gold medalists in athletics (track and field),[]
3,Opposition Platform — For Life politicians,[]
4,"21st century in Rochester, New York",[]
5,Canadian social commentators,[]
6,2020 in North Dakota,"[2020 North Dakota elections, 2020 disestablishments in North Dakota, 2020 in sports in North Dakota]"
7,Colorado law,"[Cannabis law in Colorado, Capital punishment in Colorado, Colorado General Assembly, Colorado ballot measures, Colorado state case law, Colorado state courts, Colorado statutes, Constitution of Colorado, Courthouses in Colorado, Crime in Colorado, Criminals from Colorado, LGBT rights in Colorado, Law enforcement in Colorado, Law firms based in Colorado, Law schools in Colorado, Legal history of Colorado]"
8,Argentine television personalities,"[Argentine television chefs, Argentine television journalists, Argentine television presenters, Argentine television talk show hosts, Participants in Argentine reality television series]"
9,Members of the Assembly of Experts,[Speakers of the Assembly of Experts]


## Develop list of categories to block

Here we put together a list of categories to block by inspecting the categories associated with objectionable pages.

Looking through the categories, we observe some high-level topics emerging.
For these categories, we explored subcategories and parent categories to get an idea of the level of generality to target.

Based on these observations, we identify the following topics to consider blocking.
For each topic, we build a list of Wikipedia categories, and we plan to block pages that belong to any of these categories.

In order to make the category listing comprehensive and reproducible, the list is built as follows.
For each topic, we identify a collection of seed terms.
The list will contain all categories which match the seed terms, as well as all of their subcategories, subject to possible exclusions described below. (In some cases, we may cut off subcategories at a certain depth, to avoid branching off into related topics which maybe should not be blocked).

The seed terms and exclusions were identified by experimenting with different search terms in `find_matching_categories()` and `category_bfs()`.

__Topics to consider blocking:__

- Sexuality
    * sex acts, crime or violence-related
    * not health-related or societal
- Pornography/erotica
    * not anti-pornography
- Profanity
- Pejoratives & slurs
- Cruelty
    * torture, child abuse
    * not anti-abuse, works of fiction
- Hateful ideology

The following topics may also be considered objectionable, but may not be good candidates for blocking due to their breadth or historical context. We can consider downweighting them instead:

- Prejudice/discrimination
- Violence
    * including terrorism, genocide, mass shootings, targeted
- Abuse
    * including bullying, harassment

In [619]:
BLOCKLIST_CSV = "blocklist_cats.csv"

In [536]:
seed_categories = [
    "Profanity",
    # Cruelty:
    "Torture",
    "Child abuse",
    # Sexuality:
    "Sex manuals",
    "Incest",
    "Sexual emotions",
    "Prostitution",
    "Sexuality-related lists",
    "Machine sex",
    "Orgasm",
    "Sexual fetishism",
    "Pornography",
    "Paraphilias",
    "Sexual slang",
    "Personal lubricants",
    "Sexual fantasies",
    "Sex toys",
    "Sexual violence",
    "Sexual acts",
]

seed_categories_level_restricted = {
    # Stop searching after level 4
    4: ["White nationalism"],
}

category_matching_regexes = [
    # regex to catch separate categories for different groups
    "\\bslurs",
    # regex to catch separate categories for different groups
    "\\bpejorative",
    "\\bpornograph",
    "\\berotic",
]

cat_match_re = "|".join(category_matching_regexes)

exclude_categories = [
    # Profanity exclusions:
    # mainly TV shows and movies
    "Works about profanity",
    # Torture exclusions:
    # mainly historical
    "Castrated people",
    "Crucifixion",
    # people, prosthetics, sports
    "Amputations",
    "Fictional torturers",
    "Works about torture",
    # Child abuse exclusions:
    "Child abuse law",
    "Child abuse-related organizations",
    "Child labour",
    "School bullying",
    "Displacement of indigenous children",
    "Works about child abuse",
    "Anti-pedophile activism",
    "Feral children",
    "Fiction about child murder",
    "Filicides",
    "QAnon",
    "Fictional murderers of children",
    # mainly historical/mythological
    "Child sacrifice",
    # Slurs/Pejorative exclusions:
    # cultural/historical
    "Barbarians",
    # White nationalism exclusions:
    "Works about apartheid",
    # Erotica exclusions:
    "One Thousand and One Nights",
    "Erotic Liquid Culture members",
    "Swedish Erotica members",
    # Sexuality exclusions:
    "Aphrodite",
    "Game of Thrones",
]

exclude_regexes = [
    # Torture exclusions:
    "\\banti-torture\\b",
    "\\btorture victims?\\b",
    # Slurs/Pejorative exclusions:
    # cultural/historical
    "\\beskimo",
    # Pornography exclusions:
    "\\bNew Pornographers\\b",
    "\\banti\W(child )?porn",
    "\\blaw.+\\bporn",
    "\\bporn.+\\blaw",
    # non-porn
    "\\bsex comedy\\b",
    # Sexuality exclusions:
    "\\blaw.+\\bprostitut",
    "\\bprostitut.+\\blaw",
    # "\\bfiction",
    # "\\blitera",
    "\\banti\Wprostitut",
    "\\bmytholog",
]

exclude_re = "|".join(exclude_regexes)

Given this seed list, what is the full list of categories that would be blocked?

In [546]:
def build_blocklist(seed_cats, seed_re, exclude_cats, exclude_re, seed_cats_level):
    bl_rows = category_bfs(
        seed_cats=seed_cats,
        seed_re=seed_re,
        exclude_cats=exclude_cats,
        exclude_re=exclude_re,
    )
    
    bl_rows_level = pd.DataFrame()
    for max_level, cats in seed_cats_level.items():
        result = category_bfs(seed_cats=cats, exclude_cats=exclude_cats, exclude_re=exclude_re, max_level=max_level)
        result["max_level"] = max_level
        bl_rows_level = pd.concat([bl_rows_level, result], ignore_index=True)
        bl_rows_level = bl_rows_level.drop_duplicates(subset="name")
    
    bl_final = pd.concat([bl_rows, bl_rows_level], ignore_index=True)
    bl_final = bl_final.drop_duplicates(subset="name")
    
    assert bl_final["name"].value_counts().unique() == 1
    
    return bl_final

Took ~15 sec.

In [547]:
%%time

blocklist_final = build_blocklist(
    seed_cats=seed_categories,
    seed_re=cat_match_re,
    exclude_cats=exclude_categories,
    exclude_re=exclude_re,
    seed_cats_level=seed_categories_level_restricted,
)

CPU times: user 12.2 s, sys: 1.29 s, total: 13.5 s
Wall time: 13.9 s


In [548]:
len(blocklist_final)

3385

In [550]:
blocklist_final.seed.value_counts()

Erotic                     1058
Prostitution                548
Sexual violence             539
Child abuse                 391
White nationalism           298
Torture                     219
Pornograph                   74
Paraphilias                  67
Sexual acts                  40
Pejorative                   33
Sexual slang                 31
pornograph                   24
Sex toys                     14
Profanity                    11
Sexual emotions              10
Sexuality-related lists       9
Orgasm                        5
erotic                        4
slurs                         4
Sex manuals                   3
Personal lubricants           2
Slurs                         1
Name: seed, Length: 22, dtype: int64

In [618]:
blocklist_final.to_csv(BLOCKLIST_CSV, index=False)

How many articles are covered by this blocklist?
This counts articles belonging to each subcategory. Since pages can belong to multiple categories, this will be an overestimate.

* at most 64,000 pages (~1% of English Wikipedia)

In [563]:
blocklist_cat_info = df_cats.merge(blocklist_final, on="name", how="inner")

In [564]:
print(f"Num pages: {blocklist_cat_info['num_pages'].sum():,}")

Num pages: 64,334


How many categories are non-empty?

In [565]:
len(blocklist_cat_info.query("num_pages + num_subcats > 0"))

3312

In [567]:
blocklist_final.sample(30)

Unnamed: 0,name,seed,parent,level,max_level
966,Alt porn,Erotic,Pornography by genre,3,
1546,Egyptian erotic artists,Erotic,Erotic artists by nationality,4,
1912,Hentai creators,Erotic,Hentai,4,
115,Prostitution by country,Prostitution,Prostitution,1,
1746,Actors in gay pornographic films,Erotic,Gay male pornography,4,
3090,Boer nationalism,White nationalism,White nationalism,1,4.0
66,Child abandonment,Child abuse,Child abuse,1,
596,Torture in Syria,Torture,Torture by country,2,
1428,Filicides in the United Kingdom,Child abuse,British murderers of children,4,
1243,Rape in Bangladesh,Sexual violence,Rape by country,3,


## Pull records matching against the blocklist

In [568]:
BLOCKED_PAGES_INDEX = OUTPUT_DIR / "cirrussearch-content-blocked.json.gz"

In [576]:
class BlockedIndexing(IndexStream):
    def __init__(self, blocked_cats):
        self.blocked_cats = pd.Series(blocked_cats)

    def _process_record(self, line, i):
        j = json.loads(line)
        if self.blocked_cats.isin(j["category"]).any():
            self._write_to_output(line)

Took ~30 min.

In [577]:
%%time

bi = BlockedIndexing(blocklist_final["name"])
bi.run_with_output_file(BLOCKED_PAGES_INDEX)

13220896it [31:54, 6905.67it/s]                                                                                                                          

CPU times: user 30min 19s, sys: 34.1 s, total: 30min 53s
Wall time: 31min 54s





In [578]:
print(f"Records kept: {bi.n_kept:,}")

Records kept: 30,798


In [782]:
df_bl = pd.read_json(BLOCKED_PAGES_INDEX, lines=True)

# Appendix

## Generate a sample from the full index

Pull a 1% sample that is more manageable for exploratory analysis, ignoring the initial `index` objects for each entry.

In [582]:
SAMPLE_INDEX = OUTPUT_DIR / "enwiki-20230123-cirrussearch-sample.json.gz"

In [586]:
class IndexSampler(IndexStream):
    def _process_record(self, line, i):
        if random.random() < 0.01:
            self._write_to_output(line)

Took ~10 min. Wrote a gzipped JSON of 400 MB.

In [587]:
%%time

idxs = IndexSampler()
idxs.run_with_output_file(SAMPLE_INDEX)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████▊| 13220896/13250000 [09:11<00:01, 23953.17it/s]

CPU times: user 8min 23s, sys: 19.6 s, total: 8min 42s
Wall time: 9min 11s





In [588]:
print(f"Records kept: {idxs.n_kept:,}")

Records kept: 66,621


## Download bad word lists to consider for filtering

We will try out using some lists of "bad words" to detect objectionable Wikipedia content.

These lists are quite broad and contain a number of terms that are either generally not "bad" or whose "badness" either depends on context. If we use this approach, we would need to curate a much more targeted list.

In [589]:
WORDLISTS_JSON = OUTPUT_DIR / "word_lists.json"

# Older word list, pretty widly used for filtering online comments
LDNOOBW_LIST_SRC = "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
# Word list from CMU research
CMU_LIST_SRC = "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
# Word list used by Github Copilot
COPILOT_LIST_SRC = "https://moyix.net/~moyix/copilot_slurs_rot13.txt"

In [590]:
ldnoobw_list = requests.get(LDNOOBW_LIST_SRC).text.splitlines()

In [591]:
cmu_list = requests.get(CMU_LIST_SRC).text.strip().splitlines()

In [607]:
copilot_list = requests.get(COPILOT_LIST_SRC).text.split("===")[0]
copilot_list = codecs.decode(copilot_list, "rot_13").splitlines()
copilot_list = [x for x in sorted(copilot_list) if not x.startswith("<")]

In [608]:
wordlists = {
    "ldnoobw": ldnoobw_list,
    "cmu": cmu_list,
    "copilot": copilot_list,
}

In [609]:
with open(WORDLISTS_JSON, "w") as f:
    json.dump(wordlists, f)

In [610]:
{k: len(v) for k, v in wordlists.items()}

{'ldnoobw': 403, 'cmu': 1383, 'copilot': 1023}

Intersection & union sizes:

In [611]:
len(set(wordlists["ldnoobw"]) & set(wordlists["cmu"]) & set(wordlists["copilot"]))

50

In [612]:
print(len(set(wordlists["ldnoobw"]) & set(wordlists["cmu"])))
print(len(set(wordlists["ldnoobw"]) & set(wordlists["copilot"])))
print(len(set(wordlists["cmu"]) & set(wordlists["copilot"])))

133
66
383


In [613]:
len(set(wordlists["ldnoobw"]) | set(wordlists["cmu"]) | set(wordlists["copilot"]))

2277

In [614]:
ALL_BADWORDS = set(wordlists["ldnoobw"]) | set(wordlists["cmu"]) | set(wordlists["copilot"])

In [615]:
ALL_BADWORDS_RE = re.compile("(" + "|".join([fr"\b{re.escape(w)}\b" for w in ALL_BADWORDS]) + ")")