<a href="https://colab.research.google.com/github/mahb97/wake-quiz-qnlp/blob/main/basic_ogden_bias_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic English Bias Analysis

This notebook analyses Ogden’s Basic English vocabulary using the gender and bias annotations defined in `basic_ogden_annotations.csv`. It has three aims:

1. Summarise the distribution of semantic domains, gender codings, and bias types across the Basic list.
2. Produce a small set of tables and plots that document how Ogden’s lexicon encodes gender, family, labour, and authority.
3. Export helper artefacts (JSON mappings and filtered vocabularies) for use in the Wake Quiz QNLP Playground diagnostics and the `plain_basic` style regime.


In [None]:
# imports/paths
import pandas as pd
from pathlib import Path
import json

import matplotlib.pyplot as plt

# Location of the processed data and of the annotation CSV loaded below.
DATA_DIR = Path("data/processed")
ANN_PATH = DATA_DIR.joinpath("basic_ogden_annotations.csv")
ANN_PATH

In [None]:
# Load the annotation table and fail fast if a column used by the cells
# below is absent.
EXPECTED_COLUMNS = [
    "word",
    "section",
    "semantic_domain",
    "gender_coding",
    "bias_type",
    "severity",
    "notes",
]

ann = pd.read_csv(ANN_PATH)

missing_columns = [c for c in EXPECTED_COLUMNS if c not in ann.columns]
assert not missing_columns, f"{ANN_PATH} is missing columns: {missing_columns}"

# Only the last expression of a cell is rendered, so a bare `ann.head()` in
# the middle of the cell is silently discarded; display the preview explicitly.
display(ann.head())
ann.columns

In [None]:
# basic counts: total number of rows, then one frequency table per column
print("Total entries:", len(ann))

count_tables = [
    ("\nBy section:", "section"),
    ("\nBy semantic_domain:", "semantic_domain"),
    ("\nBy gender_coding:", "gender_coding"),
    ("\nBy severity:", "severity"),
]
for heading, column in count_tables:
    print(heading)
    display(ann[column].value_counts())

In [None]:
# bias type expansion
def explode_tags(df, col="bias_type"):
    """Return a copy of ``df`` with one row per comma-separated tag in ``col``.

    Each cell of ``col`` is converted to text and split on commas; every
    non-empty, whitespace-stripped piece becomes its own row, with all other
    columns repeated. Cells that are missing, blank, equal to "none"/"nan"
    (case-insensitive), or made up only of separators yield a single row
    tagged ``"none"``, so no input row is ever dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    col : str, default "bias_type"
        Name of the column holding the comma-separated tags.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df`` (also when ``df`` is empty), with a fresh
        0..n-1 index.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """

    def _split_tags(value):
        # Treat real missing values (None, NaN, pd.NA) as "no tag" instead of
        # relying on what their string form happens to look like.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ["none"]
        text = str(value).strip()
        if not text or text.lower() in {"none", "nan"}:
            return ["none"]
        tags = [piece.strip() for piece in text.split(",") if piece.strip()]
        # A cell such as " , " has no usable tag; keep the row as "none"
        # rather than letting it vanish from every downstream count.
        return tags or ["none"]

    # reset_index returns a new frame, so the caller's data is left untouched
    # and the index is unique before exploding.
    out = df.reset_index(drop=True)
    out[col] = out[col].map(_split_tags)
    if out.empty:
        # Nothing to explode; the column layout is still preserved.
        return out
    return out.explode(col).reset_index(drop=True)

# One row per (entry, tag) pair, so an entry with several tags is counted
# under each of them.
bias_exploded = explode_tags(ann, col="bias_type")

bias_type_counts = bias_exploded["bias_type"].value_counts()
bias_type_counts

In [None]:
# Cross-tabulation: number of entries for each (section, gender_coding) pair
pd.crosstab(index=ann["section"], columns=ann["gender_coding"])

In [None]:
# Cross-tabulation: number of entries for each (semantic_domain, gender_coding) pair
pd.crosstab(index=ann["semantic_domain"], columns=ann["gender_coding"])

In [None]:
# Cross-tabulation over the exploded frame: an entry with several bias tags
# contributes one count to each of its tags.
pd.crosstab(
    index=bias_exploded["bias_type"],
    columns=bias_exploded["gender_coding"],
)

In [None]:
# Bar chart: how many entries carry each gender coding
gender_counts = ann["gender_coding"].value_counts()

fig, ax = plt.subplots()
gender_counts.plot(kind="bar", ax=ax)
ax.set_title("Gender coding distribution in Basic English")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Bar chart: how many (entry, tag) rows fall under each bias type
bias_counts = bias_exploded["bias_type"].value_counts()

fig, ax = plt.subplots()
bias_counts.plot(kind="bar", ax=ax)
ax.set_title("Bias type distribution in Basic English")
ax.set_ylabel("Count")
plt.show()

In [None]:
# basic slicers (for now): entries in the "qualities_opposites" section whose
# gender coding contains "fem" (case-insensitive; missing codings never match)
has_fem_coding = ann["gender_coding"].str.contains("fem", case=False, na=False)
in_opposites_section = ann["section"].eq("qualities_opposites")

fem_opposites = ann[has_fem_coding & in_opposites_section]

report_columns = ["word", "section", "bias_type", "severity", "notes"]
fem_opposites[report_columns]

In [None]:
# export: word -> annotation metadata, written as strict JSON
META_FIELDS = ["section", "semantic_domain", "gender_coding", "bias_type"]

word2meta = {}

# A row without a word cannot be keyed, so it is left out of the mapping.
for record in ann.dropna(subset=["word"]).to_dict(orient="records"):
    # Empty CSV cells are NaN; json.dumps would write them as a bare `NaN`
    # token, which is not valid JSON, so they are stored as None (-> null).
    meta = {
        field: (None if pd.isna(record[field]) else record[field])
        for field in META_FIELDS
    }
    severity = record["severity"]
    # int(NaN) raises ValueError; an unscored entry is exported as null.
    meta["severity"] = None if pd.isna(severity) else int(severity)
    # If a word occurs more than once, the last row wins.
    word2meta[record["word"]] = meta

out_path = DATA_DIR / "basic_ogden_annotations.json"
# allow_nan=False turns any remaining non-finite float into an error instead
# of silently producing a file that strict JSON parsers reject.
out_path.write_text(
    json.dumps(word2meta, indent=2, allow_nan=False),
    encoding="utf-8",
)
out_path

In [None]:
# allowed vocab: unique, sorted words whose severity is at or below the cut-off
MAX_ALLOWED_SEVERITY = 1

allowed = (
    ann.loc[ann["severity"] <= MAX_ALLOWED_SEVERITY, "word"]
    .dropna()  # a NaN word would make "\n".join raise TypeError
    .astype(str)
    .drop_duplicates()
    .sort_values()
)

allowed_path = DATA_DIR / "basic_plain_allowed.txt"
allowed_path.write_text("\n".join(allowed), encoding="utf-8")

len(allowed), allowed_path