## Downloading The Jigsaw Unintended Bias in Toxicity Classification Dataset

In [1]:
!pip install kaggle
import pandas as pd
from google.colab import files



In [2]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"moatazhamza","key":"9cc02fddafac738685ae780de9b9ef1a"}'}

In [3]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p /content/jigsaw_data
!unzip /content/jigsaw_data/jigsaw-unintended-bias-in-toxicity-classification.zip -d /content/jigsaw_data

Downloading jigsaw-unintended-bias-in-toxicity-classification.zip to /content/jigsaw_data
 97% 702M/723M [00:04<00:00, 267MB/s]
100% 723M/723M [00:04<00:00, 185MB/s]
Archive:  /content/jigsaw_data/jigsaw-unintended-bias-in-toxicity-classification.zip
  inflating: /content/jigsaw_data/all_data.csv  
  inflating: /content/jigsaw_data/identity_individual_annotations.csv  
  inflating: /content/jigsaw_data/sample_submission.csv  
  inflating: /content/jigsaw_data/test.csv  
  inflating: /content/jigsaw_data/test_private_expanded.csv  
  inflating: /content/jigsaw_data/test_public_expanded.csv  
  inflating: /content/jigsaw_data/toxicity_individual_annotations.csv  
  inflating: /content/jigsaw_data/train.csv  


In [5]:
import pandas as pd

In [45]:
df = pd.read_csv("/content/jigsaw_data/train.csv")

In [46]:
df.sample(5)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
473761,825080,0.0,stock brokers need to think on their feet in ...,0.0,0.0,0.0,0.0,0.0,,,...,161282,approved,1,0,0,1,0,0.0,0,4
214474,504548,0.0,I find it weird that we sometimes get other pe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,147490,approved,0,0,0,1,0,0.0,10,4
1687125,6190351,0.0,The secret is in the sauce.,0.0,0.0,0.0,0.0,0.0,,,...,391427,approved,0,0,1,0,0,0.0,0,4
212310,501844,0.0,Were there any savings during the years of wea...,0.0,0.0,0.0,0.0,0.0,,,...,146547,approved,0,0,0,0,0,0.0,0,4
439,240510,0.0,Episode 1: A New Hope... the masterpiece that ...,0.0,0.0,0.0,0.0,0.0,,,...,32846,approved,0,0,0,0,0,0.0,0,4


## Cleaning

In [47]:
data = df[["comment_text", "target", "male", "female"]].copy()

In [48]:
len(data)

1804874

In [49]:
data.sample(5)

Unnamed: 0,comment_text,target,male,female
1745134,Yes I understand what he did. I should have m...,0.0,,
565248,That's akin to asking the following:\n\nCan yo...,0.0,,
552193,"It goes much deeper than that Gary, and that i...",0.0,,
1164360,"This wasn't a Trump insult--""she's a pig"" or ""...",0.6,0.0,0.1
217377,"Seems like there is tax on alcohol, cigarettes...",0.0,,


Replacing NaN values with 0.0

In [50]:
data[["male", "female"]] = data[["male", "female"]].fillna(0.0)

Dropping duplicates

In [51]:
data = data.drop_duplicates().reset_index(drop=True)

In [52]:
len(data)

1786304

Strip leading/trailing whitespace and normalize Unicode (NFC)

In [53]:
import unicodedata
def strip_and_normalize_nfc(text: str) -> str:
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    text = text.strip()
    return unicodedata.normalize("NFC", text)

Replace URLs with <URL> and user handles with <USER>

In [54]:
import re
URL_RE  = re.compile(r'((?:https?://|http?://|www\.)\S+)', flags=re.IGNORECASE)
USER_RE = re.compile(r'(?<!\w)@\w+')

def replace_urls_and_users(text: str) -> str:
    text = URL_RE.sub("<URL>", text)
    text = USER_RE.sub("<USER>", text)
    return text

Remove \n and similar, then collapse excess whitespace to single spaces

In [55]:
WS_RE = re.compile(r"\s+")

def remove_newlines_and_collapse_ws(text: str) -> str:
    # replace any whitespace run (including \n, \t) with a single space
    return WS_RE.sub(" ", text).strip()

Cleaning:

In [56]:
# 1. strip + normalize
data["comment_text"] = data["comment_text"].apply(strip_and_normalize_nfc)

# 2. replace URLs and user handles
data["comment_text"] = data["comment_text"].apply(replace_urls_and_users)

# 3. remove newlines + collapse whitespace
data["comment_text"] = data["comment_text"].apply(remove_newlines_and_collapse_ws)

# 4. drop true row duplicates across all columns
data = data.drop_duplicates().reset_index(drop=True)

data = data.rename(columns={"comment_text": "comment"})

In [57]:
data = data.dropna(subset=["comment"])                       # drop NaN
data = data[data["comment"].str.strip().astype(bool)]        # drop empty/whitespace

In [66]:
data.sample(5)

Unnamed: 0,comment,target,male,female
960907,Poster child (literally) for the deplorable an...,0.5,0.0,0.0
853896,Think about this a bit rather than donning you...,0.0,1.0,1.0
1538222,Merely donating one's time and money isn't wha...,0.0,0.0,0.0
496889,To convert the entire country into a fifth-rat...,0.0,0.0,0.0
373228,I'm thinking the Whistler bar at which he was ...,0.2,0.0,0.0


In [67]:
len(data)

1782961

In [68]:
data = data[
    ((data["male"] > 0.5) & (data["female"] == 0)) |
    ((data["female"] > 0.5) & (data["male"] == 0)) |
    ((data["female"] == 0) & (data["male"] == 0))
]

In [69]:
len(data)

1710404

In [70]:
filtered = data[
    ((data["male"] > 0.5) & (data["female"] == 0)) |
    ((data["female"] > 0.5) & (data["male"] == 0))
]

In [71]:
len(filtered)

47610

In [72]:
import pandas as pd

# define each group
g1 = data[(data["male"] > 0.5) & (data["female"] == 0) & (data["target"] > 0.5)]
g2 = data[(data["male"] > 0.5) & (data["female"] == 0) & (data["target"] <= 0.5)]
g3 = data[(data["male"] == 0) & (data["female"] > 0.5) & (data["target"] > 0.5)]
g4 = data[(data["male"] == 0) & (data["female"] > 0.5) & (data["target"] <= 0.5)]
g5 = data[(data["male"] == 0) & (data["female"] == 0) & (data["target"] > 0.5)]
g6 = data[(data["male"] == 0) & (data["female"] == 0) & (data["target"] <= 0.5)]

# sample 500 rows from each group (random_state for reproducibility)
g1_sample = g1.sample(n=500, random_state=42)
g2_sample = g2.sample(n=500, random_state=42)
g3_sample = g3.sample(n=500, random_state=42)
g4_sample = g4.sample(n=500, random_state=42)
g5_sample = g5.sample(n=500, random_state=42)
g6_sample = g6.sample(n=500, random_state=42)

# combine into final dataset
final_sample = pd.concat([g1_sample, g2_sample, g3_sample, g4_sample, g5_sample, g6_sample])

# reset index
final_sample = final_sample.reset_index(drop=True)

In [73]:
import numpy as np

final_sample["gender"] = np.select(
    [
        (final_sample["female"] > 0.5) & (final_sample["male"] == 0),
        (final_sample["male"] > 0.5) & (final_sample["female"] == 0),
        (final_sample["male"] == 0) & (final_sample["female"] == 0)
    ],
    ["female", "male", "general"],
    default="other"  # in case something doesn't match
)

In [74]:
final_sample = final_sample.drop(columns=["male", "female"])

In [75]:
len(final_sample)

3000

In [78]:
final_sample.sample(5)

Unnamed: 0,comment,target,gender
220,I don't believe Trump has the capacity for beh...,0.7,male
113,Milo is an openly gay man purports to be a Cat...,0.7,male
2242,You are the most foolish person in Canada. It ...,0.762712,general
1828,"The ""title"" of the article is wrong, misleadin...",0.0,female
1890,"Wait. Lemme get this right, Justin. Thanks to ...",0.4,female


In [79]:
final_sample.to_csv("subdataset.csv", index=False)