## Downloading The Jigsaw Unintended Bias in Toxicity Classification Dataset

In [1]:
!pip install kaggle
import pandas as pd
from google.colab import files



In [2]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"moatazhamza","key":"9cc02fddafac738685ae780de9b9ef1a"}'}

In [3]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p /content/jigsaw_data
!unzip /content/jigsaw_data/jigsaw-unintended-bias-in-toxicity-classification.zip -d /content/jigsaw_data

Downloading jigsaw-unintended-bias-in-toxicity-classification.zip to /content/jigsaw_data
 97% 702M/723M [00:04<00:00, 267MB/s]
100% 723M/723M [00:04<00:00, 185MB/s]
Archive:  /content/jigsaw_data/jigsaw-unintended-bias-in-toxicity-classification.zip
  inflating: /content/jigsaw_data/all_data.csv  
  inflating: /content/jigsaw_data/identity_individual_annotations.csv  
  inflating: /content/jigsaw_data/sample_submission.csv  
  inflating: /content/jigsaw_data/test.csv  
  inflating: /content/jigsaw_data/test_private_expanded.csv  
  inflating: /content/jigsaw_data/test_public_expanded.csv  
  inflating: /content/jigsaw_data/toxicity_individual_annotations.csv  
  inflating: /content/jigsaw_data/train.csv  


In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("/content/jigsaw_data/train.csv")

In [7]:
df.sample(5)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
1513627,5973562,0.0,Call it climate change or not- doesn't matter....,0.0,0.0,0.0,0.0,0.0,,,...,379073,approved,0,0,0,2,0,0.0,0,4
1719366,6229928,0.0,Zoolander's affirmative action agenda was evid...,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,...,393516,approved,0,0,0,5,1,0.0,4,6
205216,492901,0.0,Had those 8000 extra jobs never been created i...,0.0,0.0,0.0,0.0,0.0,,,...,146691,approved,0,0,0,0,0,0.0,0,4
1253564,5647393,0.0,Name a government program that works as promised?,0.0,0.0,0.0,0.0,0.0,,,...,358590,approved,0,0,0,0,1,0.0,0,4
84204,345644,0.7,You'll just get shot. Crooks don't follow the...,0.1,0.0,0.0,0.2,0.7,,,...,138471,approved,0,0,0,1,0,0.0,0,10


## Cleaning

In [8]:
data = df[["comment_text", "target", "male", "female"]].copy()

In [9]:
len(data)

1804874

In [10]:
data.sample(5)

Unnamed: 0,comment_text,target,male,female
979347,"As an addendum.\n""The Iraq Resolution (formall...",0.2,,
945975,tragic mistake?\n\ncrashing a plane into a mou...,0.0,,
906937,These people could have had unpleasant experie...,0.0,,
1307887,"LOL, here you are, just worried about who leak...",0.0,,
650939,Make tomatoes and cheeseburgers 🍔 illegal beca...,0.166667,,


Replacing NaN values with 0.0

In [11]:
data[["male", "female"]] = data[["male", "female"]].fillna(0.0)

Dropping duplicates

In [12]:
data = data.drop_duplicates().reset_index(drop=True)

In [13]:
len(data)

1786304

Strip leading/trailing whitespace and normalize Unicode (NFC)

In [14]:
import unicodedata
def strip_and_normalize_nfc(text: str) -> str:
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    text = text.strip()
    return unicodedata.normalize("NFC", text)

Replace URLs with <URL> and user handles with <USER>

In [15]:
import re
URL_RE  = re.compile(r'((?:https?://|http?://|www\.)\S+)', flags=re.IGNORECASE)
USER_RE = re.compile(r'(?<!\w)@\w+')

def replace_urls_and_users(text: str) -> str:
    text = URL_RE.sub("<URL>", text)
    text = USER_RE.sub("<USER>", text)
    return text

Remove \n and similar, then collapse excess whitespace to single spaces

In [16]:
WS_RE = re.compile(r"\s+")

def remove_newlines_and_collapse_ws(text: str) -> str:
    # replace any whitespace run (including \n, \t) with a single space
    return WS_RE.sub(" ", text).strip()

Cleaning:

In [17]:
# 1. strip + normalize
data["comment_text"] = data["comment_text"].apply(strip_and_normalize_nfc)

# 2. replace URLs and user handles
data["comment_text"] = data["comment_text"].apply(replace_urls_and_users)

# 3. remove newlines + collapse whitespace
data["comment_text"] = data["comment_text"].apply(remove_newlines_and_collapse_ws)

# 4. drop true row duplicates across all columns
data = data.drop_duplicates().reset_index(drop=True)

data = data.rename(columns={"comment_text": "comment"})

In [18]:
data = data.dropna(subset=["comment"])                       # drop NaN
data = data[data["comment"].str.strip().astype(bool)]        # drop empty/whitespace

In [19]:
data.sample(5)

Unnamed: 0,comment,target,male,female
209262,"Please, just try to understand. You and others...",0.166667,0.0,0.0
1646672,I seem to recall that Mike Duffy indeed compli...,0.0,0.0,0.0
337905,Why would anyone in their right mind want to b...,0.833333,0.0,0.0
1644739,It's pointless. Liberals have their heads so f...,0.0,0.0,0.0
1749994,We are all proud of our boys. We are just sick...,0.1,0.166667,0.0


In [20]:
len(data)

1782961

In [21]:
data = data[
    ((data["male"] > 0.5) & (data["female"] == 0)) |
    ((data["female"] > 0.5) & (data["male"] == 0)) |
    ((data["female"] == 0) & (data["male"] == 0))
]

In [22]:
len(data)

1710404

In [23]:
filtered = data[
    ((data["male"] > 0.5) & (data["female"] == 0)) |
    ((data["female"] > 0.5) & (data["male"] == 0))
]

In [24]:
len(filtered)

47610

In [25]:
import pandas as pd

# define each group
g1 = data[(data["male"] > 0.5) & (data["female"] == 0) & (data["target"] > 0.5)]
g2 = data[(data["male"] > 0.5) & (data["female"] == 0) & (data["target"] <= 0.5)]
g3 = data[(data["male"] == 0) & (data["female"] > 0.5) & (data["target"] > 0.5)]
g4 = data[(data["male"] == 0) & (data["female"] > 0.5) & (data["target"] <= 0.5)]
g5 = data[(data["male"] == 0) & (data["female"] == 0) & (data["target"] > 0.5)]
g6 = data[(data["male"] == 0) & (data["female"] == 0) & (data["target"] <= 0.5)]

# sample 500 rows from each group (random_state for reproducibility)
g1_sample = g1.sample(n=500, random_state=42)
g2_sample = g2.sample(n=500, random_state=42)
g3_sample = g3.sample(n=500, random_state=42)
g4_sample = g4.sample(n=500, random_state=42)
g5_sample = g5.sample(n=500, random_state=42)
g6_sample = g6.sample(n=500, random_state=42)

# combine into final dataset
final_sample = pd.concat([g1_sample, g2_sample, g3_sample, g4_sample, g5_sample, g6_sample])

# reset index
final_sample = final_sample.reset_index(drop=True)

In [26]:
len(final_sample)

3000

In [29]:
final_sample.sample(5)

Unnamed: 0,comment,target,male,female
1560,"But, but, but......keeping abortions unavailab...",0.2,0.0,1.0
2613,Pg 1 What we have here as Prime Minister is on...,0.4,0.0,0.0
453,"Right. A black man takes a knee and ""YOU'RE FI...",0.6,0.833333,0.0
655,"""those of you who have never felt unsafe while...",0.0,1.0,0.0
226,I know what the solution is. Why don't we just...,0.964806,1.0,0.0


In [30]:
final_sample.to_csv("subset.csv", index=False)