In [1]:
import pandas as pd
from pathlib import Path
from datasets import load_from_disk

In [2]:
# Columns kept for the rest of the notebook.
cols = ["split", "id", "text", "labels", "toxicity"]

# Read the full CSV, rename the comment column to "text", keep only the
# selected columns, and discard rows with a missing text or label.
df = (
    pd.read_csv(Path("../data/raw/jigsaw-unintended-bias-in-toxicity-classification/") / "all_data.csv")
    .rename(columns={"comment_text": "text"})
    .loc[:, cols]
    .copy()
    .dropna(subset=["text", "labels"])
)

1999516
1999515


In [4]:
# Give every distinct comment text a shared integer id (exact string match).
df["unique_id"] = df.groupby("text").ngroup().astype(int)

# Average the toxicity score over all rows that share the same text.
df["avg_toxicity"] = df.groupby("unique_id")["toxicity"].transform("mean")

# Remove duplicates within each split.
df = df.drop_duplicates(subset=["split", "text"])

# Remove duplicates across splits. Sorting by split places "test" before
# "train", so the default keep="first" retains the test copy of a shared text.
ddf = df.sort_values(["unique_id", "split"]).drop_duplicates(subset=["unique_id"])

# Check that duplicates are removed from training rather than testing.
assert df["split"].value_counts()["test"] == ddf["split"].value_counts()["test"], (
    "cross-split deduplication dropped rows from the test split"
)

# Drop the helper columns that are no longer needed.
ddf = ddf.drop(columns=["toxicity", "unique_id"])

# Binarize labels from the averaged score. The column is read from `ddf`
# itself (not `df`) so the result does not rely on implicit index alignment
# between two different frames.
ddf["labels"] = (ddf["avg_toxicity"] >= 0.5).astype(int)


Unnamed: 0,split,id,text,labels,toxicity,unique_id,avg_toxicity
452728,train,5705108,"Canada is north of the USA border, its colde...",0,0.0,0,0.0
1578639,train,6123823,"Gary, well said. Somehow many people think K...",1,0.5,1,0.5
126551,train,6277865,"Um, it's not their definition of ""terrorist""....",0,0.0,2,0.0
1744767,train,818414,"! save it commy,, china is evil, repressive an...",1,0.5,3,0.5
287987,train,5244549,! think there are medications for bladder prob...,0,0.0,4,0.0
...,...,...,...,...,...,...,...
1716189,train,765333,"🤡🤡 the MQ & MA are a couple clowns 😂,,, I supp...",0,0.4,1971910,0.4
764735,train,807680,🤢,0,0.0,1971911,0.0
1958914,train,5905987,🤣,0,0.1,1971912,0.1
281657,train,6097312,🤣......... another round of huge duties looks ...,0,0.0,1971913,0.0


In [None]:
# Load the dataset stored on disk under the processed civil_comments directory.
ds_dict = load_from_disk("../data/processed/civil_comments/")

In [None]:
# Convert the "train" and "test" entries of ds_dict to pandas DataFrames.
train_df, test_df = (ds_dict[name].to_pandas() for name in ("train", "test"))


In [None]:
# Show the training rows with repeated texts removed (first occurrence kept).
train_df.loc[~train_df.duplicated(subset=["text"])]

In [None]:
# Stack the test and train frames, tagging each row with the split it came from.
df = pd.concat(
    [
        frame.assign(split=label)
        for label, frame in (("test", test_df), ("train", train_df))
    ]
)

In [None]:
# Inner-join test and train on the text column to list texts present in both.
test_df.merge(train_df, on="text", how="inner")

In [20]:
# Per-text summary of the deduplicated frame: distinct splits, non-null and
# total row counts, and the number of distinct labels.
a = ddf.groupby("text").agg(
    {
        "split": ["nunique", "count", "size"],
        "labels": ["nunique"],
    }
)

Unnamed: 0_level_0,split,split,split,labels
Unnamed: 0_level_1,nunique,count,size,nunique
text,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2


In [None]:
# Texts whose rows carry more than one distinct label.
a[a[("labels", "nunique")].gt(1)]

In [None]:
# Look up every row whose text matches this specific comment exactly.
df[df["text"].eq("you have all the character and credibility of a skid mark.")]

In [None]:
# Directory holding the raw Jigsaw unintended-bias toxicity-classification files.
data_path = Path("../data/raw/jigsaw-unintended-bias-in-toxicity-classification")

In [None]:
# Read the three test CSVs from data_path, in the order the names are bound.
test, test_private_expanded, test_public_expanded = (
    pd.read_csv(data_path / filename)
    for filename in (
        "test.csv",
        "test_private_expanded.csv",
        "test_public_expanded.csv",
    )
)

In [None]:
# (rows, columns) of test.csv.
test.shape

In [None]:
# (rows, columns) of test_private_expanded.csv.
test_private_expanded.shape

In [None]:
# (rows, columns) of test_public_expanded.csv.
test_public_expanded.shape

In [None]:
# Inner-join test with the public expanded test set on id to see which ids overlap.
test.merge(test_public_expanded, on="id", how="inner")

In [None]:
# List the columns available in the private expanded test set.
test_private_expanded.columns

In [None]:
# Inspect the "obscene" column of the private expanded test set.
test_private_expanded["obscene"]

In [None]:
# List the columns available in the public expanded test set.
test_public_expanded.columns

In [None]:
# Read all_data.csv from data_path; this rebinds the name `df`.
df = pd.read_csv(data_path / "all_data.csv")

In [None]:
# Number of rows per value of the "split" column.
df["split"].value_counts()

In [None]:
# Names of the toxicity-related score columns.
cols = "toxicity severe_toxicity obscene threat insult identity_attack sexual_explicit".split()

In [None]:
# Rows scored below 0.5 on toxicity yet at or above 0.5 on severe_toxicity.
df[df["toxicity"].lt(0.5) & df["severe_toxicity"].ge(0.5)]

In [None]:
# Binary target: 1 when the toxicity score is at least 0.5, otherwise 0.
df["target"] = df["toxicity"].ge(0.5).astype(int)

In [None]:
# Keep only the identifier, text, split and binary target columns.
cols = ["id", "comment_text", "split", "target"]
# Take an explicit copy (as the first selection cell of this notebook does) so
# that later column assignments operate on an independent frame rather than on
# a slice of the full table.
df = df[cols].copy()

In [None]:
# Peek at 10 random rows as a raw array. The fixed seed keeps the displayed
# sample identical across kernel restarts and re-runs.
df[cols].sample(10, random_state=42).values

In [None]:
# Number of rows per value of the "split" column after the column selection.
df["split"].value_counts()

In [None]:
# Display the binary target column.
df["target"]