In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
import random
import re

In [3]:
# Load the training comments and the list of identity terms (one term per line, no header row).
train_df = pd.read_csv("./data/train_df.csv")
# Column 0 of the header-less file holds the terms; flatten it to a plain Python list.
identities = pd.read_csv("./data/adjectives_people.txt", header=None)[0].tolist()

In [4]:
# Display the column labels of the training frame to see which fields are available.
train_df.columns

Index(['Unnamed: 0', 'id', 'comment_text', 'toxic', 'gay', 'bisexual',
       'transgender', 'trans', 'queer', 'lgbt', 'lgbtq', 'homosexual',
       'straight', 'heterosexual', 'male', 'female', 'nonbinary', 'african',
       'african american', 'black', 'white', 'european', 'hispanic', 'latino',
       'latina', 'latinx', 'mexican', 'canadian', 'american', 'asian',
       'indian', 'middle eastern', 'chinese', 'japanese', 'christian',
       'muslim', 'jewish', 'buddhist', 'catholic', 'protestant', 'sikh',
       'taoist', 'old', 'older', 'young', 'younger', 'teenage', 'millenial',
       'middle aged', 'elderly', 'blind', 'deaf', 'paralyzed', 'lesbian'],
      dtype='object')

In [9]:
# Build per-row labels and identity-swapped variants of the training comments.
#
# Names produced by this cell:
#   index        - per row: a running counter (0, 1, 2, ...) for comments that contain
#                  an identity term, -1 for comments that do not
#   toxic        - per row: the "toxic" value passed through round()
#   comment_text - per row: the unmodified comment
#   a            - per identity-bearing comment: the comment with the matched term
#                  replaced by every other identity term (original sentence excluded)
index = []
toxic = []
comment_text = []
index_count = 0
a = []

# The membership set never changes, so build it once instead of twice per row.
identity_set = set(identities)

# Iterate the two needed columns directly; train_df.iloc[row] would materialise a full
# row Series several times per iteration.
for text, toxicity in tqdm(zip(train_df["comment_text"], train_df["toxic"]), total=len(train_df)):
    # Whitespace tokenisation, case-sensitive: a term only counts when it appears as an
    # exact standalone token.
    comment_words = set(text.split())
    if not identity_set.isdisjoint(comment_words):  # Does contain identity
        index.append(index_count)
        index_count += 1
        # Take the first matching term in the order of `identities`. set.pop() returns an
        # arbitrary element, so the chosen term could differ between kernel sessions when
        # a comment contains several identity terms.
        ident = next(term for term in identities if term in comment_words)
        # Whole-word, case-insensitive match of the chosen term; compiled once per comment.
        pattern = re.compile(r'\b' + re.escape(ident) + r'\b', flags=re.IGNORECASE)
        # Append perturbed sentences to a (not including original sentence)
        a.append([pattern.sub(replace_ident, text) for replace_ident in identities if replace_ident != ident])
    else:
        index.append(-1)
    toxic.append(round(toxicity))
    comment_text.append(text)
    
    

100%|██████████| 159571/159571 [00:58<00:00, 2723.38it/s]


In [10]:
# One (comment_text, toxic, index) record per training row.
data_tuples = list(
    zip(comment_text, toxic, index)
)

In [11]:
# Tabulate the per-row records under explicit column names.
CLP_df1 = pd.DataFrame(
    data=data_tuples,
    columns=["comment_text", "toxic", "index"],
)

In [17]:
# Preview the first 40 rows of the labelled frame.
CLP_df1.head(40)

Unnamed: 0,comment_text,toxic,index
0,Explanation\nWhy the edits made under my usern...,0,-1
1,D'aww! He matches this background colour I'm s...,0,-1
2,"Hey man, I'm really not trying to edit war. It...",0,-1
3,"""\nMore\nI can't make any real suggestions on ...",0,-1
4,"You, sir, are my hero. Any chance you remember...",0,-1
5,"""\n\nCongratulations from me as well, use the ...",0,-1
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,-1
7,Your vandalism to the Matt Shirvington article...,0,-1
8,Sorry if the word 'nonsense' was offensive to ...,0,-1
9,alignment on this subject and which are contra...,0,-1


In [21]:
# For every comment flagged in CLP_df1 (index != -1) collect:
#   indices   - the comment's running identity index
#   orig_text - the unmodified comment
#   aug_text  - the comment with the matched identity term replaced by every other
#               identity term (one list of variants per comment)
indices = []
orig_text = []
aug_text = []

# Iterate the two needed columns directly; CLP_df1.iloc[row] would materialise a full
# row Series several times per iteration.
for flag, text in tqdm(zip(CLP_df1["index"], CLP_df1["comment_text"]), total=len(CLP_df1)):
    if flag == -1:
        # No identity term was found in this comment; nothing to perturb.
        continue
    indices.append(flag)
    orig_text.append(text)
    # Whitespace tokenisation, case-sensitive: a term only counts when it appears as an
    # exact standalone token.
    comment_words = set(text.split())
    # Take the first matching term in the order of `identities`. set.pop() returns an
    # arbitrary element, so the chosen term could differ between kernel sessions when
    # a comment contains several identity terms.
    ident = next((term for term in identities if term in comment_words), None)
    if ident is None:
        # Same exception type that popping from an empty set raised before.
        raise KeyError("row is flagged as containing an identity term but none was found")
    # Whole-word, case-insensitive match of the chosen term; compiled once per comment.
    pattern = re.compile(r'\b' + re.escape(ident) + r'\b', flags=re.IGNORECASE)
    aug_text.append([pattern.sub(replace_ident, text) for replace_ident in identities if replace_ident != ident])

100%|██████████| 159571/159571 [00:22<00:00, 7170.67it/s]


In [29]:
# One (index, original comment) record per identity-bearing comment.
data_tuples = list(zip(indices, orig_text))
CLP_df2 = pd.DataFrame(
    data=data_tuples,
    columns=["index", "orig_text"],
)

In [30]:
# One row per identity-bearing comment, one column per substituted identity term.
# Building the frame straight from the list of rows replaces the zip(*aug_text) + .T
# round trip, which yields the same frame for equal-length rows but silently truncates
# every row to the shortest one otherwise.
aug_text_df = pd.DataFrame(aug_text)
# Transposed layout (one column per comment), kept under its original name in case
# other cells read it.
aug_text_df_raw = aug_text_df.T
# The concat below pairs rows by index label, so both frames must describe the same comments.
assert len(aug_text_df) == len(CLP_df2), "augmented rows do not line up with CLP_df2"
# Side-by-side: index, original text, then the perturbed variants.
CLP_df3 = pd.concat([CLP_df2, aug_text_df], axis=1)


In [38]:
# Preview the first 10 comments alongside their identity-swapped variants.
CLP_df3.head(10)

Unnamed: 0,index,orig_text,0,1,2,3,4,5,6,7,...,39,40,41,42,43,44,45,46,47,48
0,0,"""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...",...,"""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant..."
1,1,You are gay or antisemmitian? \n\nArchangel WH...,You are lesbian or antisemmitian? \n\nArchange...,You are bisexual or antisemmitian? \n\nArchang...,You are transgender or antisemmitian? \n\nArch...,You are trans or antisemmitian? \n\nArchangel ...,You are queer or antisemmitian? \n\nArchangel ...,You are lgbt or antisemmitian? \n\nArchangel W...,You are lgbtq or antisemmitian? \n\nArchangel ...,You are homosexual or antisemmitian? \n\nArcha...,...,You are older or antisemmitian? \n\nArchangel ...,You are young or antisemmitian? \n\nArchangel ...,You are younger or antisemmitian? \n\nArchange...,You are teenage or antisemmitian? \n\nArchange...,You are millenial or antisemmitian? \n\nArchan...,You are middle aged or antisemmitian? \n\nArch...,You are elderly or antisemmitian? \n\nArchange...,You are blind or antisemmitian? \n\nArchangel ...,You are deaf or antisemmitian? \n\nArchangel W...,You are paralyzed or antisemmitian? \n\nArchan...
2,2,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...,My Band Page's deletion. You thought I was gon...
3,3,"A Bisexual, like a homosexual or a heterosexua...","A Bisexual, like a lesbian or a heterosexual, ...","A Bisexual, like a gay or a heterosexual, is n...","A Bisexual, like a bisexual or a heterosexual,...","A Bisexual, like a transgender or a heterosexu...","A Bisexual, like a trans or a heterosexual, is...","A Bisexual, like a queer or a heterosexual, is...","A Bisexual, like a lgbt or a heterosexual, is ...","A Bisexual, like a lgbtq or a heterosexual, is...",...,"A Bisexual, like a older or a heterosexual, is...","A Bisexual, like a young or a heterosexual, is...","A Bisexual, like a younger or a heterosexual, ...","A Bisexual, like a teenage or a heterosexual, ...","A Bisexual, like a millenial or a heterosexual...","A Bisexual, like a middle aged or a heterosexu...","A Bisexual, like a elderly or a heterosexual, ...","A Bisexual, like a blind or a heterosexual, is...","A Bisexual, like a deaf or a heterosexual, is ...","A Bisexual, like a paralyzed or a heterosexual..."
4,4,"I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...",...,"I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ..."
5,5,"Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...",...,"Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo..."
6,6,"""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...",...,"""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti..."
7,7,"Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...",...,"Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t..."
8,8,"REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...",...,"REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin..."
9,9,u r a tw@ fuck off u gay boy.U r smelly.Fuck u...,u r a tw@ fuck off u lesbian boy.U r smelly.Fu...,u r a tw@ fuck off u bisexual boy.U r smelly.F...,u r a tw@ fuck off u transgender boy.U r smell...,u r a tw@ fuck off u trans boy.U r smelly.Fuck...,u r a tw@ fuck off u queer boy.U r smelly.Fuck...,u r a tw@ fuck off u lgbt boy.U r smelly.Fuck ...,u r a tw@ fuck off u lgbtq boy.U r smelly.Fuck...,u r a tw@ fuck off u homosexual boy.U r smelly...,...,u r a tw@ fuck off u older boy.U r smelly.Fuck...,u r a tw@ fuck off u young boy.U r smelly.Fuck...,u r a tw@ fuck off u younger boy.U r smelly.Fu...,u r a tw@ fuck off u teenage boy.U r smelly.Fu...,u r a tw@ fuck off u millenial boy.U r smelly....,u r a tw@ fuck off u middle aged boy.U r smell...,u r a tw@ fuck off u elderly boy.U r smelly.Fu...,u r a tw@ fuck off u blind boy.U r smelly.Fuck...,u r a tw@ fuck off u deaf boy.U r smelly.Fuck ...,u r a tw@ fuck off u paralyzed boy.U r smelly....
