In [None]:
%cd ..

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
import random
import re

In [3]:
train_df = pd.read_csv("./data/train_df.csv")
identities = [ident[0] for ident in pd.read_csv("./data/train_identities.txt", header=None).values.tolist()]

In [4]:
train_df.columns

Index(['Unnamed: 0', 'id', 'comment_text', 'toxic', 'gay', 'lgbt', 'lgbtq',
       'homosexual', 'heterosexual', 'male', 'female', 'nonbinary', 'african',
       'european', 'hispanic', 'latino', 'latina', 'latinx', 'mexican',
       'american', 'asian', 'indian', 'chinese', 'muslim', 'buddhist',
       'catholic', 'protestant', 'sikh', 'taoist', 'old', 'young', 'younger',
       'teenage', 'millenial', 'elderly', 'blind', 'deaf', 'paralyzed',
       'lesbian'],
      dtype='object')

In [5]:
index = []
toxic = []
comment_text = []
index_count = 0
a = []

for row_index in tqdm(range(len(train_df))):
    comment_words = train_df.iloc[row_index]['comment_text'].split()
    if len(set(identities).intersection(comment_words)) != 0: #Does contain identity
        index.append(index_count)
        index_count += 1
        ident = str(set(identities).intersection(comment_words).pop())
        #Append perturbed sentences to a (not including original sentence)
        a.append([re.sub(r'\b' + re.escape(ident) + r'\b', replace_ident, train_df.at[row_index, "comment_text"], flags=re.IGNORECASE) for replace_ident in identities if replace_ident != ident])
    else:
        index.append(-1)
    toxic.append(round(train_df.iloc[row_index]["toxic"]))
    comment_text.append(train_df.iloc[row_index]["comment_text"])
    
    

100%|██████████| 159571/159571 [00:19<00:00, 8229.61it/s]


In [6]:
data_tuples = list(zip(comment_text, toxic, index))

In [7]:
CLP_df1 = pd.DataFrame(data_tuples, columns=["comment_text", "toxic", "index"])

In [8]:
CLP_df1[:40]

Unnamed: 0,comment_text,toxic,index
0,Explanation\nWhy the edits made under my usern...,0,-1
1,D'aww! He matches this background colour I'm s...,0,-1
2,"Hey man, I'm really not trying to edit war. It...",0,-1
3,"""\nMore\nI can't make any real suggestions on ...",0,-1
4,"You, sir, are my hero. Any chance you remember...",0,-1
5,"""\n\nCongratulations from me as well, use the ...",0,-1
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,-1
7,Your vandalism to the Matt Shirvington article...,0,-1
8,Sorry if the word 'nonsense' was offensive to ...,0,-1
9,alignment on this subject and which are contra...,0,-1


In [9]:
indices = []
orig_text = []
aug_text = []

for row_index in tqdm(range(len(CLP_df1))):
    if CLP_df1.iloc[row_index]['index'] != -1:
        indices.append(CLP_df1.iloc[row_index]['index'])
        orig_text.append(CLP_df1.iloc[row_index]['comment_text'])
        comment_words = CLP_df1.iloc[row_index]['comment_text'].split()
        ident = str(set(identities).intersection(comment_words).pop())
        aug_text.append([re.sub(r'\b' + re.escape(ident) + r'\b', replace_ident, CLP_df1.at[row_index, "comment_text"], flags=re.IGNORECASE) for replace_ident in identities if replace_ident != ident])

100%|██████████| 159571/159571 [00:05<00:00, 26818.79it/s]


In [10]:
data_tuples = list(zip(indices, orig_text))
CLP_df2 = pd.DataFrame(data_tuples, columns=["index", "orig_text"])

In [11]:
aug_text_df_raw = pd.DataFrame(list(zip(*aug_text)))
aug_text_df = aug_text_df_raw.T
CLP_df3 = pd.concat([CLP_df2, aug_text_df], axis = 1)


In [12]:
CLP_df3[:10]

Unnamed: 0,index,orig_text,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,"""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...",...,"""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...","""\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant..."
1,1,"I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...",...,"I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ...","I went there around the same time he did, and ..."
2,2,"Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...",...,"Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo...","Ahh, Hello Witzeman \n\n203.92.84.161 \nSymbo..."
3,3,"""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...",...,"""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti...","""== Attributing and classifying of personaliti..."
4,4,"Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...",...,"Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t...","Hi \n\nHi, good day.\n\nMy deepest apologies t..."
5,5,"REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...",...,"REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin...","REPLY ABOVE:\nThat was me, loser. The UN defin..."
6,6,Appearance in historical fiction \n\nThe young...,Appearance in historical fiction \n\nThe bisex...,Appearance in historical fiction \n\nThe trans...,Appearance in historical fiction \n\nThe trans...,Appearance in historical fiction \n\nThe queer...,Appearance in historical fiction \n\nThe lgbt ...,Appearance in historical fiction \n\nThe lgbtq...,Appearance in historical fiction \n\nThe heter...,Appearance in historical fiction \n\nThe male ...,...,Appearance in historical fiction \n\nThe catho...,Appearance in historical fiction \n\nThe sikh ...,Appearance in historical fiction \n\nThe taois...,Appearance in historical fiction \n\nThe older...,Appearance in historical fiction \n\nThe young...,Appearance in historical fiction \n\nThe teena...,Appearance in historical fiction \n\nThe mille...,Appearance in historical fiction \n\nThe elder...,Appearance in historical fiction \n\nThe blind...,Appearance in historical fiction \n\nThe deaf ...
7,7,"""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...",...,"""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ...","""\n\n66.185.85.80's rant\n\nPresently, we are ..."
8,8,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...,That's a rather feeble premise to argue showin...
9,9,"""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...",...,"""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's...","""\n\nQuoting Hoppe first; Detailing Kinsella's..."


In [13]:
len(CLP_df1)

159571

In [14]:
df = pd.read_csv('./data/jigsaw/train_with_idents.csv')

In [16]:
df['comment_text']

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [17]:
CLP_df1['comment_text']

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [21]:
CLP_df1.columns

Index(['comment_text', 'toxic', 'index'], dtype='object')

In [26]:
df['index'] = CLP_df1['index']

In [27]:
df.head()

Unnamed: 0,id,comment_text,toxic,gay,bisexual,transgender,trans,queer,lgbt,lgbtq,...,younger,teenage,millenial,middle aged,elderly,blind,deaf,paralyzed,lesbian,index
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1


In [28]:
df.to_csv('./data/jigsaw/train_with_idents.csv', index=False)

In [25]:
CLP_df3.to_csv('./data/jigsaw/train_adversarials.csv', index=False)