# Loading in Dataset

In [2]:
import pandas as pd
import numpy as np

In [3]:
training_csv = pd.read_csv("SBIC.v2.agg.trn.csv", index_col=0)
dev_csv = pd.read_csv("SBIC.v2.agg.dev.csv", index_col=0)

### Manipulating the CSV file so that we only use the columns specified by Maarten

In [4]:
training_csv = training_csv.drop(["dataSource", "offensiveYN", "sexYN", "intentYN", "targetCategory"], axis=1, index=None)
training_csv.columns = ["post", "group", "implications", "whoTarget", "offensiveYN"]
training_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",[],[],0.0,1
1,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,[],[],,1
2,\nCharlie Kirk‏\n\nJohnny Depp calls for death...,"[""liberals""]","[""miscontrue things"", ""take things in a weird ...",1.0,0
3,\nDavid Knight‏ \n\nNotice how quickly things ...,[],[],0.0,1
4,\nFinland fireball: Time-lapse video shows nig...,[],[],,1
...,...,...,...,...,...
35498,👉 Illegally in the country after 5 deportation...,[],[],,1
35500,💥Breaking💥\nJulian Assange is the gate keeper ...,[],[],0.0,1
35501,📖 2Kings 22:19 because your heart was peniten...,[],[],,1
35502,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,[],[],0.0,1


In [5]:
dev_csv = dev_csv.drop(["dataSource", "offensiveYN", "sexYN", "intentYN", "targetCategory"], axis=1, index=None)
dev_csv.columns = ["post", "group", "implications", "whoTarget", "offensiveYN"]
dev_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,\nRed Nation Rising\n‏\n\n \n@RedNationRising...,[],[],,1
1,"they found marijuana, butane hash oil (BHO) a...",[],[],0.0,1
2,""""""" I have no problem being called Benny! Just...",[],[],0.0,1
3,""""""" animal friend in My ass"""" You need some He...",[],[],0.0,1
4,"""""""A mans belly shall be satisfied with the fr...",[],[],0.0,1
...,...,...,...,...,...
4668,“One special advantage of the skeptical attitu...,[],[],0.0,1
4669,“The only thing worse than living in a country...,"[""black folks, asian folks, latino/latina folk...","[""non-white people ruin the life of white peop...",1.0,0
4670,“Two things are infinite: the universe and hum...,[],[],0.0,1
4671,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]","[""all jews have big noses"", ""died in the holoc...",1.0,0


### Putting data in the correct format to work with

In [6]:
training_csv["offensiveYN"] = training_csv["offensiveYN"].apply(lambda label: "[OffN]" if label==1 else "[OffY]")

In [7]:
dev_csv["offensiveYN"] = dev_csv["offensiveYN"].apply(lambda label: "[OffN]" if label==1 else "[OffY]")

In [8]:
def createNewLine(i, csv, group):
    copy_df = csv.copy(deep=True)
    copy_df.append(copy_df.loc[[i]])
    copy_df.loc[[len(csv)-1]]["implications"] = group[len(group)-1]
    del group[-1]
    copy_df.loc[i,"implications"] = str(group)
    return copy_df

In [9]:
import ast
#one implication per line
# if > 1 implication, copy entire line, put on end, change group
list_implications = []
list_group = []
for i in training_csv.index:
    list_val = ast.literal_eval(training_csv.loc[[i]]["implications"].values[0])
    if (len(list_val) > 1):
        while (len(list_val) > 1):
            training_csv = createNewLine(i, training_csv, list_val)
            list_val = ast.literal_eval(training_csv.loc[[i]]["implications"].values[0])
    list_group.append(",".join(map(str, ast.literal_eval(training_csv.loc[[i]]["group"].values[0]))))

In [10]:
#training_csv["implications"] = list_implications
training_csv["group"] = list_group

In [11]:
training_csv
#do final pass over the implications and get rid of list format
for i in training_csv.index:
    list_implications.append(",".join(map(str, ast.literal_eval(training_csv.loc[[i]]["implications"].values[0]))))

In [12]:
training_csv["implications"] = list_implications
training_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",,,0.0,[OffN]
1,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,,,,[OffN]
2,\nCharlie Kirk‏\n\nJohnny Depp calls for death...,liberals,miscontrue things,1.0,[OffY]
3,\nDavid Knight‏ \n\nNotice how quickly things ...,,,0.0,[OffN]
4,\nFinland fireball: Time-lapse video shows nig...,,,,[OffN]
...,...,...,...,...,...
35498,👉 Illegally in the country after 5 deportation...,,,,[OffN]
35500,💥Breaking💥\nJulian Assange is the gate keeper ...,,,0.0,[OffN]
35501,📖 2Kings 22:19 because your heart was peniten...,,,,[OffN]
35502,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,,,0.0,[OffN]


In [13]:
import ast
#one implication per line
# if > 1 implication, copy entire line, put on end, change group
list_implications = []
list_group = []
for i in dev_csv.index:
    list_val = ast.literal_eval(dev_csv.loc[[i]]["implications"].values[0])
    if (len(list_val) > 1):
        while (len(list_val) > 1):
            dev_csv = createNewLine(i, dev_csv, list_val)
            list_val = ast.literal_eval(dev_csv.loc[[i]]["implications"].values[0])
    list_group.append(",".join(map(str, ast.literal_eval(dev_csv.loc[[i]]["group"].values[0]))))

In [14]:
dev_csv["group"] = list_group

In [15]:
dev_csv
#do final pass over the implications and get rid of list format
for i in dev_csv.index:
    list_implications.append(",".join(map(str, ast.literal_eval(dev_csv.loc[[i]]["implications"].values[0]))))

In [16]:
dev_csv["implications"] = list_implications
dev_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,\nRed Nation Rising\n‏\n\n \n@RedNationRising...,,,,[OffN]
1,"they found marijuana, butane hash oil (BHO) a...",,,0.0,[OffN]
2,""""""" I have no problem being called Benny! Just...",,,0.0,[OffN]
3,""""""" animal friend in My ass"""" You need some He...",,,0.0,[OffN]
4,"""""""A mans belly shall be satisfied with the fr...",,,0.0,[OffN]
...,...,...,...,...,...
4668,“One special advantage of the skeptical attitu...,,,0.0,[OffN]
4669,“The only thing worse than living in a country...,"black folks, asian folks, latino/latina folks,...",non-white people ruin the life of white people.,1.0,[OffY]
4670,“Two things are infinite: the universe and hum...,,,0.0,[OffN]
4671,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,all jews have big noses,1.0,[OffY]


# Regular Dataset
- [boi] post [eoi] [boo] [offY] [grp/ind] group [ste] stereotype [eoo]
- 50/50 split of Offy and Offn

In [17]:
offY_df = training_csv.where(training_csv["offensiveYN"] == "[OffY]").dropna(subset=['offensiveYN'])
num_offY = len(offY_df.index)
offN_df = training_csv.where(training_csv["offensiveYN"] == "[OffN]").dropna(subset=['offensiveYN']).sample(n=num_offY).reset_index()
fifty_df = pd.concat([offY_df, offN_df])
# Shuffle the indices
fifty_df = fifty_df.sample(frac=1).reset_index(drop=True)

In [18]:
fifty_df

Unnamed: 0,post,group,implications,whoTarget,offensiveYN,index
0,What is the difference between a Tree and a Bl...,black folks,black people are useless,1.0,[OffY],
1,"What’s pink, six inches long, and make my daug...",women,forcing an abortion,1.0,[OffY],
2,"If Normies really wanted to help us, they will...",,,0.0,[OffN],13820.0
3,Who are the worst type of women Jewish because...,"jewish folks,women",jewish people are bad,1.0,[OffY],
4,when your not so little cousins text you for l...,,,,[OffN],34877.0
...,...,...,...,...,...,...
24247,What do you call a porn actress with diarrhea ...,porn stars,porn stars are dumb,1.0,[OffY],
24248,Why did Princess Diana cross the road? Because...,,,0.0,[OffN],28633.0
24249,Glad it was false because where would we get o...,,,,[OffN],7863.0
24250,"Nowadays, I have sex every single day Just one...",,,0.0,[OffN],18251.0


In [19]:
training_csv = fifty_df

In [20]:
final_input = []
final_output = []
for i in training_csv.index:
    final_input.append("[boi] " + str(training_csv.loc[[i]]["post"].values[0]) + " [eoi]")
    if (training_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    else:
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [21]:
regular_df = pd.DataFrame()

In [22]:
regular_df["input"] = final_input
regular_df["output"] = final_output

In [23]:
regular_df

Unnamed: 0,input,output
0,[boi] What is the difference between a Tree an...,[boo] [OffY] [grp] black folks [ste] black peo...
1,"[boi] What’s pink, six inches long, and make m...",[boo] [OffY] [grp] women [ste] forcing an abor...
2,"[boi] If Normies really wanted to help us, the...",[boo] [OffN] [eoo]
3,[boi] Who are the worst type of women Jewish b...,"[boo] [OffY] [grp] jewish folks,women [ste] je..."
4,[boi] when your not so little cousins text you...,[boo] [OffN] [eoo]
...,...,...
24247,[boi] What do you call a porn actress with dia...,[boo] [OffY] [grp] porn stars [ste] porn stars...
24248,[boi] Why did Princess Diana cross the road? B...,[boo] [OffN] [eoo]
24249,[boi] Glad it was false because where would we...,[boo] [OffN] [eoo]
24250,"[boi] Nowadays, I have sex every single day Ju...",[boo] [OffN] [eoo]


In [24]:
regular_df.to_csv("SBIC.trn.3.5050.csv", index=False)

In [25]:
offY_df = dev_csv.where(dev_csv["offensiveYN"] == "[OffY]").dropna(subset=['offensiveYN'])
num_offY = len(offY_df.index)
offN_df = dev_csv.where(dev_csv["offensiveYN"] == "[OffN]").dropna(subset=['offensiveYN']).sample(n=num_offY).reset_index()
fifty_df = pd.concat([offY_df, offN_df])
# Shuffle the indices
fifty_df = fifty_df.sample(frac=1).reset_index(drop=True)

In [26]:
fifty_df

Unnamed: 0,post,group,implications,whoTarget,offensiveYN,index
0,You can't tell me that african child is starvi...,"Africans,black folks",african kids are starving.,1.0,[OffY],
1,Why do you never see a black person with Down ...,"black folks,folks with physical illness/disord...",are punished by god,1.0,[OffY],
2,Jews are like water They're both solid when th...,"Holocaust Victims,holocaust victims,jewish folks",jewish folks are objects,1.0,[OffY],
3,@BostonJerry @Elle__Chapo they think we don't ...,,,,[OffN],343.0
4,If you blow hard enough on an old Nintendo car...,,,0.0,[OffN],1794.0
...,...,...,...,...,...,...
3607,Black people are some of the best people I’ve ...,,,0.0,[OffN],622.0
3608,What's the difference between an oven and a ba...,"assault victims,kids",belittles the severe pain that rape victims feel,1.0,[OffY],
3609,Cause it's from 7 years ago? Fucking unreal ht...,,,,[OffN],683.0
3610,"I'm so happy I met you, I couldn't imagine lif...",,,0.0,[OffN],1720.0


In [27]:
dev_csv = fifty_df

In [28]:
final_input = []
final_output = []
for i in dev_csv.index:
    final_input.append("[boi] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [eoi]")
    if (dev_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    else:
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [29]:
regular_df = pd.DataFrame()

In [30]:
regular_df["input"] = final_input
regular_df["output"] = final_output

In [31]:
regular_df

Unnamed: 0,input,output
0,[boi] You can't tell me that african child is ...,"[boo] [OffY] [grp] Africans,black folks [ste] ..."
1,[boi] Why do you never see a black person with...,"[boo] [OffY] [grp] black folks,folks with phys..."
2,[boi] Jews are like water They're both solid w...,"[boo] [OffY] [grp] Holocaust Victims,holocaust..."
3,[boi] @BostonJerry @Elle__Chapo they think we ...,[boo] [OffN] [eoo]
4,[boi] If you blow hard enough on an old Ninten...,[boo] [OffN] [eoo]
...,...,...
3607,[boi] Black people are some of the best people...,[boo] [OffN] [eoo]
3608,[boi] What's the difference between an oven an...,"[boo] [OffY] [grp] assault victims,kids [ste] ..."
3609,[boi] Cause it's from 7 years ago? Fucking unr...,[boo] [OffN] [eoo]
3610,"[boi] I'm so happy I met you, I couldn't imagi...",[boo] [OffN] [eoo]


In [32]:
regular_df.to_csv("SBIC.dev.3.5050.csv", index=False)

# Reading in 50/50 data

In [87]:
preds = pd.read_csv("predictions_2_1000.csv", index_col=0)

In [88]:
preds["Source Text"] = preds["Source Text"].apply(lambda x: x.replace("[boi] ", "")).apply(lambda x: x.replace(" [eoi]", ""))

In [90]:
preds["Generated Text"] = preds["Generated Text"].apply(lambda x: x.replace("<pad> ", "")).apply(lambda x: x.replace(" <pad>", ""))
preds.columns = ["post", "output"]

In [28]:
from sklearn.metrics import f1_score
import re
list_of_data = []
# split on parenthesis
for i in range(0, len(generated_predictions)):
    list_of_data.append(list(filter(lambda x: x != "", [sentence.strip() for sentence in
                                                generated_predictions[i].replace("<pad>", '').replace("[", '').replace("boo",'')
                                                .replace("grp", '').replace("ste", '').replace("eoo", '')
                                                .split(']')])))

In [91]:
y_test = pd.read_csv("SBIC.v2.agg.dev.csv", index_col=0)[:1000]
y_test

Unnamed: 0,post,targetMinority,targetCategory,targetStereotype,whoTarget,intentYN,sexYN,offensiveYN,dataSource,hasBiasedImplication
0,\nRed Nation Rising\n‏\n\n \n@RedNationRising...,[],[],[],,0.110000,0.000000,0.000000,Gab,1
1,"they found marijuana, butane hash oil (BHO) a...",[],[],[],0.0,0.110000,0.000000,0.166667,Gab,1
2,""""""" I have no problem being called Benny! Just...",[],[],[],0.0,0.553333,0.000000,0.500000,t/founta,1
3,""""""" animal friend in My ass"""" You need some He...",[],[],[],0.0,0.886667,0.166667,1.000000,t/founta,1
4,"""""""A mans belly shall be satisfied with the fr...",[],[],[],0.0,0.000000,0.000000,0.000000,t/founta,1
...,...,...,...,...,...,...,...,...,...,...
996,"Guy next to me on plane:: So, where are you fr...","[""asian folks""]","[""race""]","[""asians all look alike"", ""asians are quiet"", ...",1.0,0.830000,0.000000,1.000000,redditMicroagressions,0
997,"Guy wants to fuck a girl, not learn basic carp...",[],[],[],0.0,0.330000,1.000000,0.500000,redditMicroagressions,1
998,Guys! I am not Justin Bieber! Stop sending me ...,[],[],[],,0.000000,0.000000,0.000000,t/founta,1
999,HAHA @ThaRodickk Im not sexist....but I honest...,"[""men"", ""women""]","[""gender""]","[""men make good broadcasters"", ""men only care ...",1.0,1.000000,0.000000,1.000000,t/waseem,0


In [92]:
merged_df = pd.merge(preds, y_test, on="post")
merged_df["offensiveYN"] = merged_df["offensiveYN"].apply(lambda label: 1 if label == 1 else 0)

In [93]:
merged_df["offYNPredictions"] = merged_df["output"].apply(lambda label: 1 if "OffY" in label else 0)

In [110]:
subset_merged_df = merged_df[["output", "targetMinority", "targetCategory", "targetStereotype", "offensiveYN"]]
subset_merged_df[merged_df.output.str.contains('OffY')].head(50)

Unnamed: 0,output,targetMinority,targetCategory,targetStereotype,offensiveYN
2,[boo] [OffY] [grp] women [ste] women are bitches [eoo],[],[],[],0
19,[boo] [OffY] [grp] mentally disabled folks [ste] mentally disabled folks are retarded [eoo],"[""folks with mental illness/disorder, mentally disabled folks"", ""mentally disabled folks""]","[""disabled""]","[""being retarded is bad"", ""makes light of the word retarded"", ""mentally disabled folks are retards."", ""mentally disabled folks should have their disability used as a punchline.""]",1
27,[boo] [OffY] [grp] homeless [ste] homeless people are worthless [eoo],[],[],[],0
41,[boo] [OffY] [grp] liberals [ste] liberals are stupid [eoo],"[""liberals""]","[""social""]","[""are bad"", ""are corrupt""]",0
43,"[boo] [OffY] [grp] homos,gay men [ste] homos are a derogatory term [eoo]","[""gay men""]","[""gender""]","[""calls the homo"", ""he accuses of sexual assault""]",1
44,[boo] [OffY] [grp] asian folks [ste] asian people are a burden on society [eoo],[],[],[],0
53,[boo] [OffY] [grp] women [ste] women are hoes [eoo],"[""women""]","[""gender""]","[""women are easy to manipulate"", ""women are hoes""]",1
54,[boo] [OffY] [grp] women [ste] women are bitches [eoo],[],[],[],0
57,[boo] [OffY] [grp] women [ste] women are bitches [eoo],"[""women""]","[""gender""]","[""women are gold diggers"", ""women are interested in a man's money"", ""women are only after men's money"", ""women only care about material objects""]",1
61,[boo] [OffY] [grp] black folks [ste] black people are niggers [eoo],"[""black folks""]","[""race""]","[""black folks are all very dark skinned"", ""black folks are n*ggers"", ""black folks are niggers"", ""black folks can be mocked"", ""blacks are called the n word"", ""post degrates black folks""]",1


# Scrambled Dataset
- need to overrepresent offensive samples and have more structure with the tokens so the model learns
- create each kind of scramble for each input
#### Most important:
1) generate stereotype given just post
2) generate stereotype given post, group, and offensiveness
3) generate post given group, sterotype, and offensiveness

Making three kinds of scrambles:
1) regular, with new tokens
2) full scramble, with new tokens
3) important scramble, with new tokens

[bos] post [EOS] [offN] [eos] \
[offN] [CLS] [bos] post [eos]

[bos] post [CLS] [offY] [ind] [eos]
[offY] [ind] [CLS] [bos] post [eos]

[bos] post [CLS] [offY] [grp/ind] group [ste] stereotype [eos] \
[bos] post [offY] [grp/ind] group [CLS] [ste] stereotype [eos] \
[bos] post [offY] [CLS] [grp/ind] group [ste] stereotype [eos] 

[offY] [grp/ind] group [ste] stereotype [CLS] [bos] post [eos] \
[offY] [grp/ind] group [CLS] post [ste] stereotype [eos] \
[offY] [grp/ind] group [CLS] [ste] stereotype [bos] post [eos] \
[offY] [ste] stereotype [CLS] [bos] post [grp/ind] group [eos]

[offY] [CLS] [bos] post [ste] stereotype [grp/ind] group [eos] 


In [36]:
final_input = []
final_output = []
for i in training_csv.index:
    # If it isn't offensive, do the usual (from "regular" scramble)
    if (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    # Otherwise, scramble up the input and output according to what is above
    elif (training_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls] " + "[ind] [eoo]")
        # Case 4: regular 
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    else:
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 2: given offensiveness and group, generate stereotype and post
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]") 
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] [bos]" + str(training_csv.loc[[i]]["post"].values[0]) + " [cls] [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")         
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [37]:
full_scramble_df = pd.DataFrame()

In [38]:
full_scramble_df["input"] = final_input
full_scramble_df["output"] = final_output

In [39]:
full_scramble_df

Unnamed: 0,input,output
0,[boi] [OffY] [grp] black folks [ste] black peo...,[boo] [bos] What is the difference between a T...
1,[boi] [OffY] [grp] black folks [eoi],[boo] [bos] What is the difference between a T...
2,[boi] [OffY] [eoi],[boo] [bos]What is the difference between a Tr...
3,[boi] [bos] What is the difference between a T...,[boo] [OffY] [grp] black folks [ste] black peo...
4,[boi] [OffY] [grp] women [ste] forcing an abor...,"[boo] [bos] What’s pink, six inches long, and ..."
...,...,...
60605,"[boi] [bos] Nowadays, I have sex every single ...",[boo] [OffN] [eoo]
60606,[boi] [OffY] [grp] jewish folks [ste] are gass...,"[boo] [bos] Guys, I recommend Hitler's latest ..."
60607,[boi] [OffY] [grp] jewish folks [eoi],"[boo] [bos] Guys, I recommend Hitler's latest ..."
60608,[boi] [OffY] [eoi],"[boo] [bos]Guys, I recommend Hitler's latest c..."


In [40]:
full_scramble_df.to_csv("SBIC.trn.scramble.3.5050.csv", index=False)

In [41]:
final_input = []
final_output = []
for i in dev_csv.index:
    # If it isn't offensive, do the usual (from "regular" scramble)
    if (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    # Otherwise, scramble up the input and output according to what is above
    elif (dev_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls] " + "[ind] [eoo]")
        # Case 4: regular 
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    else:
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 2: given offensiveness and group, generate stereotype and post
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]") 
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] [bos]" + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls] [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")         
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [42]:
full_scramble_df = pd.DataFrame()

In [43]:
full_scramble_df["input"] = final_input
full_scramble_df["output"] = final_output

In [44]:
full_scramble_df

Unnamed: 0,input,output
0,"[boi] [OffY] [grp] Africans,black folks [ste] ...",[boo] [bos] You can't tell me that african chi...
1,"[boi] [OffY] [grp] Africans,black folks [eoi]",[boo] [bos] You can't tell me that african chi...
2,[boi] [OffY] [eoi],[boo] [bos]You can't tell me that african chil...
3,[boi] [bos] You can't tell me that african chi...,"[boo] [OffY] [grp] Africans,black folks [ste] ..."
4,"[boi] [OffY] [grp] black folks,folks with phys...",[boo] [bos] Why do you never see a black perso...
...,...,...
9023,[boi] [OffY] [eoi],[boo] [bos]What's the difference between an ov...
9024,[boi] [bos] What's the difference between an o...,"[boo] [OffY] [grp] assault victims,kids [ste] ..."
9025,[boi] [bos] Cause it's from 7 years ago? Fucki...,[boo] [OffN] [eoo]
9026,"[boi] [bos] I'm so happy I met you, I couldn't...",[boo] [OffN] [eoo]


In [45]:
full_scramble_df.to_csv("SBIC.dev.scramble.3.5050.csv", index=False)