# Loading in Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
training_csv = pd.read_csv("SBIC.v2.agg.trn.csv", index_col=0)
dev_csv = pd.read_csv("SBIC.v2.agg.dev.csv", index_col=0)

### Manipulating the CSV file so that we only use the columns specified by Maarten

In [3]:
training_csv = training_csv.drop(["dataSource", "offensiveYN", "sexYN", "intentYN", "targetCategory"], axis=1, index=None)
training_csv.columns = ["post", "group", "implications", "whoTarget", "offensiveYN"]
training_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",[],[],0.0,1
1,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,[],[],,1
2,\nCharlie Kirk‏\n\nJohnny Depp calls for death...,"[""liberals""]","[""miscontrue things"", ""take things in a weird ...",1.0,0
3,\nDavid Knight‏ \n\nNotice how quickly things ...,[],[],0.0,1
4,\nFinland fireball: Time-lapse video shows nig...,[],[],,1
...,...,...,...,...,...
35498,👉 Illegally in the country after 5 deportation...,[],[],,1
35500,💥Breaking💥\nJulian Assange is the gate keeper ...,[],[],0.0,1
35501,📖 2Kings 22:19 because your heart was peniten...,[],[],,1
35502,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,[],[],0.0,1


In [4]:
dev_csv = dev_csv.drop(["dataSource", "offensiveYN", "sexYN", "intentYN", "targetCategory"], axis=1, index=None)
dev_csv.columns = ["post", "group", "implications", "whoTarget", "offensiveYN"]
dev_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,\nRed Nation Rising\n‏\n\n \n@RedNationRising...,[],[],,1
1,"they found marijuana, butane hash oil (BHO) a...",[],[],0.0,1
2,""""""" I have no problem being called Benny! Just...",[],[],0.0,1
3,""""""" animal friend in My ass"""" You need some He...",[],[],0.0,1
4,"""""""A mans belly shall be satisfied with the fr...",[],[],0.0,1
...,...,...,...,...,...
4661,“One special advantage of the skeptical attitu...,[],[],0.0,1
4662,“The only thing worse than living in a country...,"[""black folks, asian folks, latino/latina folk...","[""non-white people ruin the life of white peop...",1.0,0
4663,“Two things are infinite: the universe and hum...,[],[],0.0,1
4664,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]","[""all jews have big noses"", ""died in the holoc...",1.0,0


### Putting data in the correct format to work with

In [5]:
training_csv["offensiveYN"] = training_csv["offensiveYN"].apply(lambda label: "[OffN]" if label==1 else "[OffY]")

In [6]:
dev_csv["offensiveYN"] = dev_csv["offensiveYN"].apply(lambda label: "[OffN]" if label==1 else "[OffY]")

In [7]:
def literal_return(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError) as e:
        return []

In [8]:
import ast
i = 0;
original_len = len(dev_csv)
while i < original_len:
    list_val = literal_return(dev_csv["implications"].tolist()[i])
    if (list_val != None and len(list_val) != 0):
        s=dev_csv.iloc[[i],] # pick the row you want to do repeat
        s=s.reindex(s.index.repeat(len(list_val)))# repeat the row by the giving number 
        s.loc[:,'implications']=list_val # change the value
        dev_csv=pd.concat([dev_csv,s]) #append to the original df 
        dev_csv.reset_index(drop=True, inplace=True)
    i = i + 1

#do final pass over the implications and get rid of list format
i = 0;
while i < original_len:
    if "[" in dev_csv["implications"].tolist()[i]:
        dev_csv.drop(index=i, axis=0, inplace=True)
        dev_csv.reset_index()
    i = i + 1

dev_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"Over 10,000 arrests for child trafficking sinc...",[],[],,[OffN]
1,PSA\nY'all should stay away from stupidslutscl...,[],[],0.0,[OffN]
2,PSA: Remember to share the Atomic Blackpill on...,[],[],0.0,[OffN]
3,Partly Cloudy today! With a high of 16C.and a ...,[],[],,[OffN]
4,Party bus going to LA live at this bar/bowling...,[],[],,[OffN]
...,...,...,...,...,...
10181,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",died in the holocaust.,1.0,[OffY]
10182,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",have a troubled past,1.0,[OffY]
10183,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",jews deserved to be burned,1.0,[OffY]
10184,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",making a joke about the holocaust.,1.0,[OffY]


In [9]:
list_group = []
for i in range(0, len(dev_csv)):
    list_group.append(",".join(map(str, literal_return(dev_csv.iloc[[i]]["group"].values[0]))))
    #list_group.append(",".join(map(str, literal_return(dev_csv.iloc[[i]]["group"]) if not None)))
    
dev_csv["group"] = list_group
dev_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"Over 10,000 arrests for child trafficking sinc...",,[],,[OffN]
1,PSA\nY'all should stay away from stupidslutscl...,,[],0.0,[OffN]
2,PSA: Remember to share the Atomic Blackpill on...,,[],0.0,[OffN]
3,Partly Cloudy today! With a high of 16C.and a ...,,[],,[OffN]
4,Party bus going to LA live at this bar/bowling...,,[],,[OffN]
...,...,...,...,...,...
10181,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,died in the holocaust.,1.0,[OffY]
10182,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,have a troubled past,1.0,[OffY]
10183,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,jews deserved to be burned,1.0,[OffY]
10184,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,making a joke about the holocaust.,1.0,[OffY]


In [10]:
import ast
i = 0;
original_len = len(training_csv)
while i < original_len:
    list_val = literal_return(training_csv["implications"].tolist()[i])
    if (list_val != None and len(list_val) != 0):
        s=training_csv.iloc[[i],] # pick the row you want to do repeat
        s=s.reindex(s.index.repeat(len(list_val)))# repeat the row by the giving number 
        s.loc[:,'implications']=list_val # change the value
        training_csv=pd.concat([training_csv,s]) #append to the original df 
        training_csv.reset_index(drop=True, inplace=True)
    i = i + 1

#do final pass over the implications and get rid of list format
i = 0;
while i < original_len:
    if "[" in training_csv["implications"].tolist()[i]:
        training_csv.drop(index=i, axis=0, inplace=True)
        training_csv.reset_index()
    i = i + 1

training_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,Nelly got all the hoes singin lol,[],[],0.0,[OffN]
1,Nelson snaps me a pic of him in a cocoon sleep...,[],[],0.0,[OffN]
2,Neocels\n..is the term to describe normies who...,[],[],0.0,[OffN]
3,Nerd followers: I'm monitoring hashtag/mention...,[],[],,[OffN]
4,Net neutrality is pretty irrelevant when a han...,[],[],0.0,[OffN]
...,...,...,...,...,...
58655,⁠Why do wh*te people like to play hockey? It’...,"[""black folks"", ""white folks""]",makes light of racial tension with police,1.0,[OffY]
58656,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,"[""muslim folks""]",muslims are terrorists,1.0,[OffY]
58657,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,"[""muslim folks""]",muslims like bombs,1.0,[OffY]
58658,🚨BREAKING: illegal alien 5x deported on 7 felo...,"[""immigrants""]",immigrants are criminals,1.0,[OffY]


In [11]:
list_group = []
for i in range(0, len(training_csv)):
    list_group.append(",".join(map(str, literal_return(training_csv.iloc[[i]]["group"].values[0]))))
    #list_group.append(",".join(map(str, literal_return(training_csv.iloc[[i]]["group"]) if not None)))
    
training_csv["group"] = list_group
training_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,Nelly got all the hoes singin lol,,[],0.0,[OffN]
1,Nelson snaps me a pic of him in a cocoon sleep...,,[],0.0,[OffN]
2,Neocels\n..is the term to describe normies who...,,[],0.0,[OffN]
3,Nerd followers: I'm monitoring hashtag/mention...,,[],,[OffN]
4,Net neutrality is pretty irrelevant when a han...,,[],0.0,[OffN]
...,...,...,...,...,...
58655,⁠Why do wh*te people like to play hockey? It’...,"black folks,white folks",makes light of racial tension with police,1.0,[OffY]
58656,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims are terrorists,1.0,[OffY]
58657,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims like bombs,1.0,[OffY]
58658,🚨BREAKING: illegal alien 5x deported on 7 felo...,immigrants,immigrants are criminals,1.0,[OffY]


In [12]:
training_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
17712,Nelly got all the hoes singin lol,,[],0.0,[OffN]
17713,Nelson snaps me a pic of him in a cocoon sleep...,,[],0.0,[OffN]
17714,Neocels\n..is the term to describe normies who...,,[],0.0,[OffN]
17715,Nerd followers: I'm monitoring hashtag/mention...,,[],,[OffN]
17716,Net neutrality is pretty irrelevant when a han...,,[],0.0,[OffN]
...,...,...,...,...,...
76367,⁠Why do wh*te people like to play hockey? It’...,"black folks,white folks",makes light of racial tension with police,1.0,[OffY]
76368,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims are terrorists,1.0,[OffY]
76369,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims like bombs,1.0,[OffY]
76370,🚨BREAKING: illegal alien 5x deported on 7 felo...,immigrants,immigrants are criminals,1.0,[OffY]


In [13]:
dev_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
2333,"Over 10,000 arrests for child trafficking sinc...",,[],,[OffN]
2334,PSA\nY'all should stay away from stupidslutscl...,,[],0.0,[OffN]
2335,PSA: Remember to share the Atomic Blackpill on...,,[],0.0,[OffN]
2336,Partly Cloudy today! With a high of 16C.and a ...,,[],,[OffN]
2337,Party bus going to LA live at this bar/bowling...,,[],,[OffN]
...,...,...,...,...,...
12514,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,died in the holocaust.,1.0,[OffY]
12515,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,have a troubled past,1.0,[OffY]
12516,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,jews deserved to be burned,1.0,[OffY]
12517,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,making a joke about the holocaust.,1.0,[OffY]


# MORE DATA

'acl.id' is the unique ID of the entry.

'Text' is the content which has been entered. All content is synthetic.

'Label' is a binary variable, indicating whether or not the content has been identified as hateful. It takes two values: hate, nothate.

'Type' is a categorical variable, providing a secondary label for hateful content. For hate it can take five values: Animosity, Derogation, Dehumanization, Threatening and Support for Hateful Entities. Please see the paper for more detail. For nothate the 'type' is 'none'. In round 1 the 'type' was not given and is marked as 'notgiven'.

'Target' is a categorical variable, providing the group that is attacked by the hate. It can include intersectional characteristics and multiple groups can be identified. For nothate the type is 'none'. Note that in round 1 the 'target' was not given and is marked as 'notgiven'.

'Level' reports whether the entry is original content or a perturbation.

'Round' is a categorical variable. It gives the round of data entry (1, 2, 3 or 4) with a letter for whether the entry is original content ('a') or a perturbation ('b'). Perturbations were not made for round 1.

'Round.base' is a categorical variable. It gives the round of data entry, indicated with just a number (1, 2, 3 or 4).

'Split' is a categorical variable. it gives the data split that the entry has been assigned to. This can take the values 'train', 'dev' and 'test'. The choice of splits is explained in the paper.

'Annotator' is a categorical variable. It gives the annotator who entered the content. Annotator IDs are random alphanumeric strings. There are 20 annotators in the dataset.

'acl.id.matched' is the ID of the matched entry, connecting the original (given in 'acl.id') and the perturbed version.

For identities (recorded under 'Target') we use shorthand labels to constructed the dataset, which can be converted (and grouped) as follows:

In [14]:
import pandas as pd
c=pd.read_csv("https://raw.githubusercontent.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.2.csv", index_col=0)
c.drop(["acl.id", "X1", "acl.id.matched"], axis=1)
# get dynahate model into formal to test our model on (predict), calculate acc
# columns for training_csv: post, implications, offy/n, whoTarget, group
# columns to use for DynaHate : text (post), label (offy/n), target (group), (no stereotypes? See how model functions)

Unnamed: 0,text,label,type,target,level,split,round.base,annotator,round
1,dalits are lowlives,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1
2,gay people are a burden to our society,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1
3,Arabs are not welcome,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1
4,I'm not saying we should actually eliminate he...,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1
5,bananas are for black people,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1
...,...,...,...,...,...,...,...,...,...
41251,I hate drags lol,nothate,none,none,original,test,1,GNZuCtwed3,1
41252,My washing machine is bitch,nothate,none,none,original,test,1,vDe7GN0NrL,1
41253,Fulham is just a bunch of losers,nothate,none,none,original,test,1,TrRF46JWfP,1
41254,K I L L the T R A N S,hate,notgiven,notgiven,original,test,1,oemYWm1Tjg,1


In [15]:
# none but if its offensive
# round 2 and above only 
c = c[(c["round"] != "1")]
c

Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
1290,acl11158,1290,There are too many costas in this country,nothate,none,none,perturbation,train,2,GNZuCtwed3,2b,acl20309
1291,acl11159,1291,There is way too many m0ng0l0id kids at my sch...,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl19858
1293,acl11160,1293,I called my brother a twat and my mom got so m...,nothate,none,none,perturbation,train,2,TrRF46JWfP,2b,acl19997
1294,acl11161,1294,the worst of them are the taaaaaaaaaards becau...,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl11162
1295,acl11162,1295,the worst of them are the viruses because they...,nothate,none,none,perturbation,train,2,LqLKTtrOmx,2b,acl11161
...,...,...,...,...,...,...,...,...,...,...,...,...
34727,acl41251,34727,I have that opinion. absolutely everyone I act...,nothate,none,none,perturbation,dev,4,dqrONtdjbt,4b,acl34594
34728,acl41252,34728,"This dark female, I am not going to use the hu...",hate,dehumanization,bla.wom,original,dev,4,CAgNlUizNm,4a,acl35570
34729,acl41253,34729,I guess you are assuming that the french laws ...,hate,derogation,immig,perturbation,dev,4,TbUBpfn6iP,4b,acl35966
34730,acl41254,34730,"I mean, theres no one better right now to give...",nothate,none,none,original,dev,4,GNZuCtwed3,4a,acl36963


In [16]:
c["label"] = c["label"].apply(lambda label: "[OffN]" if label=="nothate" else "[OffY]")

In [17]:
c = c[c['target'].map(c['target'].value_counts()) >= 200].reset_index()

In [18]:
#clean up groupings to be what model is used to from SBIC
c["target"] = c["target"].map({"notgiven" : "notgiven", "wom":"women", "bla":"black", "jew":"jewish", "mus":"muslim", "immig":"immigrants", "asi.south":"south asians", "none" : "none", "transgender" : "transgender people", "ref":"refugee", "arab":"arabic people", "indig" : "indigenous", "mixed.race" : "mixed people", "asi" : "asian", "african" : "african", "hispanic":"hispanic people", "mus.wom" : "muslim women"})

In [19]:
c["target"].value_counts().to_dict()

{'none': 15033,
 'women': 2035,
 'black': 1961,
 'jewish': 1096,
 'muslim': 1002,
 'immigrants': 672,
 'refugee': 470,
 'south asians': 338,
 'arabic people': 338,
 'indigenous': 239,
 'mixed people': 201}

In [20]:
#columns to use for DynaHate : text (post), label (offy/n), target (group), (no stereotypes? See how model functions)
#["post", "group", "implications", "whoTarget", "offensiveYN"]
c.drop(["index", "acl.id", "X1", "round", "acl.id.matched", "annotator", "level"], axis=1, index=None, inplace=True)
c.columns=["post", "offensiveYN", "type", "group", "round.base", "split"]
c

Unnamed: 0,post,offensiveYN,type,group,round.base,split
0,There are too many costas in this country,[OffN],none,none,train,2
1,There is way too many m0ng0l0id kids at my sch...,[OffY],derogation,,train,2
2,I called my brother a twat and my mom got so m...,[OffN],none,none,train,2
3,the worst of them are the taaaaaaaaaards becau...,[OffY],derogation,,train,2
4,the worst of them are the viruses because they...,[OffN],none,none,train,2
...,...,...,...,...,...,...
26868,If she isn't the one for you she's just the on...,[OffN],none,none,dev,4
26869,I have that opinion. absolutely everyone I act...,[OffN],none,none,dev,4
26870,I guess you are assuming that the french laws ...,[OffY],derogation,immigrants,dev,4
26871,"I mean, theres no one better right now to give...",[OffN],none,none,dev,4


In [709]:
#split data into dev, test, and train
dev_dyna, test_dyna, train_dyna = [y for x, y in c.groupby('split', as_index=False)]
dev_dyna.drop(["split"], axis=1, inplace=True)
test_dyna.drop(["split"], axis=1, inplace=True)
train_dyna.drop(["split"], axis=1, inplace=True)
# QUESTION: What rows are necessary, and how are they translatable to SBIC?
# if it's too complicated, you can take only the stage3 posts and see how our model does on the posts that we know have implications? 
# And we can measure the offensiveness classification (ie. stage 1) separately

In [710]:
all_data = pd.DataFrame()
all_data = all_data.append(train_dyna)[train_dyna.columns.tolist()]
all_data = all_data.append(training_csv).dropna(axis=1, how="all")

# Regular Dataset
- [boi] post [eoi] [boo] [offY] [grp/ind] group [ste] stereotype [eoo]
- 50/50 split of Offy and Offn

In [16]:
offY_df = training_csv.where(training_csv["offensiveYN"] == "[OffY]").dropna(subset=['offensiveYN']).sample(n=num_offY).reset_index()
num_offY = len(offY_df.index)
offN_df = training_csv.where(training_csv["offensiveYN"] == "[OffN]").dropna(subset=['offensiveYN'])
fifty_df = pd.concat([offY_df, offN_df])
# Shuffle the indices
fifty_df = fifty_df.sample(frac=1).reset_index(drop=True)

In [17]:
fifty_df

Unnamed: 0,index,post,group,implications,whoTarget,offensiveYN
0,50588.0,"More Negro rioting , this time at a Black Pant...",black folks,proen to violence,1.0,[OffY]
1,36795.0,"@Drake thank you for the lyrics, these bitches...",women,women are sluts.,1.0,[OffY]
2,32241.0,"if a muslim beats his wife , would it be domes...","assault victims,muslim folks,women","[""are really children"", ""get beat by husbands""...",1.0,[OffY]
3,40247.0,Disco lights make people dance Just look at my...,folks with physical illness/disorder,makes fun of people with epilepsy,1.0,[OffY]
4,46002.0,I love hanging with Jamal Gotta love a good ly...,black folks,gave names like jamal,1.0,[OffY]
...,...,...,...,...,...,...
58655,50068.0,Literally mind-blowing... I met a girl with an...,"assault victims,women",women don't deserve to live,1.0,[OffY]
58656,64011.0,Whats the difference between a suicidal muslim...,"jewish folks, muslim folks",muslim's are terrorists,1.0,[OffY]
58657,22493.0,The more snowflake libtards hate President Tru...,liberals,"[""are unintelligent"", ""liberals are retarded"",...",1.0,[OffY]
58658,,i had this so many times. I once dreamt about ...,,[],,[OffN]


In [19]:
training_csv = fifty_df

In [20]:
final_input = []
final_output = []
for i in training_csv.index:
    final_input.append("[boi] " + str(training_csv.loc[[i]]["post"].values[0]) + " [eoi]")
    if (training_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    else:
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [21]:
regular_df = pd.DataFrame()

In [22]:
regular_df["input"] = final_input
regular_df["output"] = final_output

In [23]:
regular_df

Unnamed: 0,input,output
0,[boi] What is the difference between a Tree an...,[boo] [OffY] [grp] black folks [ste] black peo...
1,"[boi] What’s pink, six inches long, and make m...",[boo] [OffY] [grp] women [ste] forcing an abor...
2,"[boi] If Normies really wanted to help us, the...",[boo] [OffN] [eoo]
3,[boi] Who are the worst type of women Jewish b...,"[boo] [OffY] [grp] jewish folks,women [ste] je..."
4,[boi] when your not so little cousins text you...,[boo] [OffN] [eoo]
...,...,...
24247,[boi] What do you call a porn actress with dia...,[boo] [OffY] [grp] porn stars [ste] porn stars...
24248,[boi] Why did Princess Diana cross the road? B...,[boo] [OffN] [eoo]
24249,[boi] Glad it was false because where would we...,[boo] [OffN] [eoo]
24250,"[boi] Nowadays, I have sex every single day Ju...",[boo] [OffN] [eoo]


In [24]:
regular_df.to_csv("SBIC.trn.3.5050.csv", index=False)

In [25]:
offY_df = dev_csv.where(dev_csv["offensiveYN"] == "[OffY]").dropna(subset=['offensiveYN'])
num_offY = len(offY_df.index)
offN_df = dev_csv.where(dev_csv["offensiveYN"] == "[OffN]").dropna(subset=['offensiveYN']).sample(n=num_offY).reset_index()
fifty_df = pd.concat([offY_df, offN_df])
# Shuffle the indices
fifty_df = fifty_df.sample(frac=1).reset_index(drop=True)

In [26]:
fifty_df

Unnamed: 0,post,group,implications,whoTarget,offensiveYN,index
0,You can't tell me that african child is starvi...,"Africans,black folks",african kids are starving.,1.0,[OffY],
1,Why do you never see a black person with Down ...,"black folks,folks with physical illness/disord...",are punished by god,1.0,[OffY],
2,Jews are like water They're both solid when th...,"Holocaust Victims,holocaust victims,jewish folks",jewish folks are objects,1.0,[OffY],
3,@BostonJerry @Elle__Chapo they think we don't ...,,,,[OffN],343.0
4,If you blow hard enough on an old Nintendo car...,,,0.0,[OffN],1794.0
...,...,...,...,...,...,...
3607,Black people are some of the best people I’ve ...,,,0.0,[OffN],622.0
3608,What's the difference between an oven and a ba...,"assault victims,kids",belittles the severe pain that rape victims feel,1.0,[OffY],
3609,Cause it's from 7 years ago? Fucking unreal ht...,,,,[OffN],683.0
3610,"I'm so happy I met you, I couldn't imagine lif...",,,0.0,[OffN],1720.0


In [27]:
dev_csv = fifty_df

In [28]:
final_input = []
final_output = []
for i in dev_csv.index:
    final_input.append("[boi] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [eoi]")
    if (dev_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    else:
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [29]:
regular_df = pd.DataFrame()

In [30]:
regular_df["input"] = final_input
regular_df["output"] = final_output

In [31]:
regular_df

Unnamed: 0,input,output
0,[boi] You can't tell me that african child is ...,"[boo] [OffY] [grp] Africans,black folks [ste] ..."
1,[boi] Why do you never see a black person with...,"[boo] [OffY] [grp] black folks,folks with phys..."
2,[boi] Jews are like water They're both solid w...,"[boo] [OffY] [grp] Holocaust Victims,holocaust..."
3,[boi] @BostonJerry @Elle__Chapo they think we ...,[boo] [OffN] [eoo]
4,[boi] If you blow hard enough on an old Ninten...,[boo] [OffN] [eoo]
...,...,...
3607,[boi] Black people are some of the best people...,[boo] [OffN] [eoo]
3608,[boi] What's the difference between an oven an...,"[boo] [OffY] [grp] assault victims,kids [ste] ..."
3609,[boi] Cause it's from 7 years ago? Fucking unr...,[boo] [OffN] [eoo]
3610,"[boi] I'm so happy I met you, I couldn't imagi...",[boo] [OffN] [eoo]


In [32]:
regular_df.to_csv("SBIC.dev.3.5050.csv", index=False)

# Reading in 50/50 data

In [52]:
preds = pd.read_csv("predictions_3_pt3.csv", index_col=0)

In [53]:
preds["Source Text"] = preds["Source Text"].apply(lambda x: x.replace("[boi] ", "")).apply(lambda x: x.replace(" [eoi]", ""))

preds["Generated Text"] = preds["Generated Text"].apply(lambda x: x.replace("<pad> ", "")).apply(lambda x: x.replace(" <pad>", ""))
preds.columns = ["post", "output"]

In [None]:
pd.set_option('display.width', 1000)
preds["output"].tolist()

In [49]:
y_test = pd.read_csv("SBIC.v2.agg.dev.csv", index_col=0)[:1000]
y_test

Unnamed: 0,post,targetMinority,targetCategory,targetStereotype,whoTarget,intentYN,sexYN,offensiveYN,dataSource,hasBiasedImplication
0,\nRed Nation Rising\n‏\n\n \n@RedNationRising...,[],[],[],,0.110000,0.000000,0.000000,Gab,1
1,"they found marijuana, butane hash oil (BHO) a...",[],[],[],0.0,0.110000,0.000000,0.166667,Gab,1
2,""""""" I have no problem being called Benny! Just...",[],[],[],0.0,0.553333,0.000000,0.500000,t/founta,1
3,""""""" animal friend in My ass"""" You need some He...",[],[],[],0.0,0.886667,0.166667,1.000000,t/founta,1
4,"""""""A mans belly shall be satisfied with the fr...",[],[],[],0.0,0.000000,0.000000,0.000000,t/founta,1
...,...,...,...,...,...,...,...,...,...,...
996,"Guy next to me on plane:: So, where are you fr...","[""asian folks""]","[""race""]","[""asians all look alike"", ""asians are quiet"", ...",1.0,0.830000,0.000000,1.000000,redditMicroagressions,0
997,"Guy wants to fuck a girl, not learn basic carp...",[],[],[],0.0,0.330000,1.000000,0.500000,redditMicroagressions,1
998,Guys! I am not Justin Bieber! Stop sending me ...,[],[],[],,0.000000,0.000000,0.000000,t/founta,1
999,HAHA @ThaRodickk Im not sexist....but I honest...,"[""men"", ""women""]","[""gender""]","[""men make good broadcasters"", ""men only care ...",1.0,1.000000,0.000000,1.000000,t/waseem,0


In [50]:
merged_df = pd.merge(preds, y_test, on="post")
merged_df["offensiveYN"] = merged_df["offensiveYN"].apply(lambda label: 1 if label == 1 else 0)

In [51]:
merged_df["offYNPredictions"] = merged_df["output"].apply(lambda label: 1 if "OffY" in label else 0)

In [52]:
subset_merged_df = merged_df[["output", "targetMinority", "targetCategory", "targetStereotype", "offensiveYN"]]
subset_merged_df[merged_df.output.str.contains('OffY')].head(50)

Unnamed: 0,output,targetMinority,targetCategory,targetStereotype,offensiveYN


# Scrambled Dataset
- need to overrepresent offensive samples and have more structure with the tokens so the model learns
- create each kind of scramble for each input
#### Most important:
1) generate stereotype given just post
2) generate stereotype given post, group, and offensiveness
3) generate post given group, sterotype, and offensiveness

Making three kinds of scrambles:
1) regular, with new tokens
2) full scramble, with new tokens
3) important scramble, with new tokens

[bos] post [EOS] [offN] [eos] \
[offN] [CLS] [bos] post [eos]

[bos] post [CLS] [offY] [ind] [eos]
[offY] [ind] [CLS] [bos] post [eos]

[bos] post [CLS] [offY] [grp/ind] group [ste] stereotype [eos] \
[bos] post [offY] [grp/ind] group [CLS] [ste] stereotype [eos] \
[bos] post [offY] [CLS] [grp/ind] group [ste] stereotype [eos] 

[offY] [grp/ind] group [ste] stereotype [CLS] [bos] post [eos] \
[offY] [grp/ind] group [CLS] post [ste] stereotype [eos] \
[offY] [grp/ind] group [CLS] [ste] stereotype [bos] post [eos] \
[offY] [ste] stereotype [CLS] [bos] post [grp/ind] group [eos]

[offY] [CLS] [bos] post [ste] stereotype [grp/ind] group [eos] 


In [36]:
final_input = []
final_output = []
for i in training_csv.index:
    # If it isn't offensive, do the usual (from "regular" scramble)
    if (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    # Otherwise, scramble up the input and output according to what is above
    elif (training_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls] " + "[ind] [eoo]")
        # Case 4: regular 
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    else:
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 2: given offensiveness and group, generate stereotype and post
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]") 
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] [bos]" + str(training_csv.loc[[i]]["post"].values[0]) + " [cls] [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")         
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [37]:
full_scramble_df = pd.DataFrame()

In [38]:
full_scramble_df["input"] = final_input
full_scramble_df["output"] = final_output

In [39]:
full_scramble_df

Unnamed: 0,input,output
0,[boi] [OffY] [grp] black folks [ste] black peo...,[boo] [bos] What is the difference between a T...
1,[boi] [OffY] [grp] black folks [eoi],[boo] [bos] What is the difference between a T...
2,[boi] [OffY] [eoi],[boo] [bos]What is the difference between a Tr...
3,[boi] [bos] What is the difference between a T...,[boo] [OffY] [grp] black folks [ste] black peo...
4,[boi] [OffY] [grp] women [ste] forcing an abor...,"[boo] [bos] What’s pink, six inches long, and ..."
...,...,...
60605,"[boi] [bos] Nowadays, I have sex every single ...",[boo] [OffN] [eoo]
60606,[boi] [OffY] [grp] jewish folks [ste] are gass...,"[boo] [bos] Guys, I recommend Hitler's latest ..."
60607,[boi] [OffY] [grp] jewish folks [eoi],"[boo] [bos] Guys, I recommend Hitler's latest ..."
60608,[boi] [OffY] [eoi],"[boo] [bos]Guys, I recommend Hitler's latest c..."


In [40]:
full_scramble_df.to_csv("SBIC.trn.scramble.3.5050.csv", index=False)

In [41]:
final_input = []
final_output = []
for i in dev_csv.index:
    # If it isn't offensive, do the usual (from "regular" scramble)
    if (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    # Otherwise, scramble up the input and output according to what is above
    elif (dev_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]") 
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls] " + "[ind] [eoo]")
        # Case 4: regular 
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    else:
        # Case 1: given offensiveness, group, and stereotype, get post
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 2: given offensiveness and group, generate stereotype and post
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]") 
        # Case 3: given offensiveness, generate post, stereotype, and group
        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] [bos]" + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls] [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")         
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [42]:
full_scramble_df = pd.DataFrame()

In [43]:
full_scramble_df["input"] = final_input
full_scramble_df["output"] = final_output

In [44]:
full_scramble_df

Unnamed: 0,input,output
0,"[boi] [OffY] [grp] Africans,black folks [ste] ...",[boo] [bos] You can't tell me that african chi...
1,"[boi] [OffY] [grp] Africans,black folks [eoi]",[boo] [bos] You can't tell me that african chi...
2,[boi] [OffY] [eoi],[boo] [bos]You can't tell me that african chil...
3,[boi] [bos] You can't tell me that african chi...,"[boo] [OffY] [grp] Africans,black folks [ste] ..."
4,"[boi] [OffY] [grp] black folks,folks with phys...",[boo] [bos] Why do you never see a black perso...
...,...,...
9023,[boi] [OffY] [eoi],[boo] [bos]What's the difference between an ov...
9024,[boi] [bos] What's the difference between an o...,"[boo] [OffY] [grp] assault victims,kids [ste] ..."
9025,[boi] [bos] Cause it's from 7 years ago? Fucki...,[boo] [OffN] [eoo]
9026,"[boi] [bos] I'm so happy I met you, I couldn't...",[boo] [OffN] [eoo]


In [45]:
full_scramble_df.to_csv("SBIC.dev.scramble.3.5050.csv", index=False)