# Loading in Dataset

In [52]:
import pandas as pd
import numpy as np

In [53]:
training_csv = pd.read_csv("SBIC.v2.agg.trn.csv", index_col=0)
dev_csv = pd.read_csv("SBIC.v2.agg.dev.csv", index_col=0)

### Manipulating the CSV file so that we only use the columns specified by Maarten

In [54]:
training_csv = training_csv.drop(["dataSource", "offensiveYN", "sexYN", "intentYN", "targetCategory"], axis=1, index=None)
training_csv.columns = ["post", "group", "implications", "whoTarget", "offensiveYN"]
training_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",[],[],0.0,1
1,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,[],[],,1
2,\nCharlie Kirk‏\n\nJohnny Depp calls for death...,"[""liberals""]","[""miscontrue things"", ""take things in a weird ...",1.0,0
3,\nDavid Knight‏ \n\nNotice how quickly things ...,[],[],0.0,1
4,\nFinland fireball: Time-lapse video shows nig...,[],[],,1
...,...,...,...,...,...
35498,👉 Illegally in the country after 5 deportation...,[],[],,1
35500,💥Breaking💥\nJulian Assange is the gate keeper ...,[],[],0.0,1
35501,📖 2Kings 22:19 because your heart was peniten...,[],[],,1
35502,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,[],[],0.0,1


In [55]:
dev_csv = dev_csv.drop(["dataSource", "offensiveYN", "sexYN", "intentYN", "targetCategory"], axis=1, index=None)
dev_csv.columns = ["post", "group", "implications", "whoTarget", "offensiveYN"]
dev_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,\nRed Nation Rising\n‏\n\n \n@RedNationRising...,[],[],,1
1,"they found marijuana, butane hash oil (BHO) a...",[],[],0.0,1
2,""""""" I have no problem being called Benny! Just...",[],[],0.0,1
3,""""""" animal friend in My ass"""" You need some He...",[],[],0.0,1
4,"""""""A mans belly shall be satisfied with the fr...",[],[],0.0,1
...,...,...,...,...,...
4661,“One special advantage of the skeptical attitu...,[],[],0.0,1
4662,“The only thing worse than living in a country...,"[""black folks, asian folks, latino/latina folk...","[""non-white people ruin the life of white peop...",1.0,0
4663,“Two things are infinite: the universe and hum...,[],[],0.0,1
4664,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]","[""all jews have big noses"", ""died in the holoc...",1.0,0


### Putting data in the correct format to work with

In [56]:
training_csv["offensiveYN"] = training_csv["offensiveYN"].apply(lambda label: "[OffN]" if label==1 else "[OffY]")

In [57]:
dev_csv["offensiveYN"] = dev_csv["offensiveYN"].apply(lambda label: "[OffN]" if label==1 else "[OffY]")

In [58]:
def literal_return(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError) as e:
        return []

In [59]:
import ast
i = 0;
original_len = len(dev_csv)
while i < original_len:
    list_val = literal_return(dev_csv["implications"].tolist()[i])
    if (list_val != None and len(list_val) != 0):
        s=dev_csv.iloc[[i],] # pick the row you want to do repeat
        s=s.reindex(s.index.repeat(len(list_val)))# repeat the row by the giving number 
        s.loc[:,'implications']=list_val # change the value
        dev_csv=pd.concat([dev_csv,s]) #append to the original df 
        dev_csv.reset_index(drop=True, inplace=True)
    i = i + 1

#do final pass over the implications and get rid of list format
i = 0;
while i < original_len:
    if "[" in dev_csv["implications"].tolist()[i]:
        dev_csv.drop(index=i, axis=0, inplace=True)
        dev_csv.reset_index()
    i = i + 1

dev_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"Over 10,000 arrests for child trafficking sinc...",[],[],,[OffN]
1,PSA\nY'all should stay away from stupidslutscl...,[],[],0.0,[OffN]
2,PSA: Remember to share the Atomic Blackpill on...,[],[],0.0,[OffN]
3,Partly Cloudy today! With a high of 16C.and a ...,[],[],,[OffN]
4,Party bus going to LA live at this bar/bowling...,[],[],,[OffN]
...,...,...,...,...,...
10181,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",died in the holocaust.,1.0,[OffY]
10182,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",have a troubled past,1.0,[OffY]
10183,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",jews deserved to be burned,1.0,[OffY]
10184,♪ &#9835; ♬ believe it or not i 'm gassing the...,"[""jewish folks""]",making a joke about the holocaust.,1.0,[OffY]


In [60]:
list_group = []
for i in range(0, len(dev_csv)):
    list_group.append(",".join(map(str, literal_return(dev_csv.iloc[[i]]["group"].values[0]))))
    #list_group.append(",".join(map(str, literal_return(dev_csv.iloc[[i]]["group"]) if not None)))
    
dev_csv["group"] = list_group
dev_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,"Over 10,000 arrests for child trafficking sinc...",,[],,[OffN]
1,PSA\nY'all should stay away from stupidslutscl...,,[],0.0,[OffN]
2,PSA: Remember to share the Atomic Blackpill on...,,[],0.0,[OffN]
3,Partly Cloudy today! With a high of 16C.and a ...,,[],,[OffN]
4,Party bus going to LA live at this bar/bowling...,,[],,[OffN]
...,...,...,...,...,...
10181,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,died in the holocaust.,1.0,[OffY]
10182,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,have a troubled past,1.0,[OffY]
10183,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,jews deserved to be burned,1.0,[OffY]
10184,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,making a joke about the holocaust.,1.0,[OffY]


In [61]:
import ast
i = 0;
original_len = len(training_csv)
while i < original_len:
    list_val = literal_return(training_csv["implications"].tolist()[i])
    if (list_val != None and len(list_val) != 0):
        s=training_csv.iloc[[i],] # pick the row you want to do repeat
        s=s.reindex(s.index.repeat(len(list_val)))# repeat the row by the giving number 
        s.loc[:,'implications']=list_val # change the value
        training_csv=pd.concat([training_csv,s]) #append to the original df 
        training_csv.reset_index(drop=True, inplace=True)
    i = i + 1

#do final pass over the implications and get rid of list format
i = 0;
while i < original_len:
    if "[" in training_csv["implications"].tolist()[i]:
        training_csv.drop(index=i, axis=0, inplace=True)
        training_csv.reset_index()
    i = i + 1

training_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,Nelly got all the hoes singin lol,[],[],0.0,[OffN]
1,Nelson snaps me a pic of him in a cocoon sleep...,[],[],0.0,[OffN]
2,Neocels\n..is the term to describe normies who...,[],[],0.0,[OffN]
3,Nerd followers: I'm monitoring hashtag/mention...,[],[],,[OffN]
4,Net neutrality is pretty irrelevant when a han...,[],[],0.0,[OffN]
...,...,...,...,...,...
58655,⁠Why do wh*te people like to play hockey? It’...,"[""black folks"", ""white folks""]",makes light of racial tension with police,1.0,[OffY]
58656,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,"[""muslim folks""]",muslims are terrorists,1.0,[OffY]
58657,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,"[""muslim folks""]",muslims like bombs,1.0,[OffY]
58658,🚨BREAKING: illegal alien 5x deported on 7 felo...,"[""immigrants""]",immigrants are criminals,1.0,[OffY]


In [62]:
list_group = []
for i in range(0, len(training_csv)):
    list_group.append(",".join(map(str, literal_return(training_csv.iloc[[i]]["group"].values[0]))))
    #list_group.append(",".join(map(str, literal_return(training_csv.iloc[[i]]["group"]) if not None)))
    
training_csv["group"] = list_group
training_csv.reset_index(drop=True)

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
0,Nelly got all the hoes singin lol,,[],0.0,[OffN]
1,Nelson snaps me a pic of him in a cocoon sleep...,,[],0.0,[OffN]
2,Neocels\n..is the term to describe normies who...,,[],0.0,[OffN]
3,Nerd followers: I'm monitoring hashtag/mention...,,[],,[OffN]
4,Net neutrality is pretty irrelevant when a han...,,[],0.0,[OffN]
...,...,...,...,...,...
58655,⁠Why do wh*te people like to play hockey? It’...,"black folks,white folks",makes light of racial tension with police,1.0,[OffY]
58656,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims are terrorists,1.0,[OffY]
58657,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims like bombs,1.0,[OffY]
58658,🚨BREAKING: illegal alien 5x deported on 7 felo...,immigrants,immigrants are criminals,1.0,[OffY]


In [63]:
training_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
17712,Nelly got all the hoes singin lol,,[],0.0,[OffN]
17713,Nelson snaps me a pic of him in a cocoon sleep...,,[],0.0,[OffN]
17714,Neocels\n..is the term to describe normies who...,,[],0.0,[OffN]
17715,Nerd followers: I'm monitoring hashtag/mention...,,[],,[OffN]
17716,Net neutrality is pretty irrelevant when a han...,,[],0.0,[OffN]
...,...,...,...,...,...
76367,⁠Why do wh*te people like to play hockey? It’...,"black folks,white folks",makes light of racial tension with police,1.0,[OffY]
76368,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims are terrorists,1.0,[OffY]
76369,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,muslim folks,muslims like bombs,1.0,[OffY]
76370,🚨BREAKING: illegal alien 5x deported on 7 felo...,immigrants,immigrants are criminals,1.0,[OffY]


In [64]:
dev_csv

Unnamed: 0,post,group,implications,whoTarget,offensiveYN
2333,"Over 10,000 arrests for child trafficking sinc...",,[],,[OffN]
2334,PSA\nY'all should stay away from stupidslutscl...,,[],0.0,[OffN]
2335,PSA: Remember to share the Atomic Blackpill on...,,[],0.0,[OffN]
2336,Partly Cloudy today! With a high of 16C.and a ...,,[],,[OffN]
2337,Party bus going to LA live at this bar/bowling...,,[],,[OffN]
...,...,...,...,...,...
12514,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,died in the holocaust.,1.0,[OffY]
12515,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,have a troubled past,1.0,[OffY]
12516,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,jews deserved to be burned,1.0,[OffY]
12517,♪ &#9835; ♬ believe it or not i 'm gassing the...,jewish folks,making a joke about the holocaust.,1.0,[OffY]


# MORE DATA

'acl.id' is the unique ID of the entry.

'Text' is the content which has been entered. All content is synthetic.

'Label' is a binary variable, indicating whether or not the content has been identified as hateful. It takes two values: hate, nothate.

'Type' is a categorical variable, providing a secondary label for hateful content. For hate it can take five values: Animosity, Derogation, Dehumanization, Threatening and Support for Hateful Entities. Please see the paper for more detail. For nothate the 'type' is 'none'. In round 1 the 'type' was not given and is marked as 'notgiven'.

'Target' is a categorical variable, providing the group that is attacked by the hate. It can include intersectional characteristics and multiple groups can be identified. For nothate the type is 'none'. Note that in round 1 the 'target' was not given and is marked as 'notgiven'.

'Level' reports whether the entry is original content or a perturbation.

'Round' is a categorical variable. It gives the round of data entry (1, 2, 3 or 4) with a letter for whether the entry is original content ('a') or a perturbation ('b'). Perturbations were not made for round 1.

'Round.base' is a categorical variable. It gives the round of data entry, indicated with just a number (1, 2, 3 or 4).

'Split' is a categorical variable. it gives the data split that the entry has been assigned to. This can take the values 'train', 'dev' and 'test'. The choice of splits is explained in the paper.

'Annotator' is a categorical variable. It gives the annotator who entered the content. Annotator IDs are random alphanumeric strings. There are 20 annotators in the dataset.

'acl.id.matched' is the ID of the matched entry, connecting the original (given in 'acl.id') and the perturbed version.

For identities (recorded under 'Target') we use shorthand labels to constructed the dataset, which can be converted (and grouped) as follows:

In [65]:
import pandas as pd
c=pd.read_csv("https://raw.githubusercontent.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.2.csv", index_col=0)
c.drop(["acl.id", "X1", "acl.id.matched"], axis=1)
# get dynahate model into formal to test our model on (predict), calculate acc
# columns for training_csv: post, implications, offy/n, whoTarget, group
# columns to use for DynaHate : text (post), label (offy/n), target (group), (no stereotypes? See how model functions)

Unnamed: 0,text,label,type,target,level,split,round.base,annotator,round
1,dalits are lowlives,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1
2,gay people are a burden to our society,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1
3,Arabs are not welcome,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1
4,I'm not saying we should actually eliminate he...,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1
5,bananas are for black people,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1
...,...,...,...,...,...,...,...,...,...
41251,I hate drags lol,nothate,none,none,original,test,1,GNZuCtwed3,1
41252,My washing machine is bitch,nothate,none,none,original,test,1,vDe7GN0NrL,1
41253,Fulham is just a bunch of losers,nothate,none,none,original,test,1,TrRF46JWfP,1
41254,K I L L the T R A N S,hate,notgiven,notgiven,original,test,1,oemYWm1Tjg,1


In [66]:
# none but if its offensive
# round 2 and above only 
c = c[(c["round"] != "1")]
c

Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
1290,acl11158,1290,There are too many costas in this country,nothate,none,none,perturbation,train,2,GNZuCtwed3,2b,acl20309
1291,acl11159,1291,There is way too many m0ng0l0id kids at my sch...,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl19858
1293,acl11160,1293,I called my brother a twat and my mom got so m...,nothate,none,none,perturbation,train,2,TrRF46JWfP,2b,acl19997
1294,acl11161,1294,the worst of them are the taaaaaaaaaards becau...,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl11162
1295,acl11162,1295,the worst of them are the viruses because they...,nothate,none,none,perturbation,train,2,LqLKTtrOmx,2b,acl11161
...,...,...,...,...,...,...,...,...,...,...,...,...
34727,acl41251,34727,I have that opinion. absolutely everyone I act...,nothate,none,none,perturbation,dev,4,dqrONtdjbt,4b,acl34594
34728,acl41252,34728,"This dark female, I am not going to use the hu...",hate,dehumanization,bla.wom,original,dev,4,CAgNlUizNm,4a,acl35570
34729,acl41253,34729,I guess you are assuming that the french laws ...,hate,derogation,immig,perturbation,dev,4,TbUBpfn6iP,4b,acl35966
34730,acl41254,34730,"I mean, theres no one better right now to give...",nothate,none,none,original,dev,4,GNZuCtwed3,4a,acl36963


In [67]:
c["label"] = c["label"].apply(lambda label: "[OffN]" if label=="nothate" else "[OffY]")

In [68]:
c = c[c['target'].map(c['target'].value_counts()) >= 200].reset_index()

In [69]:
#clean up groupings to be what model is used to from SBIC
c["target"] = c["target"].map({"notgiven" : "notgiven", "wom":"women", "bla":"black", "jew":"jewish", "mus":"muslim", "immig":"immigrants", "asi.south":"south asians", "none" : "none", "transgender" : "transgender people", "ref":"refugee", "arab":"arabic people", "indig" : "indigenous", "mixed.race" : "mixed people", "asi" : "asian", "african" : "african", "hispanic":"hispanic people", "mus.wom" : "muslim women"})

In [70]:
c["target"].value_counts().to_dict()

{'none': 15033,
 'women': 2035,
 'black': 1961,
 'jewish': 1096,
 'muslim': 1002,
 'immigrants': 672,
 'refugee': 470,
 'south asians': 338,
 'arabic people': 338,
 'indigenous': 239,
 'mixed people': 201}

In [71]:
#columns to use for DynaHate : text (post), label (offy/n), target (group), (no stereotypes? See how model functions)
#["post", "group", "implications", "whoTarget", "offensiveYN"]
c.drop(["index", "acl.id", "X1", "round", "acl.id.matched", "annotator", "level"], axis=1, index=None, inplace=True)
c.columns=["post", "offensiveYN", "type", "group", "round.base", "split"]
c

Unnamed: 0,post,offensiveYN,type,group,round.base,split
0,There are too many costas in this country,[OffN],none,none,train,2
1,There is way too many m0ng0l0id kids at my sch...,[OffY],derogation,,train,2
2,I called my brother a twat and my mom got so m...,[OffN],none,none,train,2
3,the worst of them are the taaaaaaaaaards becau...,[OffY],derogation,,train,2
4,the worst of them are the viruses because they...,[OffN],none,none,train,2
...,...,...,...,...,...,...
26868,If she isn't the one for you she's just the on...,[OffN],none,none,dev,4
26869,I have that opinion. absolutely everyone I act...,[OffN],none,none,dev,4
26870,I guess you are assuming that the french laws ...,[OffY],derogation,immigrants,dev,4
26871,"I mean, theres no one better right now to give...",[OffN],none,none,dev,4


In [72]:
#split data into dev, test, and train
dev_dyna, test_dyna, train_dyna = [y for x, y in c.groupby('split', as_index=False)]
dev_dyna.drop(["split"], axis=1, inplace=True)
test_dyna.drop(["split"], axis=1, inplace=True)
train_dyna.drop(["split"], axis=1, inplace=True)
# QUESTION: What rows are necessary, and how are they translatable to SBIC?
# if it's too complicated, you can take only the stage3 posts and see how our model does on the posts that we know have implications? 
# And we can measure the offensiveness classification (ie. stage 1) separately

In [73]:
all_data_dev = pd.DataFrame()
all_data_dev = all_data_dev.append(dev_dyna)[dev_dyna.columns.tolist()]
all_data_dev = all_data_dev.append(dev_csv).dropna(axis=1, how="all")
all_data_dev["whoTarget"] = all_data_dev["whoTarget"].replace(np.NaN, 0.0)
all_data_dev.reset_index()

Unnamed: 0,index,post,offensiveYN,type,group,round.base,implications,whoTarget
0,0,There are too many costas in this country,[OffN],none,none,train,,0.0
1,1,There is way too many m0ng0l0id kids at my sch...,[OffY],derogation,,train,,0.0
2,2,I called my brother a twat and my mom got so m...,[OffN],none,none,train,,0.0
3,3,the worst of them are the taaaaaaaaaards becau...,[OffY],derogation,,train,,0.0
4,4,the worst of them are the viruses because they...,[OffN],none,none,train,,0.0
...,...,...,...,...,...,...,...,...
19080,12514,♪ &#9835; ♬ believe it or not i 'm gassing the...,[OffY],,jewish folks,,died in the holocaust.,1.0
19081,12515,♪ &#9835; ♬ believe it or not i 'm gassing the...,[OffY],,jewish folks,,have a troubled past,1.0
19082,12516,♪ &#9835; ♬ believe it or not i 'm gassing the...,[OffY],,jewish folks,,jews deserved to be burned,1.0
19083,12517,♪ &#9835; ♬ believe it or not i 'm gassing the...,[OffY],,jewish folks,,making a joke about the holocaust.,1.0


In [74]:
all_data = pd.DataFrame()
all_data = all_data.append(train_dyna)[train_dyna.columns.tolist()]
all_data = all_data.append(training_csv).dropna(axis=1, how="all")
all_data["whoTarget"] = all_data["whoTarget"].replace(np.NaN, 0.0)
all_data.reset_index()

Unnamed: 0,index,post,offensiveYN,type,group,round.base,implications,whoTarget
0,17850,Very much so. The refugee migrants are a can o...,[OffY],derogation,refugee,train,,0.0
1,17851,"Sorry lads, this isn't a happy ever after and ...",[OffY],derogation,refugee,train,,0.0
2,17852,I feel very happy to live in Eastern Europe ri...,[OffN],none,none,train,,0.0
3,17853,We offer protection and welcome refugees from ...,[OffY],derogation,refugee,train,,0.0
4,17854,We offer protection and welcome refugees from ...,[OffN],none,none,train,,0.0
...,...,...,...,...,...,...,...,...
67678,76367,⁠Why do wh*te people like to play hockey? It’...,[OffY],,"black folks,white folks",,makes light of racial tension with police,1.0
67679,76368,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,[OffY],,muslim folks,,muslims are terrorists,1.0
67680,76369,🏢🏢 👳🏽‍♂️✈️➡️➡️🏢🏢➡️➡️💥💥➡️➡️🤷🏻‍♂️🤷🏻‍♂️\n\n,[OffY],,muslim folks,,muslims like bombs,1.0
67681,76370,🚨BREAKING: illegal alien 5x deported on 7 felo...,[OffY],,immigrants,,immigrants are criminals,1.0


In [75]:
#merge each file
from functools import reduce
df1 = pd.read_csv("implicit-hate-corpus/implicit_hate_v1_stg1.tsv", sep='\t')
df2 = pd.read_csv("implicit-hate-corpus/implicit_hate_v1_SAP_posts.tsv", sep="\t")
df3 = pd.read_csv("implicit-hate-corpus/implicit_hate_v1_stg2.tsv", sep="\t")
df4 = pd.read_csv("implicit-hate-corpus/implicit_hate_v1_stg3.tsv", sep="\t")
df5 = pd.read_csv("implicit-hate-corpus/implicit_hate_v1_stg2_posts.tsv", sep="\t")
df6 = pd.read_csv("implicit-hate-corpus/implicit_hate_v1_stg3_posts.tsv", sep="\t")
df7 = pd.read_csv("implicit-hate-corpus/implicit_hate_v1_stg1_posts.tsv", sep="\t")

# combine each stage together
stg1_df = df1.merge(df7["post"], left_index=True, right_index=True)
#stg2_df = df3.merge(df5["post"], left_index=True, right_index=True)
stg3_df = df4.merge(df6["post"], left_index=True, right_index=True)
total_df = pd.concat([stg1_df, stg3_df], ignore_index=True)
total_df
#data_frames = [df1, df2, df3, df4, df5, df6, df7]
#df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['ID'],
#                                            how='outer'), data_frames)

Unnamed: 0,ID,class,post,target,implied_statement
0,399886440588247041,implicit_hate,""" : jewish harvard professor noel ignatiev w...",,
1,929901925100937216,not_hate,b.higher education is a part of european cult...,,
2,728678509497954304,not_hate,"has a problem with "" the whites "" "" and "" "" ...",,
3,625688620444180481,not_hate,is yasir qadhi a hate preacher for calling ch...,,
4,441089979322597376,not_hate,"rt "" : how three million germans mass murder...",,
...,...,...,...,...,...
27834,884073715490861056,,your people came to a friendly and highly popu...,Asian folks,Asian folks have destroyed the country
27835,603471575833137153,,it's as if sexual abuse follows perkins around .,Immigrants,Immigrants are sexual offenders
27836,669074062862368768,,every india lover must boycott these pigs movies,Indian folks,Indian folks are pigs
27837,466220911675179008,,ask more rss intellectuals to come on powerful...,Indian folks,Indian folks are uneducated


In [76]:
stg1_df.drop(["ID", "class"], axis=1, inplace=True)
stg1_df["offensiveYN"] = ["[OffN]"] * len(stg1_df)
stg1_df["whoTarget"] = [1.0] * len(stg1_df)

In [77]:
#Add offn from stage one to balance out 
# deduplicate non offensive posts (compared to SBIC) 
stg3_df.drop(["ID"], axis=1, inplace=True)
stg3_df.columns = ["group", "implications", "post"]
stg3_df["offensiveYN"] = ["[OffY]"] * len(stg3_df)
stg3_df["whoTarget"] = [1.0] * len(stg3_df)

In [78]:
stg3_df = stg3_df.append(stg1_df)

In [79]:
stg3_df

Unnamed: 0,group,implications,post,offensiveYN,whoTarget
0,minority groups,people in minority groups are all in gangs,"rt "" white privilege "" "" is when the gun jam...",[OffY],1.0
1,non-white people,non-white people are dangerous,if you are white in that state watch yourself !,[OffY],1.0
2,Immigrants,Immigrants are undesirable,so the 500 % increase in h1b visas ? yeah i'm ...,[OffY],1.0
3,Anti-whites,Anti-whites hate white people,rt antiwhites expose their hatred of white ki...,[OffY],1.0
4,Capitalists,Capitalists don't like white people.,capitalism like communism is an enemy of the...,[OffY],1.0
...,...,...,...,...,...
21475,,,"Seeing prostitutes is morally wrong, but being...",[OffN],1.0
21476,,,I wonder how many females I raped today I spen...,[OffN],1.0
21477,,,Having a criminal record is more attractive to...,[OffN],1.0
21478,,,Another advantage of being a female: getting b...,[OffN],1.0


In [80]:
#split latent hate into 3 datasets 70/15/15
train_lh, validate_lh, test_lh = \
              np.split(stg3_df.sample(frac=1, random_state=42), 
                       [int(.7*len(stg3_df)), int(.85*len(stg3_df))])

In [81]:
all_data = all_data.append(train_lh).reset_index()
#all_data["implications"] = all_data["implications"].replace(np.NaN, "[]", inplace=True)

In [82]:
all_data_dev = all_data_dev.append(validate_lh).reset_index()

In [83]:
all_data.drop(["index"], axis=1, inplace=True)
all_data_dev.drop(["index"], axis=1, inplace=True)

In [84]:
training_csv = all_data
dev_csv = all_data_dev

In [85]:
training_csv

Unnamed: 0,post,offensiveYN,type,group,round.base,implications,whoTarget
0,Very much so. The refugee migrants are a can o...,[OffY],derogation,refugee,train,,0.0
1,"Sorry lads, this isn't a happy ever after and ...",[OffY],derogation,refugee,train,,0.0
2,I feel very happy to live in Eastern Europe ri...,[OffN],none,none,train,,0.0
3,We offer protection and welcome refugees from ...,[OffY],derogation,refugee,train,,0.0
4,We offer protection and welcome refugees from ...,[OffN],none,none,train,,0.0
...,...,...,...,...,...,...,...
87165,there's only one side : the side holding a fre...,[OffN],,,,,1.0
87166,gosh darn to bad we whites deserve it for bei...,[OffN],,,,,1.0
87167,would not be to start with without whites . do...,[OffN],,,,,1.0
87168,jews freak out after german policemen play goe...,[OffN],,,,,1.0


# Regular Dataset
- [boi] post [eoi] [boo] [offY] [grp/ind] group [ste] stereotype [eoo]
- 50/50 split of Offy and Offn

In [None]:
final_input = []
final_output = []
for i in training_csv.index:
    final_input.append("[boi] " + str(training_csv.loc[[i]]["post"].values[0]) + " [eoi]")
    if (training_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    else:
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [None]:
regular_df = pd.DataFrame()

In [None]:
regular_df["input"] = final_input
regular_df["output"] = final_output

In [None]:
df = regular_df[regular_df['output'].str.contains('ind')]

In [None]:
df["input"].values.tolist()[5]

In [None]:
regular_df.to_csv("SBIC.trn.4.reg.csv", index=False)

In [None]:
offY_df = dev_csv.where(dev_csv["offensiveYN"] == "[OffY]").dropna(subset=['offensiveYN'])
num_offY = len(offY_df.index)
offN_df = dev_csv.where(dev_csv["offensiveYN"] == "[OffN]").dropna(subset=['offensiveYN']).sample(n=num_offY).reset_index()
fifty_df = pd.concat([offY_df, offN_df])
# Shuffle the indices
fifty_df = fifty_df.sample(frac=1).reset_index(drop=True)

In [None]:
fifty_df

In [None]:
dev_csv = fifty_df

In [None]:
final_input = []
final_output = []
for i in dev_csv.index:
    final_input.append("[boi] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [eoi]")
    if (dev_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    else:
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [None]:
regular_df = pd.DataFrame()

In [None]:
regular_df["input"] = final_input
regular_df["output"] = final_output

In [None]:
regular_df

In [None]:
regular_df.to_csv("SBIC.dev.3.5050.csv", index=False)

# Scrambled Dataset
- need to overrepresent offensive samples and have more structure with the tokens so the model learns
- create each kind of scramble for each input
#### Most important:
1) generate stereotype given just post
2) generate stereotype given post, group, and offensiveness
3) generate post given group, sterotype, and offensiveness

Making three kinds of scrambles:
1) regular, with new tokens
2) full scramble, with new tokens
3) important scramble, with new tokens

[bos] post [EOS] [offN] [eos] \
[offN] [CLS] [bos] post [eos]

[bos] post [CLS] [offY] [ind] [eos]
[offY] [ind] [CLS] [bos] post [eos]

[bos] post [CLS] [offY] [grp/ind] group [ste] stereotype [eos] \
[bos] post [offY] [grp/ind] group [CLS] [ste] stereotype [eos] \
[bos] post [offY] [CLS] [grp/ind] group [ste] stereotype [eos] 

[offY] [grp/ind] group [ste] stereotype [CLS] [bos] post [eos] \
[offY] [grp/ind] group [CLS] post [ste] stereotype [eos] \
[offY] [grp/ind] group [CLS] [ste] stereotype [bos] post [eos] \
[offY] [ste] stereotype [CLS] [bos] post [grp/ind] group [eos]

[offY] [CLS] [bos] post [ste] stereotype [grp/ind] group [eos] 


#### Always keep post in the input, but scramble where group/implication/offY shows up (input or output)

In [87]:
final_input = []
final_output = []
for i in training_csv.index:
    # If it isn't offensive, do the usual (from "regular" scramble)
    if (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    # Otherwise, scramble up the input and output according to what is above
    elif (training_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        # Case 1: given offensiveness, group, and stereotype, get post
#        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoi]") 
#        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
#        # Case 3: given offensiveness, generate post, stereotype, and group
#        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]") 
#        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls] " + "[ind] [eoo]")
        # Case 4: regular 
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]" and training_csv.loc[[i]]["implications"].values[0] == np.NaN):
        # Case 1: given post and offensiveness, generate implication and group
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" 
                           + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] " + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoo]")
        # Case 2: given post and group, generate implication and offensiveness
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
        # Case 3: given post and implication, generate group and offensiveness
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoo]")
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoo]")
    elif (training_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]" and (training_csv.loc[[i]]["group"].values[0] == np.NaN or training_csv.loc[[i]]["group"].values[0] == "none")):
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" 
                           + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] " + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 2: given post and group, generate implication and offensiveness
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 3: given post and implication, generate group and offensiveness
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
    else:
        # Case 1: given offensiveness, group, and stereotype, get post
#        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
#                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
#                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
#        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 2: given offensiveness and group, generate stereotype and post
#        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
#                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoi]")
#        final_output.append("[boo] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
#                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]") 
        # Case 3: given offensiveness, generate post, stereotype, and group
#        final_input.append("[boi] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
#        final_output.append("[boo] [bos]" + str(training_csv.loc[[i]]["post"].values[0]) + " [cls] [grp] " + 
#                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
#                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")   
        # Case 1: given post and offensiveness, generate implication and group
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" 
                           + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] " + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 2: given post and group, generate implication and offensiveness
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 3: given post and implication, generate group and offensiveness
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [eoo]")
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(training_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(training_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(training_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(training_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [88]:
full_scramble_df = pd.DataFrame()

In [89]:
full_scramble_df["input"] = final_input
full_scramble_df["output"] = final_output

In [90]:
full_scramble_df

Unnamed: 0,input,output
0,[boi] [bos] Very much so. The refugee migrants...,[boo] [OffY][ind] [eoo]
1,"[boi] [bos] Sorry lads, this isn't a happy eve...",[boo] [OffY][ind] [eoo]
2,[boi] [bos] I feel very happy to live in Easte...,[boo] [OffN] [eoo]
3,[boi] [bos] We offer protection and welcome re...,[boo] [OffY][ind] [eoo]
4,[boi] [bos] We offer protection and welcome re...,[boo] [OffN] [eoo]
...,...,...
244545,[boi] [bos] there's only one side : the side h...,[boo] [OffN] [eoo]
244546,[boi] [bos] gosh darn to bad we whites deserv...,[boo] [OffN] [eoo]
244547,[boi] [bos] would not be to start with without...,[boo] [OffN] [eoo]
244548,[boi] [bos] jews freak out after german police...,[boo] [OffN] [eoo]


In [91]:
full_scramble_df.to_csv("SBIC.trn.scramble.4.csv", index=False)

In [92]:
final_input = []
final_output = []
for i in dev_csv.index:
    # If it isn't offensive, do the usual (from "regular" scramble)
    if (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffN]"):
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
    # Otherwise, scramble up the input and output according to what is above
    elif (dev_csv.loc[[i]]["whoTarget"].values[0] == 0.0 and dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]"):
        # Case 1: given offensiveness, group, and stereotype, get post
#        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoi]") 
#        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
#        # Case 3: given offensiveness, generate post, stereotype, and group
#        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]") 
#        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls] " + "[ind] [eoo]")
        # Case 4: regular 
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + "[ind] [eoo]")
    elif (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]" and dev_csv.loc[[i]]["implications"].values[0] == np.NaN):
        print("here")
        # Case 1: given post and offensiveness, generate implication and group
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" 
                           + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] " + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoo]")
        # Case 2: given post and group, generate implication and offensiveness
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
        # Case 3: given post and implication, generate group and offensiveness
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoo]")
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoo]")
    elif (dev_csv.loc[[i]]["offensiveYN"].values[0] == "[OffY]" and (dev_csv.loc[[i]]["group"].values[0] == np.NaN or dev_csv.loc[[i]]["group"].values[0] == "none")):
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" 
                           + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] " + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 2: given post and group, generate implication and offensiveness
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 3: given post and implication, generate group and offensiveness
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoo]")
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
    else:
        # Case 1: given offensiveness, group, and stereotype, get post
#        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
#                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
#                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
#        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoo]")
        # Case 2: given offensiveness and group, generate stereotype and post
#        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
#                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoi]")
#        final_output.append("[boo] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
#                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]") 
        # Case 3: given offensiveness, generate post, stereotype, and group
#        final_input.append("[boi] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
#        final_output.append("[boo] [bos]" + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls] [grp] " + 
#                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
#                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")   
        # Case 1: given post and offensiveness, generate implication and group
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" 
                           + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [eoi]")
        final_output.append("[boo] " + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 2: given post and group, generate implication and offensiveness
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")
        # Case 3: given post and implication, generate group and offensiveness
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [eoo]")
        # Case 4: regular
        final_input.append("[boi] " + "[bos] " + str(dev_csv.loc[[i]]["post"].values[0]) + " [cls]" + " [eoi]")
        final_output.append("[boo] " + str(dev_csv.loc[[i]]["offensiveYN"].values[0]) + " [grp] " + 
                          str(dev_csv.loc[[i]]["group"].values[0]) + " [ste] " + 
                          str(dev_csv.loc[[i]]["implications"].values[0]) + " [eoo]")

In [93]:
full_scramble_df = pd.DataFrame()

In [94]:
full_scramble_df["input"] = final_input
full_scramble_df["output"] = final_output

In [95]:
full_scramble_df

Unnamed: 0,input,output
0,[boi] [bos] There are too many costas in this ...,[boo] [OffN] [eoo]
1,[boi] [bos] There is way too many m0ng0l0id ki...,[boo] [OffY][ind] [eoo]
2,[boi] [bos] I called my brother a twat and my ...,[boo] [OffN] [eoo]
3,[boi] [bos] the worst of them are the taaaaaaa...,[boo] [OffY][ind] [eoo]
4,[boi] [bos] the worst of them are the viruses ...,[boo] [OffN] [eoo]
...,...,...
53118,[boi] [bos] : at least 60 white south africans...,[boo] [OffN] [eoo]
53119,[boi] [bos] i'm white thank god [cls] [eoi],[boo] [OffN] [eoo]
53120,[boi] [bos] trump : i wouldn't accept cruz end...,[boo] [OffN] [eoo]
53121,[boi] [bos] trump is a moderate . we need more...,[boo] [OffN] [eoo]


In [96]:
full_scramble_df.to_csv("SBIC.dev.scramble.4.csv", index=False)

# Reading in 50/50 data

In [None]:
preds = pd.read_csv("predictions_3_pt3.csv", index_col=0)

In [None]:
preds["Source Text"] = preds["Source Text"].apply(lambda x: x.replace("[boi] ", "")).apply(lambda x: x.replace(" [eoi]", ""))

preds["Generated Text"] = preds["Generated Text"].apply(lambda x: x.replace("<pad> ", "")).apply(lambda x: x.replace(" <pad>", ""))
preds.columns = ["post", "output"]

In [None]:
pd.set_option('display.width', 1000)
preds["output"].tolist()

In [None]:
y_test = pd.read_csv("SBIC.v2.agg.dev.csv", index_col=0)[:1000]
y_test

In [None]:
merged_df = pd.merge(preds, y_test, on="post")
merged_df["offensiveYN"] = merged_df["offensiveYN"].apply(lambda label: 1 if label == 1 else 0)

In [None]:
merged_df["offYNPredictions"] = merged_df["output"].apply(lambda label: 1 if "OffY" in label else 0)

In [None]:
subset_merged_df = merged_df[["output", "targetMinority", "targetCategory", "targetStereotype", "offensiveYN"]]
subset_merged_df[merged_df.output.str.contains('OffY')].head(50)