# LABELLED DATA EXPLORATION

In [1]:
import pandas as pd
import csv

# ANNOTATED DATA

In [2]:
# load the raw annotation table (tab-separated file, one row per annotation)
import_df = pd.read_csv(
    './Data/GabHateCorpus_annotations.tsv',
    delimiter='\t',
)

In [3]:
# keep only the columns needed downstream and give them the names used in the rest of the notebook
kennedy_df = (
    import_df
    .loc[:, ['ID', 'Annotator', 'Text', 'Hate']]
    .rename(columns={
        'ID': 'case_id',
        'Annotator': 'ann_id',
        'Text': 'text',
        'Hate': 'label',
    })
)
kennedy_df

Unnamed: 0,case_id,ann_id,text,label
0,27044,4,Ah the PSYOPS antifa crew is back. That’s how ...,0
1,27044,15,Ah the PSYOPS antifa crew is back. That’s how ...,0
2,27044,10,Ah the PSYOPS antifa crew is back. That’s how ...,0
3,27044,8,Ah the PSYOPS antifa crew is back. That’s how ...,0
4,27045,4,Get the new Android app update released today ...,0
...,...,...,...,...
86524,9188,6,"He thinks only peons own guns, you're supposed...",1
86525,9188,3,"He thinks only peons own guns, you're supposed...",0
86526,9222,11,America must stop funding the United Nations.,0
86527,9222,3,America must stop funding the United Nations.,0


In [4]:
# number of documents (distinct case ids)
len(kennedy_df['case_id'].unique())

27665

In [5]:
# number of annotators (distinct annotator ids)
len(kennedy_df['ann_id'].unique())

18

In [6]:
# number of annotators per document
# First count the annotation rows of each case, then tally how many cases share each count.
# Passing the per-case count Series directly as a groupby key would align it on the
# frame's row index instead of on case_id, which is only correct if case ids happen to
# coincide with row labels.
kennedy_df.groupby('case_id').text.count().value_counts().sort_index()

# issues:
# two annotators is very little
# even number of annotators leads to ties

text
2.0      119
3.0    24216
4.0     3110
5.0      165
6.0       19
7.0       24
8.0       12
Name: text, dtype: int64

In [7]:
# per-case proportion of votes for the hateful class (mean of the label column),
# attached to every annotation row of that case
agreement = kennedy_df.groupby('case_id')['label'].mean().rename('prop_hateful')
kennedy_df = pd.merge(kennedy_df, agreement, on='case_id')

In [8]:
# drop redundant label and annotator id columns, then drop duplicates
# errors='ignore' lets the cell be re-run after the columns are already gone.
# The chained result is assigned back so the reset index is actually stored on
# kennedy_df rather than only appearing in the displayed output.
kennedy_df = (
    kennedy_df
    .drop(columns=['ann_id', 'label'], errors='ignore')
    .drop_duplicates()
    .reset_index(drop=True)
)
kennedy_df

Unnamed: 0,case_id,text,prop_hateful
0,27044,Ah the PSYOPS antifa crew is back. That’s how ...,0.000000
1,27045,Get the new Android app update released today ...,0.000000
2,27046,Study: Google Pushes Liberal News in Top 5 Sea...,0.000000
3,27048,#Breaking: Claiming bias against conservatives...,0.250000
4,27049,"“These [companies] are run by sociopaths,” he ...",0.250000
...,...,...,...
27660,7132,yeah we wont be deleting these accounts so neo...,0.666667
27661,9078,"Wow, watch this yarmie wearing Jew brag about ...",0.333333
27662,9167,I'm available as a Common Sense Consultant sin...,0.000000
27663,9188,"He thinks only peons own guns, you're supposed...",0.333333


In [9]:
# number of cases at each proportion of hateful votes
kennedy_df.groupby('prop_hateful')['text'].count()

# note: prop_hateful == 0.5 rows are ties (226 of them)!

prop_hateful
0.000000    19852
0.125000        1
0.142857        3
0.200000       30
0.250000      660
0.285714        2
0.333333     4517
0.400000       35
0.428571        2
0.500000      226
0.571429        1
0.600000       25
0.625000        2
0.666667     1605
0.714286        2
0.750000      109
0.800000        8
0.833333        1
1.000000      584
Name: text, dtype: int64

In [10]:
# create label column from prop_hateful by majority vote
def return_label(prop, threshold=0.5):
    """Convert a proportion of 'hateful' votes into a binary label.

    Args:
        prop: proportion of votes for the hateful class.
        threshold: cut-off that `prop` must strictly exceed to be labelled 1.
            The default of 0.5 corresponds to a simple majority vote.

    Returns:
        1 if `prop` is strictly greater than `threshold`, otherwise 0.
        A `prop` exactly equal to the threshold (a tie under the default)
        therefore yields 0.
    """
    return 1 if prop > threshold else 0

# one binary label per case, derived element-wise from its proportion of hateful votes
kennedy_df['label'] = kennedy_df['prop_hateful'].apply(return_label)

In [11]:
# proportion of hateful cases: rows labelled 1 (with non-null text) over all rows
n_hateful = kennedy_df.loc[kennedy_df['label'] == 1, 'text'].count()
n_cases = len(kennedy_df)
print('proportion of hateful cases: {:.1%}'.format(n_hateful / n_cases))

proportion of hateful cases: 8.4%


In [12]:
# largest case id present in the cleaned frame
kennedy_df['case_id'].max()

27664

In [13]:
# export the cleaned frame to csv, without writing the row index as a column
kennedy_df.to_csv(
    path_or_buf='./Data/kennedy_clean.csv',
    index=False,
)

In [None]:
# TO DO: duplicate text handling
# list every row whose text occurs more than once, sorted so identical texts sit together
is_repeated_text = kennedy_df['text'].duplicated(keep=False)
kennedy_df[is_repeated_text].sort_values(by='text')