# DATA CLEANING AND SELECTION

### Initialise packages

In [1]:
import pandas as pd
import re

from html import unescape

### Import data

In [2]:
# Read the raw CSV, using its first column as the index.
df = pd.read_csv('./0_data/raw/davidson2017.csv', index_col=0)
print('initial number of cases', len(df))

# Keep only the cases whose `count` value is 3 (three annotators).
df = df.loc[df['count'].eq(3)].copy()
print('cases annotated by three annotators', len(df))

initial number of cases 24783
cases annotated by three annotators 22807


### Clean data

In [3]:
# Remove the `count` column; nothing happens if it is already absent.
df = df.drop(columns=['count'], errors='ignore')

# Switch to more descriptive column names; names that are missing are skipped.
df = df.rename(
    columns={
        "class": "label_multi",
        "tweet": "text",
        'hate_speech': 'n_hateful',
        'offensive_language': 'n_offensive',
        'neither': 'n_neither',
    },
    errors='ignore',
)

In [4]:
# Mapping from the numeric class id to a readable label name.
id_to_name = {
    0: 'hateful',
    1: 'offensive',
    2: 'neither',
}

# Swap every class id for its name; an id missing from the mapping raises KeyError.
df['label_multi'] = df['label_multi'].map(lambda class_id: id_to_name[class_id])

In [5]:
# Collapse the multi-class label into two classes: 'hateful' versus everything else.
df['label_binary'] = df['label_multi'].map(
    lambda label: 'hateful' if label == 'hateful' else 'non-hateful'
)

In [6]:
# create column indicating whether annotation was unanimous

def check_unanimity(row):
    """Return True when the largest of the three vote counts equals 3.

    `row` must expose `n_hateful`, `n_offensive` and `n_neither` as attributes.
    """
    votes = (row.n_hateful, row.n_offensive, row.n_neither)
    return bool(max(votes) == 3)

# Evaluate each row (axis=1) and store the True/False result in a new column.
df['unanimous'] = df.apply(check_unanimity, axis=1)

In [7]:
def clean_text(text):
    """Normalise a raw tweet string.

    Steps, in order: decode HTML entities, replace @-mentions with ``[USER]``
    and links with ``[URL]``, strip leading exclamation marks, delete
    standalone ``RT`` retweet markers, turn newlines and tabs into spaces,
    and trim surrounding whitespace.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned tweet as a ``str``.
    """
    # convert HTML codes (e.g. "&amp;" -> "&")
    text = unescape(text)

    # replace mentions and URLs with special token
    text = re.sub(r"@[A-Za-z0-9_-]+", '[USER]', text)
    text = re.sub(r"http\S+", '[URL]', text)

    # remove exclamation marks at beginning of string (quirk in Davidson data)
    text = text.lstrip('!')

    # remove retweet markers wherever they occur; the word boundaries keep
    # words that merely contain the letters "RT" (e.g. "PARTY") intact
    text = re.sub(r"\bRT\b", '', text)

    # replace newline and tab characters with a space
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')

    # strip whitespace
    text = text.strip()

    return text
    
# Add a cleaned version of every tweet, then display the frame.
df['text_clean'] = df['text'].apply(clean_text)
df

Unnamed: 0,n_hateful,n_offensive,n_neither,label_multi,text,label_binary,unanimous,text_clean
0,0,0,3,neither,!!! RT @mayasolovely: As a woman you shouldn't...,non-hateful,True,[USER]: As a woman you shouldn't complain abou...
1,0,3,0,offensive,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,non-hateful,True,[USER]: boy dats cold...tyga dwn bad for cuffi...
2,0,3,0,offensive,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,non-hateful,True,[USER] Dawg!!!! [USER]: You ever fuck a bitch...
3,0,2,1,offensive,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,non-hateful,False,[USER]: [USER] she look like a tranny
5,1,2,0,offensive,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just...",non-hateful,False,"""[USER]: The shit just blows me..claim you so ..."
...,...,...,...,...,...,...,...,...
25290,2,1,0,hateful,you're such a retard i hope you get type 2 dia...,hateful,False,you're such a retard i hope you get type 2 dia...
25291,0,2,1,offensive,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,non-hateful,False,you's a muthaf***in lie “[USER]: [USER] [USER]...
25292,0,1,2,neither,"you've gone and broke the wrong heart baby, an...",non-hateful,False,"you've gone and broke the wrong heart baby, an..."
25294,0,3,0,offensive,young buck wanna eat!!.. dat nigguh like I ain...,non-hateful,True,young buck wanna eat!!.. dat nigguh like I ain...


### Export clean data to csv

In [8]:
# Copy only the columns meant for export, arranged in a more readable order.
export_df = df.loc[:, [
    'text_clean',
    'label_binary',
    'label_multi',
    'unanimous',
    'n_hateful',
    'n_offensive',
    'n_neither',
]].copy()

In [9]:
# Save the export frame as CSV; the index is left out of the file.
export_df.to_csv(
    path_or_buf='./0_data/clean/davidson2017_clean.csv',
    index=False,
)