# MHS Dataset

In [1]:
import pandas as pd

In [2]:
# Read the raw parquet file from the working directory and preview the first rows.
df = pd.read_parquet(path='mhs.parquet')
df.head(n=5)

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


In [3]:
# Keep only the raw text and the continuous score; every other column is dropped.
# NOTE(review): the preview above shows separate comment_id and annotator_id
# columns, so the same text may occur on several rows - confirm whether
# de-duplication is wanted before this data is used for training.
df = df.loc[:, ['text', 'hate_speech_score']]

In [4]:
# Preview the two retained columns.
df.head(n=5)

Unnamed: 0,text,hate_speech_score
0,Yes indeed. She sort of reminds me of the elde...,-3.9
1,The trans women reading this tweet right now i...,-6.52
2,Question: These 4 broads who criticize America...,0.36
3,It is about time for all illegals to go back t...,0.26
4,For starters bend over the one in pink and kic...,1.54


### Labelling Data
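According to the dataset documentation, `hate_speech_score` is a continuous measure of hate speech: the higher the score, the more hateful the comment. As a rule of thumb, a score above 0.5 is approximately hate speech, a score below -1 is counter or supportive speech, and a score between -1 and +0.5 is neutral or ambiguous. The cell below turns this into a binary label: `HS = 1` when the score is greater than 0.5 and `HS = 0` otherwise, so neutral and counter/supportive speech both fall into class 0.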

In [5]:
def label_data(hate_speech_score, threshold=0.5):
    """Binarise a continuous hate speech score.

    Parameters
    ----------
    hate_speech_score : float
        Continuous score; higher means more hateful.
    threshold : float, default 0.5
        Cut-off above which a score counts as hate speech. The comparison is
        strict, so a score exactly equal to the threshold is labelled 0.

    Returns
    -------
    int
        1 if ``hate_speech_score > threshold``, otherwise 0. A NaN score
        compares False and is therefore labelled 0.
    """
    return 1 if hate_speech_score > threshold else 0
# Derive the binary HS column from the continuous score, then discard the score
# so that only `text` and `HS` remain.
df = (
    df
    .assign(HS=df['hate_speech_score'].apply(label_data))
    .drop(columns=['hate_speech_score'])
)
df.head(n=5)

Unnamed: 0,text,HS
0,Yes indeed. She sort of reminds me of the elde...,0
1,The trans women reading this tweet right now i...,0
2,Question: These 4 broads who criticize America...,0
3,It is about time for all illegals to go back t...,0
4,For starters bend over the one in pink and kic...,1


In [6]:
import re

def removeURLs(tweet):
    """Delete every http:// or https:// URL from the text.

    A URL is taken to run up to the next whitespace character of any kind
    (space, tab, newline, ...), so text on the line after a URL is preserved.
    Surrounding whitespace is left untouched.
    """
    # \S+ rather than [^ ]+: the latter only stops at a literal space and would
    # swallow the word following "http://...\n".
    return re.sub(r'https?://\S+', '', tweet)

def removeMentions(tweet):
    """Delete every @mention from the text.

    A mention is '@' followed by all characters up to the next whitespace
    character of any kind, so a mention at the end of a line does not take the
    first word of the following line with it.
    """
    # \S+ rather than [^ ]+ so that tabs and newlines also terminate a mention.
    return re.sub(r'@\S+', '', tweet)

def removeHashtags(tweet):
    """Strip every '#' character while keeping the tag text that follows it."""
    hash_sign = re.compile(r'#')
    return hash_sign.sub('', tweet)

def wordNormalization(tweet):
    """Collapse any run of three or more identical ASCII letters to one letter.

    Matching is case-sensitive, and runs of exactly two letters are left alone.
    """
    elongated_run = re.compile(r'([A-Za-z])\1{2,}')
    # Replace each run with the single captured letter.
    return elongated_run.sub(lambda run: run.group(1), tweet)

def removeNonAlphabetic(tweet):
    """Reduce the text to ASCII letters and spaces.

    Steps, in order:
    1. HTML entities such as ``&amp;`` or ``&#39;`` are replaced by a space.
       Only a complete ``&name;`` / ``&#digits;`` token is matched, so words
       glued to an ampersand ("rock&amp;roll", "R&B") are no longer deleted.
    2. Every whitespace character (newline, tab, ...) becomes a plain space,
       so words on adjacent lines stay separated instead of being fused.
    3. Every remaining character that is not A-Z, a-z or a space is removed.

    Runs of spaces are not collapsed here.
    """
    tweet = re.sub(r'&#?\w+;', ' ', tweet)
    tweet = re.sub(r'\s', ' ', tweet)
    return re.sub(r'[^A-Za-z ]', '', tweet)

def lowerCasing(tweet):
    """Return the text converted to lowercase."""
    lowered = tweet.lower()
    return lowered

def removeExtraSpaces(tweet):
    """Collapse runs of spaces to a single space and trim both ends.

    Only the space character is collapsed; the final trim uses ``str.strip()``
    and therefore removes any leading or trailing whitespace.
    """
    return re.sub(' +', ' ', tweet).strip()

def removeRetweetChar(tweet):
    """Remove the standalone lowercase token 'rt'.

    The match is case-sensitive and bounded by word boundaries, so 'RT' and
    words that merely contain 'rt' are left alone.
    """
    retweet_marker = re.compile(r'\brt\b')
    return retweet_marker.sub('', tweet)

In [7]:
def tweetPreprocessingPipeline(tweet):
    """Run every cleaning step on a single text and return the result.

    The order matters: URLs and mentions are removed while '://' and '@' are
    still present, and the 'rt' marker is removed only after lowercasing
    because that step matches lowercase 'rt' exclusively.
    """
    stages = (
        removeURLs,
        removeMentions,
        removeHashtags,
        wordNormalization,
        removeNonAlphabetic,
        lowerCasing,
        removeRetweetChar,
        removeExtraSpaces,
    )
    for stage in stages:
        tweet = stage(tweet)
    return tweet

In [8]:
# Clean every text with the preprocessing pipeline.
df['text'] = df['text'].apply(tweetPreprocessingPipeline)
# A text made up solely of URLs, mentions, digits, emoji or punctuation is now
# the empty string. Such rows carry no signal, and an empty CSV field is read
# back by pd.read_csv as NaN, so drop them before the data is exported.
df = df.loc[df['text'].ne('')].reset_index(drop=True)
df.head()

Unnamed: 0,text,HS
0,yes indeed she sort of reminds me of the elder...,0
1,the trans women reading this tweet right now i...,0
2,question these broads who criticize america wh...,0
3,it is about time for all illegals to go back t...,0
4,for starters bend over the one in pink and kic...,1


In [9]:
# Write the cleaned text and its HS label to CSV, omitting the row index.
df.to_csv(path_or_buf='preprocessed_data.csv', index=False)