# Hate Speech Analysis – Lexical approach
Based on Sentiment Analysis Notebook by (c) Nuno Antonio 2019-2021

### Initial setup

In [8]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji

In [9]:
# Read the cleaned tweets workbook from the data folder into `ds`
base_path = "Data/"
tweets_file = base_path + "Tweets_cleaned.xlsx"
ds = pd.read_excel(tweets_file)

In [15]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True, convert_emojis=False, remove_emojis = False):
    """Clean every text in ``rawText`` and return the non-empty results as a list.

    Steps are applied in this order, each one only when enabled:
    HTML stripping, emoji/emoticon conversion to words, emoji/emoticon removal,
    punctuation removal, number removal, line-break removal, special-character
    removal, lower-casing, and collapsing of repeated spaces.

    Parameters
    ----------
    rawText : iterable
        Texts to clean. Non-string items are reported with a ``print`` and
        converted with ``str()``.
    removeHTML : bool
        Extract the plain text with BeautifulSoup's ``html.parser``.
    charsToRemove : str
        Regular expression; every match is replaced by a space. Empty disables.
    removeNumbers : bool
        Replace every run of digits by a space.
    removeLineBreaks : bool
        Replace ``\\n`` by a space and drop ``\\r``.
    specialCharsToRemove : str
        Regular expression; every match is replaced by a space. Empty disables.
    convertToLower : bool
        Lower-case the text.
    removeConsecutiveSpaces : bool
        Collapse runs of spaces into a single space.
    convert_emojis : bool
        Run the notebook's ``convert_emojis`` and ``convert_emoticons`` helpers.
    remove_emojis : bool
        Run the notebook's ``remove_emoji`` and ``remove_emoticon`` helpers.

    Returns
    -------
    list of str
        Cleaned texts. Texts that end up empty are dropped, so the result can
        be shorter than ``rawText``.
    """
    # The boolean parameter `convert_emojis` shadows the notebook-level function
    # of the same name inside this body, so the function has to be fetched from
    # the module globals; calling the parameter itself would fail on a bool.
    emoji_to_words = globals().get("convert_emojis")

    cleanedText = []
    for x in rawText:

        if not isinstance(x, str):
            print("Type: ", str(type(x)))
            x = str(x)

        # Remove HTML; otherwise start from the raw text so that procText is
        # always defined for the current item
        if removeHTML:
            procText = BeautifulSoup(x,'html.parser').get_text()
        else:
            procText = x

        # Replace emojis and emoticons by their word descriptions
        if convert_emojis:
            if not callable(emoji_to_words):
                raise NameError("convert_emojis=True requires the convert_emojis() helper to be defined")
            procText = emoji_to_words(procText)
            procText = convert_emoticons(procText)

        # Strip emojis and emoticons
        if remove_emojis:
            procText = remove_emoji(procText)
            procText = remove_emoticon(procText)

        # Remove punctuation and other special characters
        if charsToRemove:
            procText = re.sub(charsToRemove,' ',procText)

        # Remove numbers
        if removeNumbers:
            procText = re.sub(r'\d+',' ',procText)

        # Remove line breaks
        if removeLineBreaks:
            procText = procText.replace('\n',' ').replace('\r', '')

        # Remove special characters
        if specialCharsToRemove:
            procText = re.sub(specialCharsToRemove,' ',procText)

        # Normalize to lower case
        if convertToLower:
            procText = procText.lower()

        # Replace multiple consecutive spaces with just one space
        if removeConsecutiveSpaces:
            procText = re.sub(' +', ' ', procText)

        # If there is a text, add it to the clean text
        if procText != '':
            cleanedText.append(procText)

    return cleanedText

In [16]:
# Replace emojis and smileys

# Converting emojis to words
# Using both emot and emoji package to cover missing emojis
def convert_emojis(text):
    """Replace emojis in ``text`` by word descriptions and return the new string.

    First every key of ``UNICODE_EMO`` found in the text is replaced by its
    description with commas and colons removed and whitespace turned into
    underscores. The result is then passed through ``emoji.demojize`` with
    empty delimiters to cover emojis that ``UNICODE_EMO`` does not list.
    """
    # from https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))

    # demojize returns a new string instead of modifying its argument, so the
    # result must be assigned back or this second pass has no effect
    text = emoji.demojize(text, delimiters=("", ""))
    return text
# Turn text emoticons into their word descriptions
# from https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d
def convert_emoticons(text):
    """Replace every EMOTICONS pattern found in ``text`` by its description.

    Each key of ``EMOTICONS`` is used as a regular expression; matches are
    replaced by the mapped description with commas removed and whitespace
    joined by underscores.
    """
    for pattern, description in EMOTICONS.items():
        label = "_".join(description.replace(",","").split())
        text = re.sub(u'(' + pattern + ')', label, text)
    return text

def remove_emoji(text):
    """Return ``text`` with every key of ``UNICODE_EMO`` deleted from it."""
    stripped = text
    for symbol in UNICODE_EMO:
        stripped = stripped.replace(symbol, "")
    return stripped

def remove_emoticon(text):
    """Return ``text`` with every match of an ``EMOTICONS`` key pattern deleted."""
    for pattern in EMOTICONS:
        matcher = re.compile(u'(' + pattern + ')')
        text = matcher.sub("", text)
    return text



In [17]:
# Tokenize texts
def tokenize_words(texts):
    """Tokenize each text with ``word_tokenize`` and return one token list per text.

    The result has exactly one entry per input text, in the same order; a text
    that yields no tokens contributes an empty list. (The previous guard
    compared the token list with ``''``, which is always unequal, so it never
    filtered anything.)
    """
    return [word_tokenize(text) for text in texts]

In [18]:
# Rebuild plain texts from lists of tokens
def recreateText(words):
    """Join each token list in ``words`` with single spaces; return the list of strings."""
    return [' '.join(tokens) for tokens in words[:]]

### Analysis

In [19]:
# Clean the tweet texts (result is a list of strings) and tokenize them
# Casing is kept (convertToLower=False) as it can make a difference in sentiment;
# emojis and emoticons are stripped rather than converted to words.
# NOTE(review): specialCharsToRemove is not passed, so textPreProcess applies its
# default pattern; pass specialCharsToRemove='' if those characters should be kept.
ppText = textPreProcess(
    ds.text,
    removeNumbers=True,
    removeLineBreaks=True,
    convertToLower=False,
    convert_emojis=False,
    remove_emojis=True,
)
text_tokens = tokenize_words(ppText)

Type:  <class 'int'>


In [62]:
# Read the expanded lexicon file; `contents` holds one raw line per element
with open("Hate speech data/expandedLexicon.txt") as lexicon_file:
    contents = list(lexicon_file)

In [63]:
# Keep the part of each stripped line that precedes the first "_".
# Only the first 1237 lines of the file are used.
# NOTE(review): the reason for the 1237 cut-off is not recorded here; confirm against the lexicon file.
hate_speech_lexicon = [line.strip().split("_")[0] for line in contents[0:1237]]

In [64]:
# Words excluded from the hate speech lexicon.
# The filter below drops every occurrence of each word, does not fail when a
# word is absent, and gives the same result when the cell is run again.
# (Repeated list.remove() calls only delete the first occurrence each time and
# raise ValueError once the word is gone.)
LEXICON_EXCLUSIONS = frozenset([
    "horrible", "disgusting", "revolting", "hideous", "psychopath",
    "sociopath", "despicable", "hateful", "prank", "sexist",
    "cruel", "lousy", "weird", "inhuman", "insulting",
    "arrogant", "dishonest", "reprehensible", "selfish", "shameless",
    "ungrateful", "cheating", "unprofessional", "immoral", "disrespectful",
    "ingrate", "delinquent", "shredding", "unethical", "twisted",
    "immorality", "were", "utter", "plagiarism", "decadent",
    "trucker", "ruthless", "unspeakable", "horrendous", "hokey",
    "rudeness", "hurtful", "queer",
    "suicidal", "envy", "despise", "187", "scary",
    "selfishness", "sore", "neurotic", "irresponsible", "grotesque",
    "insensitive", "genderqueer", "fearsome", "disagreeable", "scissor",
    "immature", "ignorant", "lazy", "arent", "awful",
    "useless", "paranoid", "pointless", "sickening",
])

hate_speech_lexicon = [word for word in hate_speech_lexicon if word not in LEXICON_EXCLUSIONS]

In [65]:
# Flag each tweet that contains at least one lexicon word as a token.
# A set lookup replaces scanning the whole lexicon against every tweet; the
# resulting flags are the same (exact, case-sensitive token match).
# NOTE(review): if the tokens keep their original casing, capitalized variants
# of lexicon words will not match; confirm this is intended.
lexicon_terms = set(hate_speech_lexicon)
hate_list = [not lexicon_terms.isdisjoint(tweet_tokens) for tweet_tokens in text_tokens]



In [67]:
# Store the per-tweet hate flags in a new 'Hate' column of the dataset
ds['Hate'] = hate_list

In [68]:
# Display the dataset, now including the 'Hate' column
ds

Unnamed: 0,text,user screen name,user followers,url,created at,replies,retweets,likes,mention,hashtag,Hate
0,Doesn't vaccine approval typically take 6 and ...,spadesgeek,19,https://mobile.twitter.com,2021-06-10 19:24:26.999999,0,0,0,"MatthewDavidH,EricTopol,TheEconomist,US_FDA,la...",,False
1,"You're right, too bad this vaccine doesn't hav...",selirodz,78,http://twitter.com/download/android,2021-06-10 19:24:26.000000,0,0,0,"DharkArk,JoeBiden",,False
2,The vaccine is free…❤️,FlowerGirlBaker,1486,http://twitter.com/#!/download/ipad,2021-06-10 19:23:57.000000,0,0,0,"theredshift11,POTUS,studentsfordemo",,False
3,. the world needs vaccine access now. Th...,DevizesGreens,150,https://mobile.twitter.com,2021-06-10 19:23:49.000000,0,0,0,"BorisJohnson,JustinTrudeau,POTUS,EUCouncil,Reg...","COVID19,G7,EndThePandemic,COVAX",False
4,But you can get it.\nI remember getting my Mum...,canfixstoopid,355,https://mobile.twitter.com,2021-06-10 19:23:42.000000,0,0,0,"SerendipityOr,Shockwave_Shaun,Ozymandiyaas,Joe...",,False
...,...,...,...,...,...,...,...,...,...,...,...
22529,The country need to check on people who take o...,LSungoun,90,http://twitter.com/download/android,2021-05-16 04:11:06.000000,0,0,0,"POTUS,HillaryClinton,Jaemyung_Lee",,False
22530,Why can you not go maskless with out vaccine? ...,bcgov115,38,http://twitter.com/download/iphone,2021-05-16 04:11:01.000000,0,0,0,POTUS,,False
22531,"You say the immunocomprimised ""should discuss ...",dmdmdtweet,1,http://twitter.com/download/android,2021-05-16 04:08:22.999999,0,0,1,CDCgov,,False
22532,"Well, Kate, not everyone is vaccinated and the...",TuffCrusherPlus,83,http://twitter.com/download/iphone,2021-05-16 04:08:03.000000,0,0,0,"50treeK8,OregonGovBrown,CDCgov",,False


In [62]:
# NOTE(review): leftover scratch cell that only evaluates an empty list; safe to delete
[]

[]

In [69]:
ds.to_excel(base_path + "Tweets_Hate.xlsx", index=False)