# Detecting Hate Speech from Tweets

In [8]:
import pandas as pd

tweets = pd.read_csv('../../datasets/av_free_course/hate_speech_twitter.csv')
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [9]:
import re
import unicodedata

def clean_text(text):
    text = text.lower() # Convert to Lowercase
    text = re.sub('rt', '', text) # Removing RT 
    text = re.sub(r'@\w+\s|@\w+:\s|@\w+', '', text) # Removing @mentions
    text = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});').sub('', text) # Removing HTML Tags
    text = re.sub(r'http\S+|https?://\S+|www\.\S+', '', text) # Removing URLs
    text = "".join(char for char in text if unicodedata.category(char)[0]!="C") # removing Control Characters
    text = re.sub('[(\U0001F600-\U0001F92F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F190-\U0001F1FF|\U00002702-\U000027B0|\U0001F926-\U0001FA9F|\u200d|\u2640-\u2642|\u2600-\u2B55|\u23cf|\u23e9|\u231a|\ufe0f)]+','',text) #Removing Emojis
    text = re.sub(r'[^\w\s]','', str(text)) # Removing Punctuations
    text = re.sub(r'[^a-zA-Z\']', ' ', text) # Removing Numbers

    return text

tweets['cleaned_tweet'] = tweets.tweet.apply(lambda x: clean_text(x))

In [10]:
tweets.head(10)

Unnamed: 0,id,label,tweet,cleaned_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so self...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,5,0,factsguide: society now #motivation,factsguide society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...,huge fan fare and big talking before they l...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams ca...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land allin cavs champions clev...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here im its so gr


## Feature Engineering

In [11]:
# Word Frequency
STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

In [12]:
def word_freq_gen(text):
    word_list = []
    for word in text.split():
        word_list.extend(word)
    
    word_freq = pd.Series(word_list).value_counts()

    word_freq = word_freq.drop(STOP_WORDS, errors = 'ignore')

    return word_freq

word_freq = word_freq_gen(tweets.cleaned_tweet.str)
rare_100 = word_freq[-100:]

In [13]:
word_freq

love               308
day                246
trump              213
happy              208
will               191
                  ... 
benidorm             1
si                   1
streetautopia        1
azmilkproducers      1
dipshit              1
Length: 11867, dtype: int64

In [14]:
rare_100

ofcourse           1
imnotold           1
barrelno           1
apples             1
heroku             1
                  ..
benidorm           1
si                 1
streetautopia      1
azmilkproducers    1
dipshit            1
Length: 100, dtype: int64

In [15]:
tweets['word_count'] = tweets.cleaned_tweet.str.split().apply(lambda x: len(x))
tweets.head()

Unnamed: 0,id,label,tweet,cleaned_tweet,word_count
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so self...,17
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...,17
2,3,0,bihday your majesty,bihday your majesty,3
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...,12
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4


In [17]:
# Negative words
def any_neg(words):
    for word in words:
        if word in ['n', 'no', 'non', 'not', 'cant', 'wont'] or re.search(r"\wn't", word):
            return 1
    else:
        return 0
tweets['any_neg'] = tweets.cleaned_tweet.str.split().apply(lambda x: any_neg(x))
tweets.head()

Unnamed: 0,id,label,tweet,cleaned_tweet,word_count,any_neg
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so self...,17,0
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...,17,1
2,3,0,bihday your majesty,bihday your majesty,3,0
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...,12,0
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0


In [19]:
# Rare Words
def any_rare(words):
    for word in words:
        if word in rare_100:
            return 1
    else:
        return 0

tweets['any_rare'] = tweets.cleaned_tweet.str.split().apply(lambda x: any_rare(x))
tweets.head()

Unnamed: 0,id,label,tweet,cleaned_tweet,word_count,any_neg,any_rare
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so self...,17,0,0
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...,17,1,0
2,3,0,bihday your majesty,bihday your majesty,3,0,0
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...,12,0,0
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0


In [20]:
# Check questions
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0

tweets['is_question'] = tweets.cleaned_tweet.str.split().apply(lambda x: is_question(x))
tweets.head()

Unnamed: 0,id,label,tweet,cleaned_tweet,word_count,any_neg,any_rare,is_question
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so self...,17,0,0,1
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...,17,1,0,0
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...,12,0,0,0
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0


In [22]:
# Character Count
tweets['char_count'] = tweets.cleaned_tweet.apply(lambda x: len(x))
tweets.head()

Unnamed: 0,id,label,tweet,cleaned_tweet,word_count,any_neg,any_rare,is_question,char_count
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so self...,17,0,0,1,94
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...,17,1,0,0,104
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,21
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in u...,12,0,0,0,58
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,37


In [23]:
# Train Test Split
from sklearn.model_selection import train_test_split

X = tweets[['word_count', 'any_neg', 'any_rare', 'is_question', 'char_count']]
y = tweets.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=27)

In [24]:
# Model Training
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model = model.fit(X_train, y_train)

pred = model.predict(X_test)

In [25]:
# Model Evaluation
from sklearn.metrics import accuracy_score
print('Accuracy Score: ', accuracy_score(y_test, pred)*100, '%')

Accuracy Score:  42.66666666666667 %
