# Setting up the notebook with imports



In [6]:
# Setup
!pip install -q wordcloud

import nltk
from nltk.corpus import stopwords
# Fetch every NLTK resource this notebook relies on, in one pass
for nltk_resource in (
    'stopwords',                   # English stop-word list
    'wordnet',                     # data behind the WordNet lemmatizer
    'punkt',                       # models for sent_tokenize / word_tokenize
    'omw-1.4',                     # presumably needed alongside wordnet -- TODO confirm
    'averaged_perceptron_tagger',  # model used by nltk.pos_tag
):
    nltk.download(nltk_resource)

import pandas as pd
import matplotlib.pyplot as plt
import io
import unicodedata
import numpy as np
import re
import string
from wordcloud import WordCloud
import plotly.express as px

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [7]:
# Constants
# Two-letter POS tag prefix -> POS letter handed to the lemmatizer
# (nouns, adjectives, verbs and adverbs)
DI_POS_TYPES = dict(NN='n', JJ='a', VB='v', RB='r')
POS_TYPES = [*DI_POS_TYPES]

# Constraints on tokens: minimum length, and a pattern the token must match somewhere
MIN_STR_LEN = 3
RE_VALID = '[a-zA-Z]'

Load the corpus from the UC Berkeley dataset repo

In [8]:
# Get our UC Berkeley dataset
!pip install datasets
import datasets
# Load the 'binary' configuration of the measuring-hate-speech dataset and turn
# its 'train' split into a pandas DataFrame
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')
df = dataset['train'].to_pandas()
  
# Display
print("df:")
print(df.head().to_string())
print(df.describe())

# Convert quotes to list
# (the 'text' column is the raw input that the tokenization cell iterates over)
li_quotes = df['text'].tolist()
print()
print("len(li_quotes):", len(li_quotes))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




  0%|          | 0/1 [00:00<?, ?it/s]

df:
   comment_id  annotator_id  platform  sentiment  respect  insult  humiliate  status  dehumanize  violence  genocide  attack_defend  hatespeech  hate_speech_score                                                                                                                                                                                                                                                            text  infitms  outfitms  annotator_severity  std_err  annotator_infitms  annotator_outfitms  hypothesis  target_race_asian  target_race_black  target_race_latinx  target_race_middle_eastern  target_race_native_american  target_race_pacific_islander  target_race_white  target_race_other  target_race  target_religion_atheist  target_religion_buddhist  target_religion_christian  target_religion_hindu  target_religion_jewish  target_religion_mormon  target_religion_muslim  target_religion_other  target_religion  target_origin_immigrant  target_origin_migrant_worker  target_origin

In [9]:
# Peek at ten random rows of the raw frame
# NOTE(review): no random_state is passed, so a different sample is shown on every run
df.sample(10)

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
57843,7914,578,0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
40052,44778,10307,3,3.0,3.0,3.0,3.0,2.0,3.0,0.0,...,False,False,False,False,False,False,False,False,True,False
39601,39581,4394,2,4.0,4.0,3.0,3.0,3.0,3.0,2.0,...,False,False,False,False,True,False,False,True,False,False
107938,20020,8495,1,4.0,4.0,4.0,3.0,4.0,4.0,2.0,...,False,False,False,False,False,False,False,False,True,False
100466,20004,7265,1,4.0,4.0,4.0,4.0,3.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False
99064,20001,2102,1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,True,False,False,False
34317,9402,943,0,3.0,3.0,3.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
88850,18417,7818,0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,...,False,False,False,False,True,False,False,False,True,False
104556,20007,5852,1,4.0,4.0,4.0,4.0,4.0,4.0,3.0,...,False,False,False,False,True,False,False,False,True,False
111346,20016,9346,1,4.0,4.0,4.0,3.0,3.0,0.0,3.0,...,False,False,False,False,False,False,False,False,True,False


In [10]:
# names = list(data.keys())

# values = list(data.values())

# plt.rcParams["figure.figsize"] = (8,4)

# plt.rc('xtick', labelsize=20) 
# plt.rc('ytick', labelsize=20) 
# plt.rcParams.update({'font.size': 20})

# # Set Outer Color
   
# plt.figure(facecolor='white') 

# # Plot the graph

# plt.bar(range(len(data)), values, tick_label=names, color = "white")

# plt.xlabel("Label", fontweight='bold')

# plt.ylabel("Percentage", fontweight='bold') 

# ax = plt.axes() 

# # Set Inner Color

# ax.set_facecolor('black') 

# # Display the graph

# plt.show()


#### In our corpus, a label of 1 marks text that is considered hateful and a label of 0 marks non-hate speech

## Tokenize sentences and words, remove stopwords, use stemmer & lemmatizer

First, a note on the difference between stemming and lemmatization:

* Stemming: Trying to shorten a word with simple regex rules

* Lemmatization: Trying to find the root word with linguistics rules (with the use of regex rules)

In [None]:
# Get stopwords, stemmer and lemmatizer
# NOTE(review): this rebinds the name `stopwords`, which the setup cell imported
# from nltk.corpus, to the English stop-word collection itself
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Remove accents function
def remove_accents(data):
    """Return `data` reduced to ASCII letters and spaces.

    The text is NFKD-normalized first, so an accented letter is split into its
    base letter plus combining marks; only the base letter survives the filter.
    Digits, punctuation and every other character are dropped as well.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    kept_chars = [ch for ch in decomposed if ch == " " or ch in string.ascii_letters]
    return ''.join(kept_chars)

# Process all quotes
li_tokens = []        # (token, stem, lemma, pos) for every token that gets lemmatized
li_token_lists = []   # per quote: the cleaned tokens (stop words still included)
li_lem_strings = []   # per quote: space-joined lemmas, with "-" for tokens without a lemma

# Set lookup instead of scanning the stop-word list once per token
stopword_set = set(stopwords)

# str.translate() needs a table built by str.maketrans(); handing it
# string.punctuation directly does not delete any punctuation
punctuation_table = str.maketrans('', '', string.punctuation)

# Every token is tagged, stemmed and lemmatized on its own, so the outcome depends
# only on the token text; remember it instead of recomputing it per occurrence
token_analysis_cache = {}


def _analyze_token(t):
    """Return (token, stem, lemma, pos) for `t`, or None if its POS is not in POS_TYPES."""
    # The POS (Part Of Speech) is needed as input to the lemmatizer
    # (otherwise it assumes the word is a noun); only the two-letter prefix is kept
    pos = nltk.pos_tag([t])[0][1][:2]
    if pos not in POS_TYPES:
        return None
    lem = lemmatizer.lemmatize(t, pos=DI_POS_TYPES.get(pos, 'n'))  # default to noun
    return (t, stemmer.stem(t), lem, pos)


for text in li_quotes:
    # Tokenize by sentence, then by lowercase word
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = [token for token in tokens if token != 'url']  # find that cleaning did not account for url token

    # Process all tokens per quote
    li_tokens_quote = []
    li_tokens_quote_lem = []
    for token in tokens:
        # Remove accents, then punctuation
        t = remove_accents(token).translate(punctuation_table)
        li_tokens_quote.append(t)

        # Only non-stopword tokens that match RE_VALID and are long enough are analysed
        analysis = None
        if t not in stopword_set and re.search(RE_VALID, t) and len(t) >= MIN_STR_LEN:
            if t not in token_analysis_cache:
                token_analysis_cache[t] = _analyze_token(t)
            analysis = token_analysis_cache[t]

        if analysis is None:
            # Placeholder that represents "no lemmatization match"
            li_tokens_quote_lem.append("-")
        else:
            li_tokens.append(analysis)
            li_tokens_quote_lem.append(analysis[2])

    # Build list of token lists from the cleaned (not lemmatized) tokens
    li_token_lists.append(li_tokens_quote)

    # Build list of strings from lemmatized tokens
    str_li_tokens_quote_lem = ' '.join(li_tokens_quote_lem)
    li_lem_strings.append(str_li_tokens_quote_lem)

# Build resulting dataframes from lists
df_token_lists = pd.DataFrame(li_token_lists)

print("df_token_lists.head(5):")
print(df_token_lists.head(5).to_string())

# Replace None with empty string. Every column holds token strings, so the whole
# frame is filled at once; calling fillna(inplace=True) on a selected column is a
# chained assignment and is not guaranteed to modify the frame.
df_token_lists = df_token_lists.fillna('')

df_lem_strings = pd.DataFrame(li_lem_strings, columns=['lem quote'])

print()
print("")
print("df_lem_strings.head():")
print(df_lem_strings.head().to_string())

In [None]:
# Strip the "-" placeholders out of each lemma string (the separating spaces stay)
df_lem_strings['text_from_lemma'] = df_lem_strings['lem quote'].str.replace("-", '', regex=False)

In [None]:
# Binarize the label: any value above 0 becomes 1, everything else becomes 0
df['hatespeech'] = (df['hatespeech'] > 0).astype('int64')

In [None]:
# Relative frequency of each 'hatespeech' label (normalize=True)
df['hatespeech'].value_counts(normalize = True)

In [None]:
# NOTE: from here on `df` no longer holds the raw dataset; it is rebound to the
# lemma strings joined (on the row index) with the 'hatespeech' label column
df = df_lem_strings.join(df['hatespeech'])

In [None]:
# Peek at ten random rows of the lemma/label frame
# NOTE(review): no random_state is passed, so a different sample is shown on every run
df.sample(10)

In [None]:
#df.to_csv("binary_hatespeech_cleaned.csv")

## Process results, find the most popular lemmatized words and group results by Part of Speech (POS)

In [None]:
# One row per kept token occurrence, annotated with how often its lemma occurs,
# ordered by descending count and then alphabetically by lemma
print("Group by lemmatized words, add count and sort:")
df_all_words = (
    pd.DataFrame(li_tokens, columns=['token', 'stem', 'lem', 'pos'])
    .assign(counts=lambda frame: frame.groupby(['lem'])['lem'].transform('count'))
    .sort_values(by=['counts', 'lem'], ascending=[False, True])
    .reset_index()
)

# Collapse to a single representative row per lemma, most frequent lemma first
print("Get just the first row in each lemmatized group")
df_words = (
    df_all_words
    .groupby('lem')
    .first()
    .sort_values(by='counts', ascending=False)
    .reset_index()
)
print("df_words.head(10):")
print(df_words.head(10))

## Top 10 words per Part Of Speech (POS)

In [None]:
df_words_trunc = df_words[['lem', 'pos', 'counts']].head(200)

# Split the per-lemma table by part of speech and show the head of each slice;
# the slices are kept in pos_dict for the plotting cell
pos_dict = {}
for pos_type in POS_TYPES:
    df_pos = df_words[df_words['pos'] == pos_type]
    pos_dict[pos_type] = df_pos
    print()
    print("POS_TYPE:", pos_type)
    print(df_pos.head(10).to_string())


## Creating frequency graphs for the top 20 words for each part of speech in our corpus

In [None]:
pos_keys = pos_dict.keys()
# Human-readable name for each POS tag prefix, used in the chart titles
taggers = {'JJ': "Adjective", "RB": "Adverb", "CC": "Conjunction", "NN": "Noun", "VB": "Verb"}

# One horizontal bar chart per part of speech, built from its first 20 rows
for key in pos_keys:
    top_rows = pos_dict[key].head(20)
    fig = px.bar(
        top_rows,
        x='counts',
        y='lem',
        orientation='h',
        color='token',
        hover_data=['pos', 'token'],
    )
    fig.update_layout(
        yaxis={'categoryorder':'total ascending'},
        title=f"Top 20 Most Frequent {taggers[key]}s in our Corpus",
    )
    fig.show()

## Sorted frequency plot for all words

In [None]:
# Flatten the list of per-quote token lists into a single list of tokens
li_token_lists_flat = [y for x in li_token_lists for y in x]
print("li_token_lists_flat[:10]:", li_token_lists_flat[:10])

di_freq = nltk.FreqDist(li_token_lists_flat)
# Drop the empty-string entry (tokens that were reduced to '' during cleaning)
del di_freq['']
li_freq_sorted = sorted(di_freq.items(), key=lambda x: x[1], reverse=True)  # sorted list
# Show only the head: the full list has one entry per distinct token and would
# flood the cell output
print(li_freq_sorted[:30])

di_freq.plot(30, cumulative=False)

In [None]:
# Split the ten most frequent (word, count) pairs into two parallel lists and
# wrap them in a dictionary for plotting
top_pairs = li_freq_sorted[:10]
words = [word for word, _count in top_pairs]
counts = [count for _word, count in top_pairs]

frequency_dict = {'Words in our Corpus': words, 'Total Count': counts}

In [None]:
# Horizontal bar chart of the ten most frequent raw tokens, one colour per word
fig = px.bar(
    frequency_dict,
    x='Total Count',
    y='Words in our Corpus',
    color='Words in our Corpus',
    orientation='h',
)
fig.update_layout(title="Top 10 Most Frequent Words Before Removing Stop Words")
fig.show()

## Sorted frequency plot for lemmatized words after removing stopwords

In [None]:
# Frequency distribution over the lemmas of all kept tokens
li_lem_words = df_all_words['lem'].tolist()
di_freq2 = nltk.FreqDist(li_lem_words)
li_freq_sorted2 = sorted(di_freq2.items(), key=lambda x: x[1], reverse=True)  # sorted list
# Show only the head: the full list has one entry per distinct lemma and would
# flood the cell output
print(li_freq_sorted2[:30])

di_freq2.plot(30, cumulative=False)

In [None]:
# Split the ten most frequent (lemma, count) pairs into two parallel lists and
# wrap them in a dictionary for plotting
top_pairs2 = li_freq_sorted2[:10]
words = [word for word, _count in top_pairs2]
counts = [count for _word, count in top_pairs2]

frequency_dict2 = {'Words in our Corpus': words, 'Total Count': counts}

In [None]:
# Horizontal bar chart of the ten most frequent lemmas, one colour per word
fig = px.bar(
    frequency_dict2,
    x='Total Count',
    y='Words in our Corpus',
    color='Words in our Corpus',
    orientation='h',
)
fig.update_layout(title="Top 10 Most Frequent Words After Removing Stop Words")
fig.show()

## Creating word clouds for our corpus


In [None]:
# Colour callback for WordCloud.recolor: every word gets the same near-black colour
def black_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
    """Return one fixed HSL colour string, whatever word and layout arguments are passed."""
    near_black = "hsl(0,100%, 1%)"
    return near_black

# Word cloud built from di_freq, the frequency distribution of the cleaned tokens
# set the wordcloud background color to white
# set max_words to 500
# set width and height to higher quality, 3000 x 2000
wordcloud = WordCloud(background_color="white", width=3000, height=2000, 
                      max_words=500).generate_from_frequencies(di_freq)

# set the word color to black
wordcloud.recolor(color_func = black_color_func)
# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('stop_words.png')

In [None]:
# Word cloud built from di_freq2, the frequency distribution of the lemmas
# white background, 3000 x 2000 canvas, at most 500 words
wordcloud = WordCloud(background_color="white", width=3000, height=2000, 
                      max_words=500).generate_from_frequencies(di_freq2)

# set the word color to black (black_color_func comes from the previous cell)
wordcloud.recolor(color_func = black_color_func)
# set the figsize
plt.figure(figsize=[15,10])
# plot the wordcloud
plt.imshow(wordcloud, interpolation="bilinear")
# remove plot axes
plt.axis("off")
# save the image
plt.savefig('key_words.png')

In [None]:
# `df` was rebound to the lemma/label frame earlier, so the raw frame is rebuilt
# from `dataset` here in order to get the original 'text' column back
original_df = dataset['train'].to_pandas()

# Put the original text next to its processed lemmas and label (joined on the row index)
df_cleaned = df.join(original_df['text'])

df_cleaned.shape

In [None]:
#df_cleaned.to_csv("hate_speech_and_processed_lemmas.csv")