In [1]:
import nltk
import numpy as np
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
# NLTK resources fetched by this notebook, in the order they are downloaded.
nltk_resources = (
    'stopwords',
    'punkt',
    'wordnet',
    'genesis',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'averaged_perceptron_tagger',
    'vader_lexicon',
)
download_results = [nltk.download(resource) for resource in nltk_resources]

# Echo the status returned by the final download as the cell's output.
download_results[-1]

[nltk_data] Downloading package stopwords to /home/ssuo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ssuo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ssuo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package genesis to /home/ssuo/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ssuo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/ssuo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ssuo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[n

True

In [3]:
import pandas as pd

# Load the annotated corpus. The first line of the file is the header, and
# lines the parser cannot read are skipped instead of raising.
path = "./kurrek.2020.slur-corpus.csv"
data = pd.read_csv(
    path,
    sep=",",
    header=0,
    on_bad_lines='skip',
)

In [4]:
# Check whether a string contains any element from a list of substrings.
# https://bobbyhadz.com/blog/python-check-if-string-contains-element-from-list
def is_element_in_string(string, flist):
    """Return the first element of ``flist`` that occurs in ``string``.

    A notice is printed when a match is found.

    Args:
        string: Text to search in.
        flist: Iterable of candidate substrings, checked in order.

    Returns:
        The first matching element, or ``None`` when no element matches
        (including when ``flist`` is empty).
    """
    # `next` with a default avoids two pitfalls of a walrus inside `any()`:
    # an unbound name for an empty list, and a stale last candidate being
    # returned when nothing matched.
    match = next((substring for substring in flist if substring in string), None)
    if match is not None:
        print('The string contains at least one element from the list')
    return match

In [5]:
# Build a display-only copy of the corpus in which each post's slur is replaced
# with [REDACTED], so rows can be printed without reproducing harmful language.

path = "./kurrek.2020.slur-corpus.csv"
redacted_data = pd.read_csv(path, sep=",", header = 0, on_bad_lines='skip')

# Rewrite the 'body' of every row. Row labels are left untouched, so an index
# in `redacted_data` refers to the same post as in the original document.
for index, row in redacted_data.iterrows():
    body = str(row['body']).lower()
    slur = row['slur']
    # The body has been lower-cased, so the slur must be lower-cased as well or
    # a slur containing any uppercase character would never be matched.
    # A missing slur is skipped: str(NaN) is "nan", which would otherwise be
    # redacted out of unrelated words, and an empty pattern would insert the
    # marker between every character.
    if pd.notna(slur) and str(slur):
        body = body.replace(str(slur).lower(), "[REDACTED]")
    redacted_data.at[index, 'body'] = body

## Task 1

In [None]:
"""By constructing a dataframe of posts assigned to the same category, suggest a script that outputs the
vocabulary set of each category, the size of the vocabulary, the total number of tokens, the average
number of tokens per post and its standard deviation, the average number of pronouns per post and the
associated standard deviation, the ten most frequent tokens in each category, excluding the stopword list.
Represent the statistical result in a clear table and discuss whether some parameters are most relevant to
discriminate a given category."""

In [87]:
# The categories in the corpus (code, name) and the number of posts in each
"""
DEG 	Derogatory 	20531
NDG 	Non Derogatory Non Appropriative 	16729
HOM 	Homonym 	1998
APR 	Appropriative 	553
CMP 	Noise 	189
"""
# Build one dictionary per category, from a shared template, to hold its statistics
def dict_template():
    """Return a fresh, zeroed statistics record for a single category.

    Each call builds new containers, so two records never share a list or a
    nested dictionary.
    """
    def _empty_average():
        # Placeholder pair for an average and its standard deviation.
        return {"num_tokens": 0, "standard_dev": 0}

    return {
        "vocab_set": [],
        "vocab_size": 0,
        "total_tokens": 0,
        "avg_tokens": _empty_average(),
        "avg_pronouns": _empty_average(),
        "freq_words": [],
    }

# One independent statistics record per category.
deg_dict, ndg_dict, hom_dict, apr_dict, cmp_dict = (dict_template() for _ in range(5))

# Both lookups share the integer keys 0-4: categ_dict maps a key to the
# category's record, categ_string_list maps the same key to its label.
# Despite its name, categ_string_list is a dict, not a list.
categ_dict = dict(enumerate((deg_dict, ndg_dict, hom_dict, apr_dict, cmp_dict)))
categ_string_list = dict(enumerate(("DEG", "NDG", "HOM", "APR", "CMP")))

In [98]:
# Helpers for finding the vocabulary set of each category.
# References:
#   flattening a list of lists into one list (for the vocab set):
#   https://stackoverflow.com/questions/952914/how-do-i-make-a-flat-list-out-of-a-list-of-lists
#   counting real words with NLTK:
#   https://stackoverflow.com/questions/10677020/real-word-count-in-nltk
import itertools
from nltk.tokenize import RegexpTokenizer
# Tokenizer built from the pattern \w+ (runs of word characters).
# NOTE(review): `tokenizer` is not referenced by the other cells shown here;
# find_vocab_set calls word_tokenize instead. Confirm which tokenizer the
# token counts are meant to use.
tokenizer = RegexpTokenizer(r'\w+')

def find_vocab_set(categ):
    """Compute vocabulary and token statistics for one category of posts.

    Tokenizes the 'body' of every row in `data` whose 'gold_label' equals
    `categ_string_list[categ]` and stores the results in `categ_dict[categ]`:
    "vocab_set" (unique tokens, sorted), "vocab_size", "total_tokens" and
    "avg_tokens"["num_tokens"] (mean number of tokens per post).

    Args:
        categ: Key shared by `categ_dict` and `categ_string_list`.

    Returns:
        None; `categ_dict[categ]` is updated in place.
    """
    label = categ_string_list[categ]
    stats = categ_dict[categ]

    # Select this category's posts once instead of testing every row of the
    # frame inside a Python loop. Labels are compared as strings, so a missing
    # label ("nan") never matches a category.
    in_category = data['gold_label'].astype(str) == label
    bodies = data.loc[in_category, 'body']

    # Start from whatever the record already holds so a repeated call keeps
    # earlier tokens, then add the tokens of each post.
    vocab = set(stats["vocab_set"])
    total_tokens = 0
    for body in bodies:
        # Tokenize each post exactly once and reuse the result for both the
        # vocabulary and the token count.
        tokens = word_tokenize(str(body))
        vocab.update(tokens)
        total_tokens += len(tokens)
    total_bodies = len(bodies)

    # Sorted so the vocabulary order is reproducible between runs.
    stats["vocab_set"] = sorted(vocab)
    stats["vocab_size"] = len(vocab)
    stats["total_tokens"] = total_tokens
    # A category with no posts has no meaningful average; report 0.0 rather
    # than dividing by zero.
    stats["avg_tokens"]["num_tokens"] = total_tokens / total_bodies if total_bodies else 0.0
        
# Fill in the statistics for every category, in key order (0-4).
for categ in categ_dict:
    find_vocab_set(categ)

In [101]:
# Print the vocabulary size, total token count and mean tokens per post for
# each category. Single quotes are used inside the replacement fields because
# reusing the enclosing double quote there is a syntax error before Python 3.12.
for key, value in categ_dict.items():
    print(f"Info regarding the {categ_string_list[key]} category:")
    print(f"Vocab size:     {value['vocab_size']}")
    print(f"Total size:     {value['total_tokens']}")
    print(f"Average tokens: {value['avg_tokens']['num_tokens']}\n")

Info regarding the DEG category:
Vocab size:     36315
Total size:     659069
Average tokens: 32.10272771553824

Info regarding the NDG category:
Vocab size:     31418
Total size:     804166
Average tokens: 48.07592515095355

Info regarding the HOM category:
Vocab size:     10555
Total size:     112531
Average tokens: 56.32182182182182

Info regarding the APR category:
Vocab size:     3906
Total size:     23333
Average tokens: 42.19349005424955

Info regarding the CMP category:
Vocab size:     2052
Total size:     5604
Average tokens: 29.650793650793652

