In [1]:
import pandas as pd
import re
import numpy as np
from keybert import KeyBERT

Read the dataset

In [26]:
data = pd.read_csv("dataset/data.csv", sep = ";")

In [27]:
text = data["text"].tolist()
labels = data["label"].tolist()

Preprocess the dataset

In [28]:
def preprocess(A, labels):
    B = []
    labels_new = []
    for i in range(len(A)):

        text = A[i]
        # remove old style retweet text "RT"
        text = re.sub(r'^RT[\s]+', '', text)

        # remove hyperlinks
        text= re.sub(r'https?:\/\/.*[\r\n]*', '', text)

        # remove hashtags
        # only removing the hash # sign from the word
        text = re.sub(r'#', '', text)

        # remove tagging @
        text = re.sub(r"(?:\@|https?\://)\S+", "", text)

        # remove new line \n
        text = re.sub("\n", "", text)
        
        # remove numbers
        text = text.replace("[0-9]", " ")
    
        # to lower case
        text = text.lower()

        # remove zero length tweets
        if(len(text) == 0 ):
            continue

        B.append(text)
        labels_new.append(labels[i])
    return B, labels_new

In [29]:
texts, labels = preprocess(text, labels)
len(texts)

57581

In [30]:
texts = np.array(texts)
labels = np.array(labels)

In [9]:
model = KeyBERT('distilbert-base-nli-mean-tokens')

In [10]:
def get_keywords(doc, model):
    keywords = model.extract_keywords(doc)[:3]
    return keywords

In [7]:
# choose the label
np.unique(labels)

array(['abusive', 'benevolent', 'cyberbulling', 'hate', 'hateful',
       'identity', 'insult', 'obscene', 'offensive', 'profane', 'racism',
       'sexism', 'spam', 'threat', 'toxic'], dtype='<U12')

In [120]:
label = "toxic"
idx = labels == label

In [121]:
texts_f = texts[idx]

In [122]:
keywords = []

In [123]:
for i in range(len(texts_f)):
    keywords.extend(get_keywords(texts_f[i], model))

In [124]:
keywords = [x[0] for x in keywords]

In [125]:
w, c = np.unique(keywords, return_counts = True)
w = w[np.argsort(c)][-6::]
c = c[np.argsort(c)][-6::]
print(f"Keywords: {w[::-1]}")
print(f"Counts: {c[::-1]}")

Keywords: ['fuck' 'wikipedia' 'bitch' 'fucking' 'suck' 'gay']
Counts: [2108 1605  842  808  562  523]


In [None]:
"""
abusive: fucking, idiot, bitch, hate, fuck
benevolent: women, womensday, sassy, adaywithoutwomen, woman
cyberbullying: riot, troll, hacking, trolls, hacker
hate: trumpisatraitor, doctorsfightback, shameonicc, borisjohnsonshouldnotbepm, trump
hateful: hate, trump, idiot, nigga, fucking
identity: gay, fuck, nigger, bitch, fucking
insult: fuck, wikipedia, bitch, fucking, suck
obscene: fuck, wikipedia, bitch, fucking, suck
offensive: trumpisatraitor, fucktrump, trump, murderer, rapist
profane: fucktrump, fuck, dickhead, trump, douchebag
racism: coon, white, black, terror, fuck
sexism: sexist, women, feminazi, girls, kat
spam: video, new, 2017, liked, free
threat: kill, die, fuck, bitch, rape, death
toxic: fuck, wikipedia, bitch, fucking, suck

"""

Get possible keyword candidates

In [16]:
"""
from sklearn.feature_extraction.text import CountVectorizer

n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit(texts[idx].tolist())
candidates = count.get_feature_names()
"""

'\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nn_gram_range = (1, 1)\nstop_words = "english"\n\n# Extract candidate words/phrases\ncount = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit(texts[idx].tolist())\ncandidates = count.get_feature_names()\n'

In [17]:
#len(candidates)

Load the model

In [18]:
"""
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

#model = SentenceTransformer('distilbert-base-nli-mean-tokens', "cuda")

model.max_seq_length = 512
"""

'\nfrom sentence_transformers import SentenceTransformer\nmodel = SentenceTransformer(\'bert-base-nli-mean-tokens\')\n\n#model = SentenceTransformer(\'distilbert-base-nli-mean-tokens\', "cuda")\n\nmodel.max_seq_length = 512\n'

In [19]:
"""
doc_embedding = model.encode(texts, show_progress_bar = True)
candidate_embeddings = model.encode(candidates)
"""

'\ndoc_embedding = model.encode(texts, show_progress_bar = True)\ncandidate_embeddings = model.encode(candidates)\n'

In [20]:
#candidate_embeddings.shape

In [21]:

#def get_keywords(doc_embedding, candidate_embeddings, top_n = 3):
#    distances = cosine_similarity(test, candidate_embeddings)
#    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
#    return keywords

Compute the similarity between the embeddings of tweets and keyword candidates

In [22]:
"""
from sklearn.metrics.pairwise import cosine_similarity

keyword_matrix = cosine_similarity(doc_embedding, candidate_embeddings)
"""

'\nfrom sklearn.metrics.pairwise import cosine_similarity\n\nkeyword_matrix = cosine_similarity(doc_embedding, candidate_embeddings)\n'

Extract keywords for each tweet

In [23]:
#def get_keywords(i, keyword_matrix, candidates, top_n = 3):
#    keywords = [candidates[index] for index in keyword_matrix[i,:].argsort()[-top_n:]]
#    return keywords

In [24]:
"""
keywords = {}
for i in range(doc_embedding.shape[0]):
    x = keywords.get(labels[i])
    if x is None:
        x = []
        
    # extract keywords
    keys = get_keywords(i, keyword_matrix, candidates)
    x.extend(keys)
    
    keywords[labels[i]] = x
"""

'\nkeywords = {}\nfor i in range(doc_embedding.shape[0]):\n    x = keywords.get(labels[i])\n    if x is None:\n        x = []\n        \n    # extract keywords\n    keys = get_keywords(i, keyword_matrix, candidates)\n    x.extend(keys)\n    \n    keywords[labels[i]] = x\n'

Get most common keywords for each label

In [25]:
"""
import numpy as np

remove = "yamla_likes_to_fuck_babies_up_the_ass_wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww"
top_n = 10
for k,v in keywords.items():
    words, counts = np.unique(v, return_counts = True)
    idx = np.argsort(counts)
    words = words[idx]
    counts = counts[idx]
    keys = words[-top_n:]
    if keys[-1] == remove:
        keys = np.delete(keys, -1)
    print(f"{k}: {keys}")
"""

'\nimport numpy as np\n\nremove = "yamla_likes_to_fuck_babies_up_the_ass_wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww"\ntop_n = 10\nfor k,v in keywords.items():\n    words, counts = np.unique(v, return_counts = True)\n    idx = np.argsort(counts)\n    words = words[idx]\n    counts = counts[idx]\n    keys = words[-top_n:]\n    if keys[-1] == remove:\n        keys = np.delete(keys, -1)\n    print(f"{k}: {keys}")\n'