# KeyBERT

This notebook leverages KeyBERT to use the BERT embeddings to extract the keywords from the documents.

In [1]:
import pandas as pd
import re
import numpy as np
from keybert import KeyBERT

Read the dataset

In [2]:
data = pd.read_csv("dataset/data.csv", sep = ";")

In [3]:
text = data["text"].tolist()
labels = data["label"].tolist()

Preprocess the dataset

In [4]:
def preprocess(A, labels):
    B = []
    labels_new = []
    for i in range(len(A)):

        text = A[i]
        # remove old style retweet text "RT"
        text = re.sub(r'^RT[\s]+', '', text)

        # remove hyperlinks
        text= re.sub(r'https?:\/\/.*[\r\n]*', '', text)

        # remove hashtags
        # only removing the hash # sign from the word
        text = re.sub(r'#', '', text)

        # remove tagging @
        text = re.sub(r"(?:\@|https?\://)\S+", "", text)

        # remove new line \n
        text = re.sub("\n", "", text)
        
        # remove numbers
        text = text.replace("[0-9]", " ")
    
        # to lower case
        text = text.lower()

        # remove zero length tweets
        if(len(text) == 0 ):
            continue

        B.append(text)
        labels_new.append(labels[i])
    return B, labels_new

In [5]:
texts, labels = preprocess(text, labels)
len(texts)

54395

In [6]:
texts = np.array(texts)
labels = np.array(labels)

In [7]:
model = KeyBERT('distilbert-base-nli-mean-tokens')

In [8]:
def get_keywords(doc, model):
    keywords = model.extract_keywords(doc)[:3]
    return keywords

### Select the label
Set the label variable to the label you wish to extract keywords for. Possible options are show in the following cell:

In [9]:
# choose the label
np.unique(labels)

array(['abusive', 'benevolent', 'cyberbulling', 'hate', 'hateful',
       'identity', 'insult', 'obscene', 'offensive', 'profane', 'racism',
       'sexism', 'spam', 'threat', 'toxic'], dtype='<U12')

In [10]:
label = "spam"
idx = labels == label

In [11]:
texts_f = texts[idx]

In [12]:
keywords = []

For each label get the keywords

In [13]:
for i in range(len(texts_f)):
    keywords.extend(get_keywords(texts_f[i], model))

In [14]:
keywords = [x[0] for x in keywords]

Get most common keywords of this label and show counts

In [15]:
w, c = np.unique(keywords, return_counts = True)
w = w[np.argsort(c)][-6::]
c = c[np.argsort(c)][-6::]
print(f"Keywords: {w[::-1]}")
print(f"Counts: {c[::-1]}")

Keywords: ['video' 'new' '2017' 'liked' 'free' 'april']
Counts: [335 323 170 162 124 124]


In [None]:
"""
abusive: fucking, idiot, bitch, hate, fuck
benevolent: women, womensday, sassy, adaywithoutwomen, woman
cyberbullying: riot, troll, hacking, trolls, hacker
hate: trumpisatraitor, doctorsfightback, shameonicc, borisjohnsonshouldnotbepm, trump
hateful: hate, trump, idiot, nigga, fucking
identity: gay, fuck, nigger, bitch, fucking
insult: fuck, wikipedia, bitch, fucking, suck
obscene: fuck, wikipedia, bitch, fucking, suck
offensive: trumpisatraitor, fucktrump, trump, murderer, rapist
profane: fucktrump, fuck, dickhead, trump, douchebag
racism: coon, white, black, terror, fuck
sexism: sexist, women, feminazi, girls, kat
spam: video, new, 2017, liked, free
threat: kill, die, fuck, bitch, rape, death
toxic: fuck, wikipedia, bitch, fucking, suck

"""