In [1]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
import spacy


In [22]:
dataset = pd.read_excel('datasets/Dataset-Verse-by-Verse.xlsx')
dataset.rename(columns={'EnglishTranslation':'text'}, inplace=True)

In [4]:
'''
For Arabic Stop words , DONT RUN FOR ENGLISH
'''
with open('Datasets/list.txt' , encoding = 'utf-8') as f:
    STOPWORDS = f.read().splitlines()
    
# remove stopwords from text
dataset['text'] = dataset['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (STOPWORDS)]))

In [23]:
# remove punctuations
dataset['text'] = dataset['text'].str.replace('[^\w\s]','')
# remove numbers
dataset['text'] = dataset['text'].str.replace('\d+', '')
# Lower case everything
dataset['text'] = dataset['text'].str.lower()
# remove extra spaces
dataset['text'] = dataset['text'].str.strip()
# remove stop words using spacy
nlp = spacy.load('en_core_web_md')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
dataset['text'] = dataset['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [24]:
dataset['text']

0                         allah gracious merciful
1         praise allah cherisher sustainer worlds
2                               gracious merciful
3                             master day judgment
4                     thee worship thine aid seek
                          ...                    
6231                           king ruler mankind
6232                            god judge mankind
6233    mischief whisperer evil withdraws whisper
6234                      whispers hearts mankind
6235                                    jinns men
Name: text, Length: 6236, dtype: object

In [25]:
berto = BERTopic(language="english" , nr_topics= 30 , top_n_words=30)
berto.fit_transform(dataset.text)

([2,
  -1,
  2,
  8,
  -1,
  9,
  9,
  -1,
  9,
  0,
  -1,
  -1,
  19,
  -1,
  23,
  3,
  -1,
  -1,
  -1,
  -1,
  29,
  -1,
  -1,
  7,
  -1,
  7,
  7,
  -1,
  7,
  -1,
  19,
  1,
  19,
  -1,
  3,
  -1,
  22,
  -1,
  -1,
  -1,
  22,
  -1,
  -1,
  -1,
  -1,
  19,
  -1,
  19,
  3,
  0,
  -1,
  -1,
  14,
  -1,
  -1,
  13,
  13,
  10,
  2,
  10,
  -1,
  10,
  -1,
  7,
  -1,
  -1,
  10,
  10,
  25,
  -1,
  2,
  28,
  -1,
  10,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  5,
  10,
  -1,
  0,
  2,
  0,
  22,
  19,
  11,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  10,
  -1,
  0,
  9,
  -1,
  -1,
  -1,
  -1,
  15,
  4,
  11,
  -1,
  9,
  19,
  -1,
  -1,
  21,
  21,
  21,
  21,
  2,
  5,
  12,
  -1,
  21,
  21,
  -1,
  21,
  21,
  9,
  -1,
  3,
  21,
  -1,
  -1,
  5,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  5,
  19,
  -1,
  -1,
  -1,
  -1,
  9,
  11,
  11,
  2,
  19,
  6,
  2,
  -1,
  6,
  6,
  

In [26]:
berto.generate_topic_labels()

['-1_allah_ye_lord',
 '0_reward_charity_regular',
 '1_gardens_rivers_garden',
 '2_merciful_mercy_forgive',
 '3_falsehood_truth_allah',
 '4_created_moon_heavens',
 '5_messenger_message_sent',
 '6_penalty_grievous_allah',
 '7_mountains_sky_rain',
 '8_deny_favours_judgment',
 '9_path_astray_guidance',
 '10_moses_said_throw',
 '11_signs_sign_ancients',
 '12_deeds_righteousness_righteous',
 '13_pharaoh_chiefs_people',
 '14_ye_soon_shall',
 '15_earth_heavens_glory',
 '16_fathers_father_satan',
 '17_fear_peace_obey',
 '18_thou_thee_thy',
 '19_reject_faith_path',
 '20_punishment_taste_penalty',
 '21_abraham_jacob_isaac',
 '22_angels_iblis_angel',
 '23_day_account_judgment',
 '24_women_wives_divorce',
 '25_quran_arabic_tongue',
 '26_fire_blazing_hellfire',
 '27_cattle_eat_meat',
 '28_noah_ark_flood',
 '29_evil_ones_results']

In [28]:
berto.visualize_topics()

In [29]:
berto.visualize_documents(dataset['text'])