<a href="https://colab.research.google.com/github/michaelmml/NLP-Information-Extraction/blob/main/BERTopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!pip install sentence_transformers

In [None]:
!pip install newspaper3k

In [None]:
!pip install BERTopic

In [6]:
from sentence_transformers import SentenceTransformer, util
from googlesearch import search
from newspaper import Article
from bertopic import BERTopic
from nltk import tokenize
from tqdm import tqdm
import nltk
import re

In [7]:
# Fetch the NLTK data used by this notebook. 'punkt' is the sentence
# tokenizer model behind tokenize.sent_tokenize (called in get_sentences).
# 'stopwords' is downloaded as well, although no cell visible in this
# notebook references it.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
def scrape_urls(query, num):
    """Collect search-result URLs for a query.

    Args:
        query: Search string handed to ``search``.
        num: Forwarded to ``search`` as its ``stop`` argument.

    Returns:
        list: Every item yielded by ``search``, in the order received.
    """
    return [result for result in search(query, stop=num)]

In [9]:
# Ask scrape_urls for search results on the query 'russia' (num=50 is passed
# through as the search's `stop` value); the trailing expression displays how
# many URLs came back.
urls = scrape_urls(query='russia', num=50)
len(urls)

50

In [10]:
def get_sentences(url):
    """Download an article and return its cleaned, de-duplicated sentences.

    The article text is split into sentences with ``tokenize.sent_tokenize``;
    each sentence then has digits and punctuation stripped and embedded
    newlines collapsed. Sentences of 20 characters or fewer (after cleaning)
    are discarded.

    Args:
        url: Address of the article to download and parse.

    Returns:
        list[str]: Unique cleaned sentences in order of first appearance.
        An empty list is returned when the article cannot be downloaded or
        parsed, so callers always receive a list.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception as exc:
        # Scraping is best-effort: one unreachable or malformed page must not
        # abort a whole batch, but the skip is reported rather than hidden.
        print(f'Skipping {url}: {exc}')
        return []

    cleaned = []
    for sentence in tokenize.sent_tokenize(article.text):
        sentence = re.sub(r'\d+', '', sentence)       # drop digits
        sentence = re.sub(r'[^\w\s]', '', sentence)   # drop punctuation
        sentence = re.sub('\n ', '', sentence)        # newline + space: remove
        sentence = re.sub('\n', ' ', sentence)        # remaining newlines -> space
        if len(sentence) > 20:
            cleaned.append(sentence)

    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set would shuffle sentences between runs).
    return list(dict.fromkeys(cleaned))

In [None]:
# Try the cleaner on the first URL only and display the resulting sentences.
sentences = get_sentences(urls[0])
sentences

In [12]:
def remove_similar_sentences(sentences, model, similarity_threshold):
    """Return ``sentences`` with near-duplicates removed.

    Every sentence is embedded with ``model`` and compared against all later
    sentences by cosine similarity. A sentence is dropped when any sentence
    after it scores strictly above ``similarity_threshold``, so the last
    member of a group of similar sentences is the one that survives.

    Args:
        sentences: List of sentence strings. Not modified.
        model: Object exposing ``encode(sentences, show_progress_bar=...)``.
        similarity_threshold: Cosine-similarity value above which two
            sentences are treated as duplicates.

    Returns:
        list: A new list with the surviving sentences in their original
        order. Empty (or ``None``) input yields an empty list.
    """
    kept = list(sentences or [])
    if len(kept) < 2:
        # Nothing to compare against; skip the encoder entirely.
        return kept

    embeddings = model.encode(kept, show_progress_bar=False)
    # Pairwise cosine similarity of every sentence with every other sentence.
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

    total = len(kept)
    drop_indices = set()
    for i in range(total - 1):
        for j in range(i + 1, total):
            if cosine_scores[i][j] > similarity_threshold:
                # One later match is enough to mark the earlier sentence.
                drop_indices.add(i)
                break

    # Filter in a single pass rather than deleting by index in place:
    # each deletion would shift the positions of the remaining sentences
    # and cause the wrong ones to be removed.
    return [s for idx, s in enumerate(kept) if idx not in drop_indices]

In [None]:
# Load the sentence-embedding model. Note that get_documents reads this
# notebook-level `model` variable as a global.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Cosine-similarity cut-off for this demo call only; get_documents uses .65.
threshold = 0.7
new_sentences = remove_similar_sentences(sentences, model, threshold)

# Display the sentence list and its length before and after filtering.
sentences, len(sentences), new_sentences, len(new_sentences)

In [15]:
def get_documents(urls, embedding_model=None, similarity_threshold=.65):
    """Build one flat list of cleaned, de-duplicated sentences from many URLs.

    Each URL is processed with ``get_sentences`` and the result is filtered
    with ``remove_similar_sentences``; the surviving sentences of all pages
    are concatenated in URL order.

    Args:
        urls: Iterable of article URLs.
        embedding_model: Sentence encoder passed to
            ``remove_similar_sentences``. Defaults to the notebook-level
            ``model`` variable, looked up at call time. Pass the encoder
            explicitly if ``model`` has since been rebound to something else.
        similarity_threshold: Cosine-similarity cut-off forwarded to
            ``remove_similar_sentences``.

    Returns:
        list: Sentences from all URLs, flattened into a single list.
    """
    if embedding_model is None:
        # Backward-compatible default: fall back to the global encoder.
        embedding_model = model

    documents = []
    for url in tqdm(urls):
        sentences = get_sentences(url)
        if not sentences:
            # Page failed to download or had no usable text; skip it so the
            # encoder is never called on empty input.
            continue
        documents.extend(
            remove_similar_sentences(sentences, embedding_model, similarity_threshold)
        )
    return documents

In [16]:
# Build the corpus: cleaned, near-duplicate-filtered sentences from every
# collected URL, flattened into one list.
sent_list = get_documents(urls)
print('total number of sentences:', len(sent_list))

100%|██████████| 50/50 [02:22<00:00,  2.85s/it]

total number of sentences: 1678





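`n_gram_range` sets the minimum and maximum number of words (the n-gram size) allowed in each term of a topic's representation; `(1, 3)` permits anything from single words to three-word phrases.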

`min_topic_size` is the minimum number of sentences a cluster must contain to form a topic; larger values produce fewer, broader topics.

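`nr_topics` is the number of topics to keep after topic reduction; `'auto'` lets BERTopic merge similar topics automatically instead of targeting a fixed count.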

In [17]:
# Configure the topic model.
# NOTE: this rebinds `model`, which until now held the SentenceTransformer
# that get_documents reads as a global. Re-running get_documents after this
# cell (without restarting the kernel) would hand it the BERTopic instance
# instead of the sentence encoder.
model = BERTopic(n_gram_range=(1,3), 
                 min_topic_size=10,
                 nr_topics='auto')

In [None]:
# Fit the topic model on the sentence corpus and keep the two values it
# returns (named here as topic assignments and their probabilities).
topics, probabilities = model.fit_transform(sent_list)

In [None]:
# Display the topic frequency table produced by the fitted model.
model.get_topic_freq()

In [None]:
# Display all topics found by the fitted model.
model.get_topics()

In [22]:
# Render BERTopic's built-in topic visualization.
model.visualize_topics()

In [23]:
# Render a bar chart limited to the top 21 topics with 3 words shown per topic.
model.visualize_barchart(top_n_topics=21, n_words=3)