In [1]:
import re
import time

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer,util
import torch

# Local application imports
from metadata.document_metadata import *
from metadata.paragraph_metadata import *
from metadata.nlp_metadata import *
from config import config
from sdgs.sustainable_development_goals import *
from clean.clean_text import normalize_unicode

Using GPU: True


In [29]:
# Topic model used by the rest of the notebook; the embedding model is
# selected by name.
model = BERTopic(embedding_model='all-MiniLM-L6-v2')

# Module-level accumulator of (document, paragraph) tuples; it is filled by
# process_document_collections().
paragraph_corpus = []

def process_document(document_collection_name, document):
    """Return every (document, paragraph) tuple of one document as a list.

    The tuples are produced by build_paragraph_corpus(); this wrapper only
    materialises that generator.
    """
    pairs = build_paragraph_corpus(document_collection_name, document)
    return [pair for pair in pairs]


def build_paragraph_corpus(document_collection_name, document):
    """Lazily yield a (document, paragraph) tuple for each stored paragraph.

    The paragraph metadata file is located through ``config`` from the
    collection name and the document's ``local_filename``, then loaded with
    load_paragraph_metadata().  Nothing is read until iteration starts.
    """
    metadata_path = config.get_paragraph_metadata_filename(
        document_collection_name,
        document.local_filename,
    )
    yield from (
        (document, entry) for entry in load_paragraph_metadata(metadata_path)
    )

def process_document_collections(document_collections):
    """Load the paragraphs of every document in the given collections.

    Fills the module-level ``paragraph_corpus`` list with
    ``(document, paragraph)`` tuples, in collection order and then document
    order, printing progress along the way.  No topic modelling happens in
    this function; it only prepares the corpus.

    Args:
        document_collections: iterable of document collection names.

    Returns:
        None.  The result is left in ``paragraph_corpus``.
    """
    # Start from an empty corpus on every call.  Without this, re-running the
    # calling cell appended the same paragraphs a second time and the topic
    # model was then fitted on duplicated text.  Clearing in place (instead of
    # rebinding the name) keeps existing references to the list valid.
    paragraph_corpus.clear()

    #
    # Unlike the SDG --> Sentence semantic similarity, in this case
    # we need to process all the docs at once
    #
    for document_collection_name in document_collections:

        print(f'{document_collection_name}')
        document_metadata_filename = config.get_document_metadata_filename(document_collection_name)
        documents = load_document_metadata(document_metadata_filename)

        for document in documents:
            print(f'{document.local_filename}')

            # process_document returns a list of (document, paragraph) tuples
            paragraph_corpus.extend(process_document(document_collection_name, document))
            print(f'{len(paragraph_corpus)} total paragraphs')

    # Announce the corpus size; the fit itself is not performed here.
    print(f'Topic Modelling {len(paragraph_corpus)} paragraphs; Please wait...')

In [30]:
# Load the paragraphs of every collection into paragraph_corpus.
process_document_collections(['IPBES', 'IPCC', 'IUCN', 'MA', 'OKR', 'UNICEF'])
# Smaller subset, kept for reference:
#process_document_collections(['OKR', 'UNICEF'])

# Fit the topic model on the cleaned text of each paragraph.
topics, probs = model.fit_transform(
    [paragraph.clean_text for _, paragraph in paragraph_corpus]
)

IPBES
ipbes_assessment_report_ldra_EN.pdf
2533 total paragraphs
202111_2020 IPBES GLOBAL REPORT_FULL_DIGITAL_NOV 2021.pdf
5004 total paragraphs
IPCC
IPCC_AR6_WGI_Full_Report.pdf
7374 total paragraphs
IUCN
RL-267-001-En.pdf
7532 total paragraphs
2021-043-En.pdf
7794 total paragraphs
2021-034-En.pdf
7929 total paragraphs
2021-035-En.pdf
8226 total paragraphs
2020-002-En-Summ.pdf
8402 total paragraphs
2021-042-En.pdf
8902 total paragraphs
2021-036-En.pdf
10243 total paragraphs
MA
document.429.aspx.pdf
10423 total paragraphs
document.356.aspx.pdf
11084 total paragraphs
document.354.aspx.pdf
11587 total paragraphs
document.282.aspx.pdf
11855 total paragraphs
OKR
9781464818127.pdf
13497 total paragraphs
9781464818172.pdf
14065 total paragraphs
UNICEF
State-of-the-Worlds-Hand-Hygiene-report-2021.pdf
14418 total paragraphs
jmp-2021-wash-households_3.pdf
15413 total paragraphs
Topic Modelling 15413 paragraphs; Please wait...
huggingface/tokenizers: The current process just got forked, after par

In [31]:
# Bar-chart view of the fitted topic model (BERTopic built-in visualisation).
model.visualize_barchart()

In [32]:
# Use each paragraph's document title as its class label.
classes = [document.title for document, _ in paragraph_corpus]
docs = [paragraph.clean_text for _, paragraph in paragraph_corpus]
topics_per_class = model.topics_per_class(
    docs,
    topics,
    classes=classes,
)

# Plot the per-document topic breakdown, limited to 50 topics.
model.visualize_topics_per_class(
    topics_per_class,
    top_n_topics=50,
)

In [33]:
import pandas as pd

# One row per paragraph: document title, paragraph number, assigned topic id
# and the probability reported for that assignment.
df = pd.DataFrame(
    [
        (document.title, paragraph.paragraph_number, topic, prob)
        for (document, paragraph), topic, prob in zip(paragraph_corpus, topics, probs)
    ]
)
df

Unnamed: 0,0,1,2,3
0,The IPBES assessment report on land degradatio...,6,163,0.954911
1,The IPBES assessment report on land degradatio...,7,-1,0.000000
2,The IPBES assessment report on land degradatio...,9,77,0.418910
3,The IPBES assessment report on land degradatio...,13,3,1.000000
4,The IPBES assessment report on land degradatio...,24,3,0.911277
...,...,...,...,...
15408,"Progress on household drinking water, sanitati...",8,8,0.966017
15409,"Progress on household drinking water, sanitati...",9,8,0.676773
15410,"Progress on household drinking water, sanitati...",11,0,1.000000
15411,"Progress on household drinking water, sanitati...",12,0,1.000000


In [40]:
# Inspect topic 108; the displayed result is a list of (term, weight) pairs.
# NOTE(review): topic ids come from the fitted model, so 108 may point at a
# different topic after a re-fit — confirm before relying on the number.
model.get_topic(108)

[('urban', 0.05662432045765791),
 ('urbanization', 0.032408070752082165),
 ('infrastructure', 0.029760591950267336),
 ('countriesa', 0.02636790654881125),
 ('industrial', 0.024609193179964614),
 ('population', 0.02379736540685975),
 ('expansion', 0.021646506650837644),
 ('fastest', 0.019378805867260384),
 ('areas', 0.017589968655540134),
 ('rural', 0.016758706717772062)]

In [39]:
# Search the fitted topics with a free-text query; the displayed result pairs
# a list of topic ids with a list of matching scores.
model.find_topics('urban emission reduction')

([108, 27, 248, 1, 197],
 [0.48837225889821007,
  0.42385275735492345,
  0.4122117454034694,
  0.40832755861798375,
  0.4069519099064797])