In [1]:
import re
import time

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer,util
import torch

# Local application imports
from metadata.document_metadata import *
from metadata.paragraph_metadata import *
from metadata.nlp_metadata import *
from config import config
from sdgs.sustainable_development_goals import *
from clean.clean_text import normalize_unicode

Using GPU: True


In [2]:
model = BERTopic(embedding_model='all-MiniLM-L6-v2')

paragraph_corpus = []

def process_document(document_collection_name, document):
    # Build a collection of tuples (document, paragraph)
    return list(build_paragraph_corpus(document_collection_name, document))


def build_paragraph_corpus(document_collection_name, document):
    paragraph_metadata_filename = config.get_paragraph_metadata_filename(document_collection_name, document.local_filename)
    paragraphs = load_paragraph_metadata(paragraph_metadata_filename)

    for paragraph in paragraphs:
        yield (document, paragraph)

def process_document_collections(document_collections):
    
    #
    # Unlike the SDG --> Sentence semantic similarity, in this case
    # we need to process all the docs at once
    #
    for document_collection_name in document_collections:

        print(f'{document_collection_name}')
        document_metadata_filename = config.get_document_metadata_filename(document_collection_name)
        documents = load_document_metadata(document_metadata_filename)
        
        for document in documents:
            print(f'{document.local_filename}')
            
            # Process document retuns a list of tuples of (document, paragraph)
            paragraph_corpus.extend(process_document(document_collection_name, document))
            print(f'{len(paragraph_corpus)} total paragraphs')
    
    # Build embeddings
    print(f'Topic Modelling {len(paragraph_corpus)} paragraphs; Please wait...')

In [3]:
#process_document_collections(['IPBES', 'IPCC', 'IUCN', 'MA', 'OKR', 'UNICEF'])
process_document_collections(['OKR', 'UNICEF'])

topics, probs = model.fit_transform([p[1].clean_text for p in paragraph_corpus])

OKR
9781464818127.pdf
1642 total paragraphs
9781464818172.pdf
2210 total paragraphs
UNICEF
State-of-the-Worlds-Hand-Hygiene-report-2021.pdf
2563 total paragraphs
jmp-2021-wash-households_3.pdf
3558 total paragraphs
Topic Modelling 3558 paragraphs; Please wait...


In [None]:
model.visualize_barchart()

In [None]:
classes = [p[0].title for p in paragraph_corpus]
docs = [p[1].clean_text for p in paragraph_corpus]
topics_per_class = model.topics_per_class(docs, topics, classes=classes)

model.visualize_topics_per_class(topics_per_class, top_n_topics=50)

In [4]:
import pandas as pd

df = pd.DataFrame(zip(
    [p[0].title for p in paragraph_corpus],
    [p[1].paragraph_number for p in paragraph_corpus],
    topics,
    probs))
df

Unnamed: 0,0,1,2,3
0,The IPBES assessment report on land degradatio...,6,176,1.000000
1,The IPBES assessment report on land degradatio...,7,235,0.305745
2,The IPBES assessment report on land degradatio...,9,79,0.247212
3,The IPBES assessment report on land degradatio...,13,0,0.835464
4,The IPBES assessment report on land degradatio...,24,-1,0.000000
...,...,...,...,...
15408,"Progress on household drinking water, sanitati...",8,8,0.835230
15409,"Progress on household drinking water, sanitati...",9,8,0.835064
15410,"Progress on household drinking water, sanitati...",11,21,1.000000
15411,"Progress on household drinking water, sanitati...",12,76,0.953132


In [18]:
model.get_topic_info(3)['Name'].item()

'3_women_law_business_laws'

In [25]:
model.get_representative_docs(3)

['0/ C an as fective, on par with oral rehydration therapy and most childhood vaccinations.32 A 2012 study by the Organization for Economic Co-operation and Development suggests that, in the organizations member states, investments in hand hygiene in health care facilities generate savings in health expenditure that are, on average, 15 times the implementation costs.33',
 'Such results are hard to interpret alone. However, the Disease Control Priorities project provides combined assessments of the cost-effectiveness of health interventions, measured in terms of the extent to which they can avert disability-adjusted life years. DALYs are the sum of years of potential life lost due to premature mortality and the years of productive life lost due to disability. In 2016, drawing on the study in Burkina Faso, the DCP project estimated that the cost for every DALY averted through handwashing was US$88-225. On this basis, the DCP project rated handwashing as a very cost-effective intervention

In [16]:
model.find_topics("Cyprus")

([223, 167, 162, 209, 138],
 [0.6093231769834813,
  0.5233966553222298,
  0.4966849238040982,
  0.4891034503732047,
  0.48801232719004406])

In [78]:
model.get_topic(9)

[('ice', 0.058086001666427925),
 ('mass', 0.02671803802367059),
 ('antarctic', 0.02637503963887775),
 ('arctic', 0.02578550143288435),
 ('snow', 0.023676150231618016),
 ('sea', 0.023338921286116608),
 ('glaciers', 0.02119118687775453),
 ('sheet', 0.02080255052706511),
 ('glacier', 0.014404248857851108),
 ('since', 0.012866614402918005)]