# Install dependencies in a virtual environment

Install packages for efficiently parsing CSV files.

In [None]:
!pip install casanova

Install packages for NLP analysis.

In [None]:
!pip install bertopic==0.13.0 hdbscan nltk sentence_transformers scikit-learn umap

Also install `protobuf` version 3.20 because `SentenceTransformer` needs it.

In [None]:
!pip install protobuf==3.20.0

# Prepare data

We're going to be taking advantage of transformers and word embeddings, which in theory do not require pre-processing the sentences that are sent to the topic model. Nevertheless, some pre-processing can be useful (e.g. removing HTML tags). In our case, we will want to remove the phrase "Il faut" at the start of every proposition because it is not meaningful in the context of our corpus.

In [1]:
# Path to the input CSV file; its "Proposition" column is read by get_docs().
DATAFILE = "mieux-sinformer.csv" # change this according to file path

In [2]:
def special_preprocessing(string):
    """Normalise a proposition before it is sent to the topic model.

    The text is lower-cased and split on whitespace; when the first two
    tokens are "il" and "faut" they are dropped. The remaining tokens are
    joined back together with single spaces.
    """
    tokens = string.lower().split()
    # Skip the leading "il faut" when present, otherwise keep every token.
    start = 2 if tokens[:2] == ["il", "faut"] else 0
    return " ".join(tokens[start:])

With that helper function in place, we'll parse our data file and create a list of our documents. (list of strings)

In [3]:
import casanova

def get_docs(DATAFILE):
    """Read the "Proposition" column of a CSV file and pre-process every cell.

    Args:
        DATAFILE: Path of the CSV file to read.

    Returns:
        A list of strings, one per row, each passed through
        special_preprocessing().
    """
    # Explicit UTF-8 keeps accented characters intact whatever the platform
    # default encoding is; newline="" lets the CSV parser handle line breaks
    # that occur inside quoted cells.
    with open(DATAFILE, encoding="utf-8", newline="") as f:
        reader = casanova.reader(f)
        docs = [special_preprocessing(cell) for cell in reader.cells(column="Proposition")]
        print(f"Dataset includes {len(docs)} docs.")
        return docs

docs = get_docs(DATAFILE)

Dataset includes 1723 docs.


We will also need to prepare a list of stop words, which will be excluded from the topics' representations.

In [4]:
import nltk
from nltk.corpus import stopwords

# The stop-word corpus is not shipped with the nltk package itself:
# download it only when it is not available locally yet.
try:
    stopwords.words("french")
except LookupError:
    nltk.download("stopwords")

# NLTK's French stop words, completed with corpus-specific words that should
# not appear in the topics' representations.
stoplist = stopwords.words("french")
ADDITIONAL_STOPWORDS = ["plus", "chaque", "tout", "tous", "toutes", "toute", "leur", "leurs", "comme", "afin", "pendant", "lorsque"]
stoplist.extend(ADDITIONAL_STOPWORDS)

# Vectorize the data

### Word embeddings
The first step is to transform a string (sentence) into an array of numbers (vector) or, in other words, to "vectorize" the text. There are many ways to do this. By default, `BERTopic` favors word embeddings, which are currently the most powerful way to encode a document because they represent words' relationships to other words in the context of a sentence.

However, `BERTopic`'s default sentence transformer was trained on English-language texts. Because we are working with a corpus of sentences exclusively in French, we want to take advantage of a language model trained on French-language texts, such as the `CamemBERT` model. Fortunately, Van Tuan Dang ("dangvantuan" on HuggingFace), a data scientist at *La Javaness*, has trained and published a sentence transformer model based on the `CamemBERT` base model. We will simply import this pre-trained transformer.

In [5]:
from sentence_transformers import SentenceTransformer

# Two short French sentences of different lengths, used to inspect the encoder's output.
sentences = ['Bonjour tout le monde.', 'Ça va ?']

# Load the French sentence encoder and embed both sentences in a single call.
camembert_sentence_transformer = SentenceTransformer("dangvantuan/sentence-camembert-large")
example_embeddings = camembert_sentence_transformer.encode(sentences, show_progress_bar=True)

# Report, for each sentence, its character count next to the size of its embedding.
for position, (sentence, embedding) in enumerate(zip(sentences, example_embeddings), start=1):
    print(f"Sentence {position} has {len(sentence)} characters, and the embedding is {len(embedding)} long.")

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Sentence 1 has 22 characters, and the embedding is 1024 long.
Sentence 2 has 7 characters, and the embedding is 1024 long.


As seen in the cell above, the embeddings created by Dang's sentence transformer `sentence-camembert-large` have the same length, despite the fact that the sentences they represent are of different lengths. The embeddings have the same length because, rather than directly representing words as counts (as in a bag-of-words model), the transformer encodes every sentence as a dense vector with a fixed number of dimensions (1024 here).

### Construct the topic model with the desired parameters

Finally, having decided on an effective embedding model (very important!) and produced stop words for the topics' representations, we're ready to assemble the topic model.

In [6]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Imported here as well so that this cell does not depend on the earlier stop-word cell.
from nltk.corpus import stopwords

# Base French stop words and the corpus-specific additions used for the topics' representations.
stoplist = stopwords.words("french")
ADDITIONAL_STOPWORDS = ["plus", "chaque", "tout", "tous", "toutes", "toute", "leur", "leurs", "comme", "afin", "pendant", "lorsque"]

def set_model_parameters(embedding_model, random_state=None):
    """Assemble a BERTopic model from explicitly configured components.

    Args:
        embedding_model: Model handed to BERTopic as `embedding_model`
            (the notebook passes a SentenceTransformer).
        random_state: Optional seed forwarded to UMAP. The default (None)
            leaves UMAP unseeded; pass an integer to make the dimensionality
            reduction repeatable from one run to the next.

    Returns:
        A BERTopic instance that has not been fitted yet.
    """

    # Step 1 - Extract embeddings: done by `embedding_model`, passed to BERTopic below.

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(angular_rp_forest=True, metric='cosine', n_components=10, n_neighbors=30, min_dist=0.1, random_state=random_state)

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=13, min_samples=3, prediction_data=True, metric='euclidean', cluster_selection_method='eom')

    # Step 4 - Tokenize topics
    # Build a local, de-duplicated stop-word list instead of extending the
    # module-level `stoplist`, which would grow on every call.
    stop_words = list(dict.fromkeys([*stoplist, *ADDITIONAL_STOPWORDS]))
    vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3))

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)

    # Topic model
    return BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        diversity=0.5,
        n_gram_range=(1,3),
        nr_topics='auto'
    )

# Fit the topic model

Download the pre-trained embedding model from `SentenceTransformers` and assign it to the variable `embedding_model`. Then use that model to encode all the documents in our corpus.

In [7]:
from sentence_transformers import SentenceTransformer

# Load the French sentence encoder that is also handed to the topic model.
embedding_model = SentenceTransformer("dangvantuan/sentence-camembert-large")
# Encode with the model loaded just above, so this cell does not rely on the
# `camembert_sentence_transformer` variable left over from the example cell.
embeddings = embedding_model.encode(docs, show_progress_bar=True)

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Create an instance of the model with all the parameters defined and our selected embedding model.

In [8]:
# Build the BERTopic instance configured by set_model_parameters() around the loaded embedding model.
topic_model = set_model_parameters(embedding_model)

Fit the model to the documents in our corpus and to the embeddings we prepared.

In [9]:
# Fit on the documents together with the embeddings computed for them in the previous cell.
topic_model.fit(docs, embeddings)

<bertopic._bertopic.BERTopic at 0x33fabe800>

In [10]:
# One label per topic: the topic number followed by five of its terms, joined by '  --  '.
topic_model.generate_topic_labels(nr_words=5, topic_prefix=True, word_length=None, separator='  --  ')

['-1  --  sans  --  journalistes  --  sociaux  --  médias  --  sujets',
 '0  --  école  --  esprit critique  --  éducation médias information  --  développer  --  apprendre',
 '1  --  financement  --  concentration médias  --  milliardaires  --  indépendance médias  --  service public',
 '2  --  buzz  --  journalistes  --  faits divers  --  sujets  --  arrêter',
 '3  --  sanctionner  --  fake news  --  pubs  --  supprimer anonymat  --  arnaques',
 '4  --  rendre accessible  --  acheter  --  payer  --  presse écrite  --  netflix',
 '5  --  charte munich  --  déontologie journalistique  --  professionnelle  --  ordre  --  conseil déontologie',
 '6  --  gauche  --  orientation  --  lfi  --  partiaux  --  partis',
 '7  --  nutriscore  --  créer label  --  mettre place  --  score fiabilité  --  ia',
 '8  --  fake news médias  --  news créer  --  chargé  --  unité  --  dénoncer',
 '9  --  modérer  --  bulles filtre  --  confortent  --  réseaux sociaux favorisent  --  encadrer algorithmes',
 

In [11]:
# Summary table with one row per topic: topic number, document count and generated name.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,572,-1_sans_journalistes_sociaux_médias
1,0,310,0_école_esprit critique_éducation médias infor...
2,1,223,1_financement_concentration médias_milliardair...
3,2,149,2_buzz_journalistes_faits divers_sujets
4,3,137,3_sanctionner_fake news_pubs_supprimer anonymat
5,4,69,4_rendre accessible_acheter_payer_presse écrite
6,5,47,5_charte munich_déontologie journalistique_pro...
7,6,45,6_gauche_orientation_lfi_partiaux
8,7,35,7_nutriscore_créer label_mettre place_score fi...
9,8,33,8_fake news médias_news créer_chargé_unité


In [12]:
from datetime import date
from pathlib import Path

# ISO date (YYYY-MM-DD) used as a prefix of the saved model's file name.
today = str(date.today())

# Make sure the target directory exists; saving into a missing directory fails.
Path('models').mkdir(parents=True, exist_ok=True)
topic_model.save(f'models/{today}_bertopic.model')

### Explore the model's predictions

In [13]:
from bertopic import BERTopic
# if loading saved model, run cells 6 and 7
# NOTE(review): this file name is hard-coded and uses a DD-MM-YYYY prefix, whereas the
# save cell names its file with str(date.today()) (YYYY-MM-DD) — confirm that this
# exact file exists before running.

topic_model = BERTopic.load('models/13-03-2023_bertopic.model')

In [14]:
# Reload the saved model first — presumably so that re-running this cell always
# starts from the unmerged topics rather than merging an already merged model.
topic_model = BERTopic.load('models/13-03-2023_bertopic.model')
# First round of merges. Topic 3 appears in both groups.
# NOTE(review): the topic numbers are specific to the saved model loaded above.
topics_to_merge = [
    [3,4,6],[3,15]
]
topic_model.merge_topics(docs, topics_to_merge)
# Second round of merges.
# NOTE(review): these numbers presumably refer to the numbering that results from
# the first merge — confirm against get_topic_info().
topics_to_merge = [
    [4,12]
]
topic_model.merge_topics(docs, topics_to_merge)
# Bar chart of the 7 top words for up to 17 topics.
barchart = topic_model.visualize_barchart(top_n_topics=17, title="<b>Représentation des topics</b>", width=400, n_words=7)

# barchart.write_html('topic_visualisations/barchart.html')
barchart.show(renderer='iframe_connected')

In [15]:
# Topic summary table after the merges applied in the previous cell.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,569,-1_sans_éducation médias_jeunes_donner
1,0,263,0_journalistes_politiques_sujets_débats
2,1,210,1_financement_concentration médias_milliardair...
3,2,178,2_fake news_vérifiée_qualité_plusieurs
4,3,152,3_réseaux sociaux_éducation médias information...
5,4,85,4_esprit_développer_développer esprit critique...
6,5,51,5_rendre accessible_papier_abonnements gratuit...
7,6,47,6_europe_chaînes information continu_france té...
8,7,46,7_news_propos_diffusant_élus
9,8,45,8_éthique_charte munich_déontologie journalist...


In [16]:
# Human-readable label for each topic number, shown in the visualisations
# when custom labels are enabled.
topic_labels_dict = {
    0:"L'opinion & le journalisme",
    1:"Financement & l'indépendance des médias",
    2:"Désinformation",
    3:"Formation au secondaire",
    4:"Formation au primaire",
    5:"Accès à l'information",
    6:"Chaînes d'information en continu",
    7:"Législation",
    8:"Éthique du journalisme",
    9:"Désanonymisation en ligne",
    10:"Arnaques & influenceurs",
    11:"Échelles des médias",
    12:"Enseignement & l'EMI"
}

# Register the custom labels on the model, then redraw the bar chart with them.
topic_model.set_topic_labels(topic_labels_dict)
barchart = topic_model.visualize_barchart(top_n_topics=17, custom_labels=True, title="<b>Représentation des topics</b>", width=800, n_words=7)

# barchart.write_html('../docs/topic_visualisations/barchart.html')
barchart.show(renderer='iframe_connected')

In [17]:
# Compute the hierarchy between topics from the documents.
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Text rendering of that hierarchy; leaves are marked with their topic number.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:05<00:00,  2.29it/s]

.
├─réseaux_professeurs documentalistes_esprit critique_emi_jeunes
│    ├─■──esprit_développer_développer esprit critique_analyser_âge ── Topic: 4
│    └─réseaux sociaux_professeurs documentalistes_emi_éducation médias information_algorithmes
│         ├─■──réseaux sociaux_éducation médias information_professeurs documentalistes_algorithmes_responsables ── Topic: 3
│         └─■──emploi temps élèves_emi professeurs_élèves_donner moyens professeurs_enseignement emi ── Topic: 12
└─journalistes_média_créer_indépendants_interdire
     ├─pubs_arnaques_identité_fake news_auteurs
     │    ├─■──anonymat réseaux sociaux_réseaux sociaux interdire_sociaux interdire_supprimer_vrai nom ── Topic: 9
     │    └─pubs_arnaques_fake news_auteurs_financièrement
     │         ├─■──news_propos_diffusant_élus_sanctionner médias ── Topic: 7
     │         └─■──sanctionner youtube lorsqu_pubs arnaques_médias informations_influenceurs_pub institutionnelles déno ── Topic: 10
     └─journalistes_média_sources_




In [18]:
# Compute the hierarchy between topics from the documents.
hierarchical_topics = topic_model.hierarchical_topics(docs)

# `custom_labels` is a switch: the labels themselves are the ones registered
# on the model with set_topic_labels().
hierarchy = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, width=1200, custom_labels=True)

# hierarchy.write_html('../docs/topic_visualisations/hierarchy.html')
hierarchy.show(renderer='iframe_connected')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:05<00:00,  2.23it/s]


In [19]:
# Topic heatmap. `custom_labels` is a switch: the labels themselves are the
# ones registered on the model with set_topic_labels().
heatmap = topic_model.visualize_heatmap(n_clusters=6, custom_labels=True, width=1200)

# heatmap.write_html('../docs/topic_visualisations/heatmap.html')
heatmap.show(renderer='iframe_connected')

In [20]:
# Term-rank plot for the topics, drawn with the log_scale option enabled.
topic_model.visualize_term_rank(log_scale=True).show(renderer='iframe_connected')

In [21]:
# Interactive overview of the topics, rendered in an iframe.
topic_model.visualize_topics().show(renderer='iframe_connected')

In [22]:
from umap import UMAP
from sentence_transformers import SentenceTransformer

# Encode the documents with the French sentence encoder.
embedding_model = SentenceTransformer("dangvantuan/sentence-camembert-large")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Project the embeddings to 2 dimensions for the document plot. The same
# model is no longer loaded a second time into an unused variable.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


Batches:   0%|          | 0/54 [00:00<?, ?it/s]

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


In [23]:
# Scatter plot of the documents in the 2-D space computed above (sample=0.1).
# `custom_labels` is a switch: the labels themselves are the ones registered
# on the model with set_topic_labels().
doc_visualisation = topic_model.visualize_documents(docs=docs, reduced_embeddings=reduced_embeddings, width=1800, height=750, sample=0.1, custom_labels=True)

# doc_visualisation.write_html('../docs/topic_visualisations/doc_visualisation.html')
doc_visualisation.show(renderer='iframe_connected')

Write the results to a CSV file.

In [24]:
import casanova

PREDICTIONS_FILE = 'bertopic_topics_notebook.csv'

# Columns of the document-info table that are appended to every input row,
# in the same order as the names given to the enricher's `add` argument below.
RESULT_COLUMNS = ["Document", "Topic", "Name", "CustomName", "Top_n_words", "Probability", "Representative_document"]

# One row of topic information per document, in the order of `docs`.
results = topic_model.get_document_info(docs=docs)
predictions = results[RESULT_COLUMNS].itertuples(index=False, name=None)

# Explicit UTF-8 keeps accented characters intact on every platform;
# newline="" leaves line-ending handling to the CSV reader and writer.
with open(DATAFILE, encoding="utf-8", newline="") as f, open(PREDICTIONS_FILE, 'w', encoding="utf-8", newline="") as of:
    enricher = casanova.enricher(f, of, add=["document", "topic", "name", "custom_name", "top_n_words", "probability", "representative_document"])
    for row in enricher:
        prediction = next(predictions, None)
        if prediction is None:
            # The input file has more rows than there are predictions.
            raise ValueError(f"{DATAFILE} has more rows than the {len(results)} documents with predictions.")
        enricher.writerow(row=row, add=list(prediction))