# Install dependencies in a virtual environment

Install packages for efficiently parsing CSV files.

In [None]:
!pip install casanova

Install packages for NLP analysis.

In [None]:
!pip install bertopic hdbscan nltk sentence_transformers scikit-learn umap

Also install `protobuf` version 3.20 because `SentenceTransformer` needs it.

In [None]:
!pip install protobuf==3.20.0

Install tools for visualizing the results.

In [None]:
!pip install networkx ipysigma plotly install pelote

# Prepare data

We're going to be taking advantage of transformers and word-embedding, which in theory do not require pre-processing sentences that are sent to the topic model. Nevertheless, some pre-processing can be useful (i.e. removing HTML tags). In our case, we will want to remove the phrase "Il faut" at the start of every proposition because it is not meaningful in the context of our corpus.

In [2]:
DATAFILE = "mieux-sinformer.csv" # change this according to file path

In [4]:
def special_preprocessing(string):
    string = string.lower()
    bow = string.split()
    if bow[:2] == ["il", "faut"]:
        bow = bow[2:]
    return " ".join(bow)

With that helper function in place, we'll parse our data file and create a list of our documents. (list of strings)

In [5]:
import casanova

with open(DATAFILE) as f:
    reader = casanova.reader(f)
    docs = [special_preprocessing(cell) for cell in reader.cells(column="Proposition")]
    print(f"Dataset includes {len(docs)} docs.")

Dataset includes 1723 docs.


We will also need to prepare a list of stop words, which will be excluded from the topics' representations.

In [4]:
from nltk.corpus import stopwords

stoplist = stopwords.words("french")
ADDITIONAL_STOPWORDS = ["plus", "chaque", "tout", "tous", "toutes", "toute", "leur", "leurs", "comme", "afin", "pendant", "lorsque"]
stoplist.extend(ADDITIONAL_STOPWORDS)

# Vectorize the data

### Word embeddings
The first step is to transform a string (sentence) into an array of numbers (vector) or in other words, to "vectorize" the text. There are many ways to do this. By default, `BERTopic`'s privileges word embeddings, which are currently the most powerful way to encode a document because they represent words' relationships to other words in the context of a sentence.

However, `BERTopic`'s default sentence transformer was trained on English-language texts. Because we are working with a corpus of sentences exclusively in French, we want to take advantage of a language model trained on French-language texts, such as the `CamemBERT` model. Fortunately, a data scientist at *La Javaness* Van Tuan Dang ("dangvantuan" on HuggingFace) has trained and published a sentence transformer model based on the `CamemBERT` base model. We will simply import this pre-trained transformer.

In [18]:
from sentence_transformers import SentenceTransformer

sentences = ['Bonjour tout le monde.', 'Ça va ?']

camembert_sentence_transformer = SentenceTransformer("dangvantuan/sentence-camembert-large")
example_embeddings = camembert_sentence_transformer.encode(sentences, show_progress_bar=True)

print(f"Sentence 1 has {len(sentences[0])} characters, and the embedding is {len(example_embeddings[0])} long.")
print(f"Sentence 2 has {len(sentences[1])} characters, and the embedding is {len(example_embeddings[1])} long.")

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Sentence 1 has 22 characters, and the embedding is 1024 long.
Sentence 2 has 7 characters, and the embedding is 1024 long.


As seen in the cell above, the embeddings created by Dang's sentence transformer `sentence-camembert-large` have the same length, despite the fact that the sentences the arrays represent are different lenghts. The embeddings have the same length because, rather than directly representing words as numbers (aka bag of words), the transformer creates a rich representation that takes into account 1024 dimensions.

### Construct the topic model with the desired parameters

Finally, having decided on an effective embedding model (very important!) and produced stop words for the topics' representations, we're ready to assemble the topic model.

In [35]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

stoplist = stopwords.words("french")
ADDITIONAL_STOPWORDS = ["plus", "chaque", "tout", "tous", "toutes", "toute", "leur", "leurs", "comme", "afin", "pendant", "lorsque"]

def set_model_parameters(embedding_model):

    # Step 1 - Extract embeddings
    embedding_model = embedding_model

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(angular_rp_forest=True, metric='cosine', n_components=10, n_neighbors=30, min_dist=0.1)

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=13, min_samples=3, prediction_data=True, metric='euclidean', cluster_selection_method='eom')

    # Step 4 - Tokenize topics
    stoplist.extend(ADDITIONAL_STOPWORDS)
    vectorizer_model = CountVectorizer(stop_words=stoplist, ngram_range=(1, 3))

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)

    # Topic model
    return BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        diversity=0.5,
        n_gram_range=(1,3),
        nr_topics='auto'
    )

# Fit the topic model

Download the pre-trained embedding model from `SentenceTransformers` and attribute it to the variable `embedding model`. Then use that model to encode all the documents in our corpus.

In [23]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("dangvantuan/sentence-camembert-large")
embeddings = camembert_sentence_transformer.encode(docs, show_progress_bar=True)

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


NameError: name 'camembert_sentence_transformer' is not defined

Create an instance of the model with all the parameters defined and our selected embedding model.

In [36]:
topic_model = set_model_parameters(embedding_model)

Fit the model to the documents in our corpus and to the embeddings we prepared.

In [52]:
topic_model.fit(docs, embeddings)

<bertopic._bertopic.BERTopic at 0x3febfead0>

In [53]:
topic_model.generate_topic_labels(nr_words=5, topic_prefix=True, word_length=None, separator='  --  ')

['-1  --  éducation médias  --  jeunes  --  donner  --  citoyens  --  informer',
 '0  --  financement  --  concentration médias  --  milliardaires  --  indépendants  --  indépendance médias',
 '1  --  fake news  --  vérifiée  --  qualité  --  plusieurs  --  sources information',
 '2  --  réseaux sociaux  --  éducation médias information  --  professeurs documentalistes  --  facebook  --  responsables',
 '3  --  experts  --  opinion  --  différence entre  --  pensée  --  gauche',
 '4  --  sujets  --  buzz  --  faits divers  --  jt  --  info',
 '5  --  analyser  --  école  --  apprendre  --  jeune âge  --  développer',
 '6  --  arrêtent  --  conditionnel  --  sans  --  être  --  essaient',
 '7  --  presse  --  acheter  --  rendre accessible  --  papier  --  gratuité',
 '8  --  continu  --  ue  --  chaînes information  --  belges  --  affranchir',
 '9  --  auteurs  --  financièrement  --  amendes  --  diffusant  --  élus',
 '10  --  éthique  --  charte munich  --  conseil déontologie  -- 

In [54]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,569,-1_éducation médias_jeunes_donner_citoyens
1,0,210,0_financement_concentration médias_milliardair...
2,1,178,1_fake news_vérifiée_qualité_plusieurs
3,2,152,2_réseaux sociaux_éducation médias information...
4,3,118,3_experts_opinion_différence entre_pensée
5,4,73,4_sujets_buzz_faits divers_jt
6,5,68,5_analyser_école_apprendre_jeune âge
7,6,56,6_arrêtent_conditionnel_sans_être
8,7,51,7_presse_acheter_rendre accessible_papier
9,8,47,8_continu_ue_chaînes information_belges


In [102]:
topic_model.save('models/11-04-2023_bertopic.model')

### Explore the model's predictions

In [6]:
from bertopic import BERTopic
# optional
topic_model = BERTopic.load('models/13-03-2023_bertopic.model')

In [7]:
topic_model = BERTopic.load('models/13-03-2023_bertopic.model')
topics_to_merge = [
    [3,4,6],[3,15]
]
topic_model.merge_topics(docs, topics_to_merge)
topics_to_merge = [
    [4,12]
]
topic_model.merge_topics(docs, topics_to_merge)
barchart = topic_model.visualize_barchart(top_n_topics=17, title="<b>Représentation des topics</b>", width=400, n_words=7)

barchart.write_html('topic_visualisations/barchart.html')
barchart.show(renderer='iframe_connected')

In [14]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName
0,-1,569,-1_sans_éducation médias_jeunes_donner,-1_sans_éducation médias_jeunes_donner
1,0,263,0_journalistes_politiques_sujets_débats,L'opinion & le journalisme
2,1,210,1_financement_concentration médias_milliardair...,Financement & indépendance des médias
3,2,178,2_fake news_vérifiée_qualité_plusieurs,Désinformation
4,3,152,3_réseaux sociaux_éducation médias information...,Formation au secondaire
5,4,85,4_esprit_développer_développer esprit critique...,Formation au primaire
6,5,51,5_rendre accessible_papier_abonnements gratuit...,Accès à l'information
7,6,47,6_europe_chaînes information continu_france té...,Chaînes d'information en continu
8,7,46,7_news_propos_diffusant_élus,Législation
9,8,45,8_éthique_charte munich_déontologie journalist...,Éthique du journalisme


In [8]:
topic_labels_dict = {
    0:"L'opinion & le journalisme",
    1:"Financement & indépendance des médias",
    2:"Désinformation",
    3:"Formation au secondaire",
    4:"Formation au primaire",
    5:"Accès à l'information",
    6:"Chaînes d'information en continu",
    7:"Législation",
    8:"Éthique du journalisme",
    9:"Désanoymisation en ligne",
    10:"Arnaques & influenceurs",
    11:"Échelles des médias",
    12:"Enseignement & l'EMI"   
}

topic_model.set_topic_labels(topic_labels_dict)
barchart = topic_model.visualize_barchart(top_n_topics=17, custom_labels=True, title="<b>Représentation des topics</b>", width=800, n_words=7)

barchart.write_html('topic_visualisations/barchart.html')
barchart.show(renderer='iframe_connected')

In [10]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:05<00:00,  2.21it/s]

.
├─réseaux_professeurs documentalistes_esprit critique_emi_jeunes
│    ├─■──esprit_développer_développer esprit critique_analyser_âge ── Topic: 4
│    └─réseaux sociaux_professeurs documentalistes_emi_éducation médias information_algorithmes
│         ├─■──réseaux sociaux_éducation médias information_professeurs documentalistes_algorithmes_responsables ── Topic: 3
│         └─■──emploi temps élèves_emi professeurs_élèves_donner moyens professeurs_enseignement emi ── Topic: 12
└─journalistes_média_créer_indépendants_interdire
     ├─pubs_arnaques_identité_fake news_auteurs
     │    ├─■──anonymat réseaux sociaux_réseaux sociaux interdire_sociaux interdire_supprimer_vrai nom ── Topic: 9
     │    └─pubs_arnaques_fake news_auteurs_financièrement
     │         ├─■──news_propos_diffusant_élus_sanctionner médias ── Topic: 7
     │         └─■──sanctionner youtube lorsqu_pubs arnaques_médias informations_influenceurs_pub institutionnelles déno ── Topic: 10
     └─journalistes_média_sources_




In [13]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

hierarchy = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, width=1200, custom_labels=topic_labels_dict)

hierarchy.write_html('topic_visualisations/hierarchy.html')
hierarchy.show(renderer='iframe_connected')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:05<00:00,  2.26it/s]


In [15]:
heatmap = topic_model.visualize_heatmap(n_clusters=6, custom_labels=topic_labels_dict, width=1200)

heatmap.write_html('topic_visualisations/heatmap.html')
heatmap.show(renderer='iframe_connected')

In [95]:
topic_model.visualize_term_rank(log_scale=True).show(renderer='iframe_connected')

In [16]:
topic_model.visualize_topics().show(renderer='iframe_connected')

In [19]:
from umap import UMAP
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("dangvantuan/sentence-camembert-large")
embeddings = camembert_sentence_transformer.encode(docs, show_progress_bar=True)

camembert_sentence_transformer = SentenceTransformer("dangvantuan/sentence-camembert-large")
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


Batches:   0%|          | 0/54 [00:00<?, ?it/s]

No sentence-transformers model found with name /Users/kelly.christensen/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


In [20]:
doc_visualisation = topic_model.visualize_documents(docs=docs, reduced_embeddings=reduced_embeddings, width=1800, height=750, sample=0.1, custom_labels=topic_labels_dict)


doc_visualisation.write_html('topic_visualisations/doc_visualisation.html')
doc_visualisation.show(renderer='iframe_connected')

Print the results to a CSV file.

In [9]:
import casanova

PREDICTIONS_FILE = 'bertopic_topics_notebook.csv'

results = topic_model.get_document_info(docs=docs)
results.to_csv()
with open(DATAFILE) as f, open(PREDICTIONS_FILE, 'w') as of:
    enricher = casanova.enricher(f, of, add=["document", "topic", "name", "custom_name", "top_n_words", "probability", "representative_document"])
    for i, row in enumerate(enricher):
        enricher.writerow(row=row, add=[results["Document"][i], results["Topic"][i], results["Name"][i], results["CustomName"][i], results["Top_n_words"][i], results["Probability"][i], results["Representative_document"][i]])

# Visualize the results

In a dictionary, index the documents and their metadata by each document's Id.

In [26]:
import csv

with open(PREDICTIONS_FILE) as f:
    reader = csv.DictReader(f)
    for row in reader:
        doc_index = {row["Id"]:row for row in reader}

While parsing the matrix file, create nodes and edges and label them with data from the topic predictions.

In [27]:
MATRIX_FILE = 'defacto_covotes.csv'

In [28]:
import networkx as nx
from statistics import mean
import casanova

with open(MATRIX_FILE) as f:
    reader = casanova.reader(f)
    pid1_pos = reader.headers['pid1']
    pid2_pos = reader.headers['pid2']
    vote1_pos = reader.headers['vote1']
    vote2_pos = reader.headers['vote2']
    count_pos = reader.headers['count']
    
    G = nx.Graph()
    for row in reader:
        pid1 = row[pid1_pos]
        pid2 = row[pid2_pos]

        # If the matrix refers to documents not in the original data, skip 
        if not doc_index.get(pid1) or not doc_index.get(pid2):
            continue

        # Base an edge's weight on the support that two related propositions received
        average_nb_votes = mean([int(doc_index[pid1]["Nb de votes"]), int(doc_index[pid2]["Nb de votes"])])
        weight = int(row[count_pos])/average_nb_votes

        # Unless already added to the Graph, add both nodes in the matrix row and create an edge between them
        print(doc_index[pid1])
        if not G.has_node(pid1) and not str(doc_index[pid1]["topic"]) == "-1":
            G.add_node(pid1, label=doc_index[pid1]["Proposition"], **doc_index[pid1])

        if not G.has_node(pid2) and not str(doc_index[pid2]["topic"]) == "-1":
            G.add_node(pid2, label=doc_index[pid2]["Proposition"], **doc_index[pid2])

        if row[vote1_pos] == row[vote2_pos]:  # To-do: test if we can try all 3 types of cases
            G.add_edge(pid1, pid2, weight=weight)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
import pelote

GEFX_FILE = 'sparsification.gexf'

H = pelote.multiscale_backbone(G, alpha=0.05)
nx.write_gexf(H, GEFX_FILE)

In [None]:
import networkx as nx
from ipysigma import Sigma

In [None]:
# Importing a gexf graph
g = nx.read_gexf(GEFX_FILE)

In [None]:
# Displaying the graph with a size mapped on degree and
# a color mapped on a categorical attribute of the nodes
Sigma(g, node_size=g.degree, node_color='topic')