In [1]:
# Import des modules généraux
import pandas as pd
import re
from collections import Counter
from dotenv import load_dotenv
import os

# Import des modules de NLP
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer

import spacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector

from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Import des modules de clustering
from hdbscan import HDBSCAN
from umap import UMAP

# Import des modules de visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Import de openai pour la représentation des topics améliorée
import openai

# Chargement des variables d'environnement et initialisations
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
nlp = spacy.load("fr_core_news_md")

def get_lang_detector(nlp, name):
    return LanguageDetector(seed=42)  # We use the seed 42

Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)

<spacy_language_detection.spacy_language_detector.LanguageDetector at 0x1e0de098150>

In [2]:
# Texte complet contenant tous les articles (data\Articles CleRMa.xlsx)
df = pd.read_excel("data/Articles CleRMa.xlsx")

# Les year qui n'existent pas sont des NaN, on les drop
df = df.dropna(subset=['Year'])

# Les year sont des float, on les convertit en int
df['Year'] = df['Year'].astype(int)
df

Unnamed: 0,Cites,Authors,Title,Year,Source,Publisher,ArticleURL,CitesURL,GSRank,QueryDate,...,StartPage,EndPage,ECC,CitesPerYear,CitesPerAuthor,AuthorCount,Age,Abstract,FullTextURL,RelatedURL
0,677,"A Aouadi, S Marsat",Do ESG controversies matter for firm value? Ev...,2018,Journal of business ethics,Springer,https://link.springer.com/article/10.1007/s105...,https://scholar.google.com/scholar?cites=74317...,6.0,2024-03-19 21:03:25,...,,,677.0,112.83,339.0,2.0,6.0,The aim of this paper is to investigate the re...,https://www.jstor.org/stable/pdf/45022714.pdf?...,https://scholar.google.com/scholar?q=related:V...
1,193,D Talbot,Les institutions créatrices de proximités: Ins...,2008,Revue d'économie régionale &urbaine,cairn.info,https://www.cairn.info/revue-d-economie-region...,https://scholar.google.com/scholar?cites=31866...,10.0,2024-03-20 08:00:14,...,,,193.0,12.06,193.0,1.0,16.0,En mobilisant les apports des institutionnalis...,https://www.cairn.info/revue-d-economie-region...,https://scholar.google.com/scholar?q=related:2...
2,177,"M Arouri, G Pijourlet",CSR performance and the value of cash holdings...,2017,Journal of Business Ethics,Springer,https://link.springer.com/article/10.1007/s105...,https://scholar.google.com/scholar?cites=17119...,1.0,2024-03-19 20:53:14,...,,,177.0,25.29,89.0,2.0,7.0,"Using a worldwide sample, we examine whether c...",https://www.jstor.org/stable/pdf/44164262.pdf?...,https://scholar.google.com/scholar?q=related:8...
3,166,"P Lacomme, M Larabi, N Tchernev",Job-shop based framework for simultaneous sche...,2013,International Journal of Production …,Elsevier,https://www.sciencedirect.com/science/article/...,https://scholar.google.com/scholar?cites=13514...,1.0,2024-03-20 08:02:20,...,,,166.0,15.09,55.0,3.0,11.0,This paper deals with the problem of simultane...,https://scholar.google.com/scholar?output=inst...,https://scholar.google.com/scholar?q=related:x...
4,165,"F Aubert, G Grudnitski",The impact and importance of mandatory adoptio...,2011,… Financial Management &Accounting,Wiley Online Library,https://onlinelibrary.wiley.com/doi/abs/10.111...,https://scholar.google.com/scholar?cites=10244...,4.0,2024-03-19 20:40:42,...,,,165.0,12.69,83.0,2.0,13.0,… significant relationship between accounting ...,https://onlinelibrary.wiley.com/doi/pdf/10.111...,https://scholar.google.com/scholar?q=related:N...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,0,"C Zumbo-lebrument, N Lebrument, ...",Quels sont les déterminants de l'acceptation d...,2021,Systèmes d'Information et …,revuesim.org,http://revuesim.org/index.php/sim/article/view...,,66.0,2024-03-20 07:59:28,...,,,0.0,0.00,0.0,3.0,3.0,"In recent years, the smart mobility inherent i...",http://revuesim.org/index.php/sim/article/view...,https://scholar.google.com/scholar?q=related:-...
980,0,RM Borges,Les brevets sur les inventions biotechnologiqu...,2013,The Innovation Journal,innovation.cc,https://innovation.cc/wp-content/uploads/2013_...,,2.0,2024-03-19 20:41:41,...,,,0.0,0.00,0.0,1.0,11.0,Les demandes de brevet portant sur des végétau...,https://innovation.cc/wp-content/uploads/2013_...,https://scholar.google.com/scholar?q=related:E...
981,0,"F Cassiere, G Paché",Internationalization of large retailers and cr...,2010,Timisoara Journal of Economics,ideas.repec.org,https://ideas.repec.org/a/wun/journl/tjev03y20...,,9.0,2024-03-19 20:41:57,...,,,0.0,0.00,0.0,2.0,14.0,This article discusses the large retailers' in...,,https://scholar.google.com/scholar?q=related:F...
983,0,"JF Hoarau, M Goujon, F Rivière",Compared economic and environnemental vulnerab...,2014,Tourism specialization and …,inria.hal.science,https://inria.hal.science/hal-01483903/,,69.0,2024-03-19 20:52:37,...,,,0.0,0.00,0.0,3.0,10.0,Compared economic and environnemental vulnerab...,,https://scholar.google.com/scholar?q=related:m...


In [3]:
# Initialize an empty list to store the rows
data = []

for index, row in df.iterrows():
    date = row["Year"]
    sentences = row["Title"]
    
    # Tokenize the content into sentences
    #sentences = sent_tokenize(sentences)

    # Enlever les caractères spéciaux, les espaces en trop et mettre en minuscule
    #sentences = [re.sub(r"[^a-zA-Z0-9\s]", "", sentence) for sentence in sentences]
    #sentences = [" ".join(sentence.split()) for sentence in sentences]
    sentences = sentences.lower()
    
    # Enlever les phrases qui ont moins de 16 mots
    #sentences = [sentence for sentence in sentences if len(word_tokenize(sentence)) > 16]

    # Enlever les phrases en langue étrangère
    #sentences = [sentence for sentence in sentences if nlp(sentence)._.language["language"] == "fr"]

    # Enlever les phrases qui ont plus de 10 caractères spéciaux
    #sentences = [sentence for sentence in sentences if len(re.findall(r"[^a-zA-Z0-9\s]", sentence)) < 10]

    # Enlever les phrases contenant "operator"
    #sentences = [sentence for sentence in sentences if "operator" not in sentence.lower()]

    # Enlever les " --" à l'intérieur des phrases
    #sentences = [sentence.replace(" --", "") for sentence in sentences]

    # Enlever les phrases en double
    #sentences = list(set(sentences))
    
    # Append each sentence with its date to the list
    data.extend([[date, sentences]])

# Create a new DataFrame from the accumulated data
df_dates_sentences = pd.DataFrame(data, columns=["Date", "Sentence"])

# Supprimer les phrases en double
df_dates_sentences = df_dates_sentences.drop_duplicates(subset=["Sentence"])

df_dates_sentences

Unnamed: 0,Date,Sentence
0,2018,do esg controversies matter for firm value? ev...
1,2008,les institutions créatrices de proximités: ins...
2,2017,csr performance and the value of cash holdings...
3,2013,job-shop based framework for simultaneous sche...
4,2011,the impact and importance of mandatory adoptio...
...,...,...
907,2021,quels sont les déterminants de l'acceptation d...
909,2013,les brevets sur les inventions biotechnologiqu...
910,2010,internationalization of large retailers and cr...
911,2014,compared economic and environnemental vulnerab...


In [4]:
# Print le nombre de mots par phrase en moyenne avec l'écart-type
print(f"Moyenne: {df_dates_sentences['Sentence'].str.split().str.len().mean()}")
print(f"Ecart-type: {df_dates_sentences['Sentence'].str.split().str.len().std()}")

dates = df_dates_sentences["Date"].tolist()
sentences = df_dates_sentences["Sentence"].tolist()

# Print the length of the dates and sentences
print(f"\nNumber of dates: {len(dates)}")
print(f"Number of sentences: {len(sentences)}")

Moyenne: 12.654462242562929
Ecart-type: 5.317378152875759

Number of dates: 874
Number of sentences: 874


In [5]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

In [6]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
"""
hdbscan_model = HDBSCAN(min_cluster_size=40, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="french", min_df=2, ngram_range=(1, 2))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=False)
"""

'\nhdbscan_model = HDBSCAN(min_cluster_size=40, metric=\'euclidean\', cluster_selection_method=\'eom\', prediction_data=True)\nvectorizer_model = CountVectorizer(stop_words="french", min_df=2, ngram_range=(1, 2))\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=False)\n'

In [7]:
# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5 Turbo
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

En se basant sur les information ci-dessus, extrais un titre très descriptif du topic en français de maximum 7 mots. C'est un topic qui doit représenter le sujet des recherche mener dans un laboratoire en sciences de gestion. Make sure it is in the following format:
topic: <topic label>
"""

client = openai.OpenAI(api_key=openai_api_key)
openai_model = OpenAI(client,
                       model="gpt-4",
                       exponential_backoff=True, 
                       chat=True, 
                       nr_docs=5,
                       prompt=prompt)

# Création du modèle de topic modeling
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model
}

In [8]:
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  #hdbscan_model=hdbscan_model,
  #vectorizer_model=vectorizer_model,
  representation_model=representation_model,
  #ctfidf_model=ctfidf_model,

  # Hyperparameters
  #language="french",
  #top_n_words=10,
  #min_topic_size=70,
  verbose=True
)

topics, probs = topic_model.fit_transform(sentences, embeddings)

2024-03-26 13:09:43,925 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-26 13:09:50,115 - BERTopic - Dimensionality - Completed ✓
2024-03-26 13:09:50,117 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-26 13:09:50,142 - BERTopic - Cluster - Completed ✓
2024-03-26 13:09:50,145 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 22/22 [00:34<00:00,  1.56s/it]
2024-03-26 13:10:28,512 - BERTopic - Representation - Completed ✓


In [9]:
gpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
gpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(gpt_topic_labels)

In [10]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(metric='cosine', random_state=42).fit_transform(embeddings)

In [11]:
# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
fig = topic_model.visualize_documents(sentences,
                                reduced_embeddings=reduced_embeddings,
                                custom_labels=True)
fig

In [14]:
# On sauvagarde la fig html
fig.write_html("output/visualize_documents_all.html")

In [15]:
# Faire pour la période après 2013
df_dates_sentences = df_dates_sentences[df_dates_sentences["Date"] > 2013]

dates = df_dates_sentences["Date"].tolist()
sentences = df_dates_sentences["Sentence"].tolist()

# Pre-calculate embeddings
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

topics, probs = topic_model.fit_transform(sentences, embeddings)

gpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
gpt_topic_labels[-1] = "Outlier Topic"

topic_model.set_topic_labels(gpt_topic_labels)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
fig = topic_model.visualize_documents(sentences,
                                reduced_embeddings=reduced_embeddings,
                                custom_labels=True)

# On sauvagarde la fig html
fig.write_html("output/visualize_documents_after_2013.html")

# Faire pour la période après 2018
df_dates_sentences = df_dates_sentences[df_dates_sentences["Date"] > 2018]

dates = df_dates_sentences["Date"].tolist()
sentences = df_dates_sentences["Sentence"].tolist()

# Pre-calculate embeddings
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

topics, probs = topic_model.fit_transform(sentences, embeddings)

gpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
gpt_topic_labels[-1] = "Outlier Topic"

topic_model.set_topic_labels(gpt_topic_labels)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(metric='cosine', random_state=42).fit_transform(embeddings)

# Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts
fig = topic_model.visualize_documents(sentences,
                                reduced_embeddings=reduced_embeddings,
                                custom_labels=True)

# On sauvagarde la fig html
fig.write_html("output/visualize_documents_after_2018.html")

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

## Modélisation des topics dynamique

In [62]:
# La ligne suivante permet de lancer le DTM (Dynamic Topic Modeling) sur les données
topics_over_time = topic_model.topics_over_time(sentences, dates, evolution_tuning=True)

33it [00:00, 81.67it/s] 


In [68]:
# Comme il s'agit d'un plotly, nous pouvons également tracer les étiquettes des sujets. (limiter de 2013 à 2023)
fig_dtm = topic_model.visualize_topics_over_time(topics_over_time, custom_labels=True, normalize_frequency=True)
fig_dtm.update_xaxes(range=[2013, 2023])
fig_dtm.write_html("output/visualize_topics_over_time.html")
fig_dtm