
Installation des packages
---



In [1]:
!pip install requests pandas tqdm nltk bertopic plotly -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports et configuration

In [2]:
import requests
import pandas as pd
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from bertopic import BERTopic
import plotly.graph_objects as go
import plotly.express as px

# Téléchargements NLTK
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

print("Imports terminés!")

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


Imports terminés!


# Extraction des données

In [3]:
base_url = "https://api.openalex.org/works"
all_works = []

# PARAMÈTRES À AJUSTER SI BESOIN
start_year = 2021
end_year = 2025
per_page = 200
max_results = 2000

# Filtre temporel
date_filter = f"from_publication_date:{start_year}-01-01,to_publication_date:{end_year}-12-31"
max_pages = (max_results // per_page) + 1

print(f"Extraction de {start_year} à {end_year}...")

for page in tqdm(range(1, max_pages + 1), desc="Extraction"):
    params = {
        "filter": date_filter,
        "per-page": per_page,
        "page": page,
        "sort": "cited_by_count:desc"
    }

    response = requests.get(base_url, params=params, timeout=30)

    if response.status_code == 200:
        data = response.json()
        results = data.get("results", [])

        if not results:
            break

        all_works.extend(results)

        if len(all_works) >= max_results:
            all_works = all_works[:max_results]
            break
    else:
        print(f"Erreur: {response.status_code}")
        break

print(f"{len(all_works)} articles extraits!")


Extraction de 2021 à 2025...


Extraction:   0%|          | 0/11 [00:00<?, ?it/s]

2000 articles extraits!


# Création du DataFrame

In [4]:
def reconstruct_abstract(inv_index):
    if not inv_index:
        return ""
    words = []
    for word, positions in inv_index.items():
        for pos in positions:
            words.append((pos, word))
    words.sort()
    return " ".join([w for _, w in words])

print("Création du DataFrame...")

df = pd.DataFrame([{
    "title": w.get("display_name", ""),
    "abstract": reconstruct_abstract(w.get("abstract_inverted_index", {})),
    "publication_date": w.get("publication_date", ""),
    "doi": w.get("doi", ""),
    "concepts": [c["display_name"] for c in w.get("concepts", [])],
    "authors": [a["author"]["display_name"] for a in w.get("authorships", [])],
    "venue": w.get("host_venue", {}).get("display_name", ""),
    "cited_by_count": w.get("cited_by_count", 0),
    "open_access": w.get("open_access", {}).get("is_oa", False)
} for w in all_works])

# Extraire l'année
df['year'] = pd.to_datetime(df['publication_date'], errors='coerce').dt.year

print(f"DataFrame créé: {len(df)} lignes")
df.head()

Création du DataFrame...
DataFrame créé: 2000 lignes


Unnamed: 0,title,abstract,publication_date,doi,concepts,authors,venue,cited_by_count,open_access,year
0,Global Cancer Statistics 2020: GLOBOCAN Estima...,Abstract This article provides an update on th...,2021-02-04,https://doi.org/10.3322/caac.21660,"[Medicine, Cancer, Breast cancer, Skin cancer,...","[Hyuna Sung, Jacques Ferlay, Rebecca L. Siegel...",,107581,True,2021
1,The PRISMA 2020 statement: an updated guidelin...,The Preferred Reporting Items for Systematic r...,2021-03-29,https://doi.org/10.1136/bmj.n71,"[Checklist, Systematic review, Guideline, Stat...","[Matthew J. Page, Joanne E. McKenzie, Patrick ...",,76381,True,2021
2,MizAR 60 for Mizar 50,"As a present to Mizar on its 50th anniversary,...",2023-01-01,https://doi.org/10.4230/lipics.itp.2023.19,"[Computer science, Machine translation, Transf...","[Ashish Vaswani, Noam Shazeer, Niki Parmar, Ja...",,70225,True,2023
3,,"Designing complex, dynamic yet multi-functiona...",2022-01-01,https://doi.org/10.4230/lipics.dna.28.4,"[Regret, Computer science, Mathematical optimi...","[Diederik P. Kingma, Jimmy Ba]",,49593,True,2022
4,EMBI,Requirements are an integral part of industry ...,2024-06-27,https://doi.org/10.5281/zenodo.12561108,"[Transformer, Computer science, Training (mete...","[Jacob Devlin, Ming‐Wei Chang, Kenton Lee, Kri...",,44944,True,2024


# Nettoyage du texte

In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words
             if w not in stop_words and len(w) > 2]
    return " ".join(words)

print(" Nettoyage du texte...")

df['text'] = (df['title'].fillna('') + ' ' + df['abstract'].fillna('')).apply(clean_text)
df = df[df['text'].str.len() > 50].reset_index(drop=True)

print(f"{len(df)} articles avec texte valide")
df[['title', 'year', 'text']].head()

🧹 Nettoyage du texte...
1605 articles avec texte valide


Unnamed: 0,title,year,text
0,Global Cancer Statistics 2020: GLOBOCAN Estima...,2021,global cancer statistic globocan estimate inci...
1,The PRISMA 2020 statement: an updated guidelin...,2021,prisma statement updated guideline reporting s...
2,MizAR 60 for Mizar 50,2023,mizar mizar present mizar anniversary develop ...
3,,2022,designing complex dynamic yet multifunctional ...
4,EMBI,2024,embi requirement integral part industry operat...


# Topic Modeling

In [6]:
print("Détection des topics...")

texts = df['text'].tolist()

topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    min_topic_size=10,
    nr_topics=8,
    verbose=False
)

topics, probs = topic_model.fit_transform(texts)

df['topic'] = topics
df['topic_prob'] = [max(prob) for prob in probs]

# Statistiques
n_topics = len(df[df['topic'] != -1]['topic'].unique())
print(f"{n_topics} topics identifiés!")

# Afficher les topics
topic_info = topic_model.get_topic_info()
print("\ Topics principaux:")
for _, row in topic_info.head(10).iterrows():
    if row['Topic'] != -1:
        keywords = ', '.join(row['Representation'][:5])
        print(f"\nTopic {row['Topic']}: {row['Name']}")
        print(f"  Mots-clés: {keywords}")
        print(f"  Articles: {row['Count']}")


  print("\ Topics principaux:")


Détection des topics...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

7 topics identifiés!
\ Topics principaux:

Topic 0: 0_model_image_learning_network
  Mots-clés: model, image, learning, network, task
  Articles: 286

Topic 1: 1_disease_patient_covid_health
  Mots-clés: disease, patient, covid, health, evidence
  Articles: 273

Topic 2: 2_research_chatgpt_review_climate
  Mots-clés: research, chatgpt, review, climate, study
  Articles: 236

Topic 3: 3_cancer_cell_death_incidence
  Mots-clés: cancer, cell, death, incidence, disease
  Articles: 148

Topic 4: 4_material_quantum_property_application
  Mots-clés: material, quantum, property, application, battery
  Articles: 116

Topic 5: 5_protein_genome_sequence_gene
  Mots-clés: protein, genome, sequence, gene, database
  Articles: 91

Topic 6: 6_mmlmrow_data_mmlmnmmlmn_mmlmommlmo
  Mots-clés: mmlmrow, data, mmlmnmmlmn, mmlmommlmo, xmlnshttpwwwworgmathmathml
  Articles: 21


# Analyse des tendances

In [7]:
print("\n Analyse des tendances temporelles...")

# Créer le pivot des tendances
trend_df = df[df['topic'] != -1].groupby(['year', 'topic']).size().reset_index(name='count')
trend_pivot = trend_df.pivot(index='year', columns='topic', values='count').fillna(0)

print(f" Données de {len(trend_pivot)} années")
trend_pivot


 Analyse des tendances temporelles...
 Données de 5 années


topic,0,1,2,3,4,5,6
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021,132.0,166.0,96.0,84.0,79.0,49.0,7.0
2022,73.0,68.0,45.0,44.0,25.0,26.0,8.0
2023,34.0,23.0,57.0,13.0,6.0,12.0,4.0
2024,38.0,15.0,24.0,4.0,3.0,4.0,2.0
2025,9.0,1.0,14.0,3.0,3.0,0.0,0.0


# Visualisation

In [8]:
print("Création du graphique...")

# Top 5 topics
top_topics = df[df['topic'] != -1]['topic'].value_counts().head(5).index.tolist()
trend_plot = trend_pivot[top_topics]

# Créer le graphique
fig = go.Figure()

colors = px.colors.qualitative.Set2

for i, topic in enumerate(trend_plot.columns):
    fig.add_trace(go.Scatter(
        x=trend_plot.index,
        y=trend_plot[topic],
        mode='lines+markers',
        name=f'Topic {topic}',
        line=dict(width=2.5, color=colors[i % len(colors)]),
        marker=dict(size=8)
    ))

fig.update_layout(
    title='Tendances scientifiques émergentes (2021-2025)',
    xaxis_title='Année',
    yaxis_title='Nombre de publications',
    hovermode='x unified',
    template='plotly_white',
    height=600,
    legend=dict(orientation="v", yanchor="top", y=1, xanchor="left", x=1.02)
)

fig.show()

Création du graphique...


# Sauvegarde des résultats

In [9]:
output_file = 'openalex_trends_2021_2025.csv'

df_export = df[[
    'title', 'abstract', 'publication_date', 'year',
    'doi', 'cited_by_count', 'topic', 'topic_prob',
    'authors', 'venue', 'open_access'
]].copy()

df_export.to_csv(output_file, index=False)

print(f" Fichier sauvegardé: {output_file}")
print(f"   {len(df_export)} lignes exportées")
print("\n ANALYSE TERMINÉE!")

 Fichier sauvegardé: openalex_trends_2021_2025.csv
   1605 lignes exportées

 ANALYSE TERMINÉE!
