Imports

In [41]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import spacy
from sentence_transformers import SentenceTransformer
import torch
from bertopic import BERTopic
import umap

import plotly.express as px


Load data

In [17]:
data = pd.read_json("data/response_NTNE.json")
print(data.shape)
print(data.columns)

(1052, 48)
Index(['score', 'temperature', 'id', 'lastModification', 'publicationDate',
       'mainJob', 'jobs', 'contentLanguage', 'locations', 'contractTypes',
       'jobType', 'skills', 'title', 'highlight', 'description', 'metaTitle',
       'metaDescription', 'profileDescription', 'reference',
       'externalReference', 'companyDescription', 'permanent', 'mobility',
       'drivingLicence', 'company', 'locality', 'status', 'pushedContractType',
       'experienceInJobRequired', 'recruiter', 'publisher', 'anonymous', 'url',
       'salary', 'labels', 'focus', 'freeOffer', 'offerPremium',
       'jobCategories', 'language', 'top', 'benefits', 'unknownJob',
       'contractPeriod', 'availabilityDate', 'availabilityEndDate',
       'organizationUnit', 'publicationPartners'],
      dtype='object')


## Clean data (html to string)

In [18]:
def clean_html(html_text:str):
    soup = BeautifulSoup(html_text, "html.parser")
    text = soup.get_text(separator=" ")
    # Normalize whitespace
    text = " ".join(text.split())
    return text

data["description_clean"] = data["description"].apply(clean_html)

## Chunk the desciptions

In [19]:
def chunk_text(nlp:spacy.Language, text:str, max_tokens=200):
    doc = nlp(text)
    chunks = []
    chunk = []
    length = 0
    
    for sentence in doc.sents:
        # Filter out stopwords, punctuation, numbers, and short tokens
        filtered_tokens = [
            token.lemma_.lower()
            for token in sentence
            if not token.is_stop
            and not token.is_punct
            and not token.like_num
            and len(token) > 2
        ]
        
        if not filtered_tokens:
            continue
        
        sent_tokens = len(filtered_tokens)
        # If adding this sentence exceeds max_tokens, save current chunk
        if length + sent_tokens > max_tokens and chunk:
            chunks.append(" ".join(chunk))
            chunk = []
            length = 0
        chunk.append(" ".join(filtered_tokens))
        length += sent_tokens
    # Add remaining chunk
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

In [20]:
nlp = spacy.load("fr_core_news_sm")
chunks_list = data["description_clean"].apply(lambda text: chunk_text(nlp, text)).to_list()

In [21]:
print(len(chunks_list))
print(chunks_list[0][0])

1052
futur équipe rejoignez équipe digital assystem responsabilité carole responsable activité data owner jouer rôle cler gestion valorisation donnée sein pôle digital transformation services accompagner client projet lier gouvernance donnée évolution produit digital business intelligence garantir qualité cohérence accessibilité donnée utilisateur métier principal mission gouvernance donnée assurer qualité intégrité accessibilité donnée enrichir dictionnaire entreprise appliquer règle standard modélisation donnée identifier modéliser implémenter objet métier domaine évoluer structurer métamodèle megahopex qualité donnée définir règle standard qualité donnée responsable domaine veiller implémentation exposition donnée disponible donnée périmètre technologie api flux digital pass collaboration animation organiser atelier utilisateur accompagner équipe métier gestion usage donnée grâce expertise garantir gestion optimal donnée contribuez activemer mise valeur améliorer performance transfo

## Compute embeddings

Load a model and compute embeddings

In [22]:
# Choose a model - all-MiniLM-L6-v2 is fast and effective
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Flatten chunks for batching, while keeping track of indices
flat_chunks = [chunk for doc_chunks in chunks_list for chunk in doc_chunks]

# Compute embeddings
chunk_embeddings = model.encode(flat_chunks, batch_size=32, convert_to_tensor=True, show_progress_bar=True)

Batches: 100%|██████████| 54/54 [00:02<00:00, 19.10it/s]


In [23]:
print(len(chunk_embeddings))

1701


Aggregate chunk embeddings per job offer

In [24]:
job_embeddings = []
idx = 0
for doc_chunks in chunks_list:
    num_chunks = len(doc_chunks)
    doc_emb = chunk_embeddings[idx:idx + num_chunks].mean(dim=0)
    job_embeddings.append(doc_emb)
    idx += num_chunks

# Convert to numpy arrays
job_embeddings = torch.stack(job_embeddings).cpu().numpy()

In [25]:
print(len(job_embeddings))
print(len(job_embeddings[0]))

1052
384


## Extract topics and cluster embeddings

In [26]:
descriptions = [" ".join(chunks) for chunks in chunks_list]

topic_model = BERTopic(language="french")
topics, probs = topic_model.fit_transform(documents=descriptions,
                                          embeddings=job_embeddings)

In [27]:
topic_model.get_topic_info() 

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,367,-1_projet_équipe_client_technique,"[projet, équipe, client, technique, donnée, so...",[poste proposer ingénieur commercial pme sein ...
1,0,96,0_embarquer_design_rattachement_situer,"[embarquer, design, rattachement, situer, élec...",[souhaite intégrer entreprise dynamique reconn...
2,1,51,1_data_donnée_dater_solution,"[data, donnée, dater, solution, probayes, prob...",[probayes vocation proposer client solution in...
3,2,48,2_gestion_direction_donnée_pilotage,"[gestion, direction, donnée, pilotage, mission...",[mission principal piloter développer activité...
4,3,42,3_électrique_production_aventech_électromécanique,"[électrique, production, aventech, électroméca...",[aventech pme collaborateur être expert concep...
5,4,37,4_équipe_projet_and_entreprise,"[équipe, projet, and, entreprise, expérience, ...",[bon business manager ingénieur formation sein...
6,5,34,5_entreprise_hara_france_rse,"[entreprise, hara, france, rse, conseil, consu...",[description entreprise hara consulting commun...
7,6,31,6_logiciel_technique_industriel_projet,"[logiciel, technique, industriel, projet, équi...",[avoir envier jouer rôle cler qualité solution...
8,7,31,7_steria_sopra_microsoft_entreprise,"[steria, sopra, microsoft, entreprise, tech, e...",[organisateur forum recrutement talents handic...
9,8,27,8_technique_prime_rattachement_situer,"[technique, prime, rattachement, situer, logic...",[soc architect design leader jouer rôle cler d...


## Visualize document gavitating around topics

In [32]:
doc_info = topic_model.get_document_info(descriptions)
# Contains columns: "Document", "Topic", "Probability", etc.

doc_embeddings = np.vstack(job_embeddings)
topic_embeddings = topic_model.topic_embeddings_  # shape: (num_topics, dim)

Reduce to 2 dimensions

In [None]:
# TODO: Peut etre utiliser des auto-encoder

combined_embeddings = np.vstack([topic_embeddings, doc_embeddings])
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
coords = reducer.fit_transform(combined_embeddings)

num_topics = topic_embeddings.shape[0]
topic_coords = coords[:num_topics]
doc_coords = coords[num_topics:]

Build dataframe for plotting

In [50]:
topic_info = topic_model.get_topic_info()
topic_label_map = dict(zip(topic_info["Topic"], topic_info["Name"]))

topic_df = pd.DataFrame({
    "x": topic_coords[:, 0],
    "y": topic_coords[:, 1],
    "type": "topic",
    "topic_id": range(num_topics),
    "label": [topic_label_map.get(i, f"Topic {i}") for i in range(num_topics)]
})

doc_df = pd.DataFrame({
    "x": doc_coords[:, 0],
    "y": doc_coords[:, 1],
    "type": "document",
    "topic_id": doc_info["Topic"].values,
    "probability": doc_info["Probability"].values,
    "text": doc_info["Document"].values
})

Plot

In [51]:
fig = px.scatter(
    pd.concat([topic_df, doc_df], ignore_index=True),
    x="x", y="y",
    color="type",
    hover_data=["label", "topic_id", "probability", "text"],
    symbol="type"
)
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed