Imports

In [165]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

import spacy
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, HDBSCAN
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

import plotly.express as px
import plotly.graph_objects as go

Load data

In [2]:
data = pd.read_json("data/response_NTNE.json")
print(data.shape)
print(data.columns)

(1052, 48)
Index(['score', 'temperature', 'id', 'lastModification', 'publicationDate',
       'mainJob', 'jobs', 'contentLanguage', 'locations', 'contractTypes',
       'jobType', 'skills', 'title', 'highlight', 'description', 'metaTitle',
       'metaDescription', 'profileDescription', 'reference',
       'externalReference', 'companyDescription', 'permanent', 'mobility',
       'drivingLicence', 'company', 'locality', 'status', 'pushedContractType',
       'experienceInJobRequired', 'recruiter', 'publisher', 'anonymous', 'url',
       'salary', 'labels', 'focus', 'freeOffer', 'offerPremium',
       'jobCategories', 'language', 'top', 'benefits', 'unknownJob',
       'contractPeriod', 'availabilityDate', 'availabilityEndDate',
       'organizationUnit', 'publicationPartners'],
      dtype='object')


## Clean descriptions (HTML to text)

In [3]:
def clean_html(html_text:str):
    soup = BeautifulSoup(html_text, "html.parser")
    text = soup.get_text(separator=" ")
    # Normalize whitespace
    text = " ".join(text.split())
    return text

corpus = data["description"].apply(clean_html)

## Create TF-IDF matrice

Create tokenizer

In [4]:
nlp = spacy.load("fr_core_news_sm", disable=["parser", "ner", "tagger"])

import nltk
nltk.download('stopwords')
englisg_stops = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marinnagy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def spacy_tokenizer(doc):
    """Tokenize with spaCy and keep only real tokens (no punctuation/space)."""
    doc = nlp(doc)
    return [token.lemma_.lower()           # or token.lemma_.lower() if you prefer lemmas
            for token in doc
            if token.is_alpha and not token.is_punct and not token.is_space and not token.like_num and token.lemma_.lower() not in englisg_stops and len(token) > 1]

Create vectorizer

In [6]:
vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,  # use spaCy
    token_pattern=None,         # disable default regex since we supply tokenizer
    min_df=3,                   # keep terms in ≥3 docs (tune to your corpus)
    max_df=0.5,                 # drop very common words
)

In [7]:
X = vectorizer.fit_transform(corpus)   # sparse TF–IDF matrix

In [8]:
X.shape

(1052, 4656)

## K-means Clustering

In [208]:
k = 5  # choose number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X)

In [209]:
sil = silhouette_score(X, labels)
sil

0.02755309428214567

In [210]:
df_clusters = pd.DataFrame({"text": corpus, "cluster": labels})
df_clusters.shape

(1052, 2)

> Inspect top terms per cluster

In [211]:
def top_terms_per_cluster(model, vectorizer, n_terms=10):
    terms = np.array(vectorizer.get_feature_names_out())
    for i, center in enumerate(model.cluster_centers_):
        top_idx = center.argsort()[::-1][:n_terms]
        print(f"Cluster {i}: {', '.join(terms[top_idx])}")

In [212]:
top_terms_per_cluster(kmeans, vectorizer, n_terms=8)

Cluster 0: team, insight, market, acros, project, électrique, group, technical
Cluster 1: commercial, cabinet, conseil, électrique, service, gestion, travail, collaborateur
Cluster 2: ia, cloud, architecture, logiciel, développeur, ci, application, donnée
Cluster 3: agence, rattachement, design, embarquer, elsys, situer, se, offre
Cluster 4: donnée, métier, bi, gestion, outil, analyse, power, assurer


## Reduce to 2 dimenssions

LSA to first reduce to 50 latent space

In [213]:
svd = TruncatedSVD(n_components=50, random_state=1)
X_latent = svd.fit_transform(X)

Scaling (improve PCA / UMAP stability)

In [214]:
X_latent = StandardScaler().fit_transform(X_latent)

In [215]:
X_2d = PCA(n_components=2, random_state=1).fit_transform(X_latent)

## Plot

In [216]:
plot_df = pd.DataFrame({
    "x": X_2d[:, 0],
    "y": X_2d[:, 1],
    "cluster": labels,
    "text": corpus
})

In [217]:
cluster_labels = {
    0: "Développeur",
    1: "Infirmier",
    2: "Chef de projet",
    3: "Commercial",
    4: "Test 1",
    5: "Test 2",
    6: "Test 3",
    7: "Test 4",
    8: "Test 5",
    9: "Test 6"
}
plot_df["topic_name"] = plot_df["cluster"].map(cluster_labels)

In [218]:
fig = px.scatter(
    plot_df,
    x="x", y="y",
    color="topic_name",
    hover_data={"text": True},
    title="Documents & topics (2D projection)"
)

# Add centroid labels
for cluster_id, group in plot_df.groupby("cluster"):
    cx, cy = group["x"].mean(), group["y"].mean()
    label = cluster_labels.get(cluster_id, f"Cluster {cluster_id}")
    fig.add_annotation(
        x=cx, y=cy,
        text=label,
        showarrow=False,
        font=dict(size=14, color="black"),
        bgcolor="rgba(255,255,255,0.7)"
    )

fig.show()

In [219]:
import plotly.express as px
import plotly.graph_objects as go


topic_names = [cluster_labels.get(i, f"Cluster {i}") for i in labels]

svd = TruncatedSVD(n_components=3, random_state=42)
X_3d = svd.fit_transform(X)

plot_df = pd.DataFrame({
    "x": X_3d[:, 0],
    "y": X_3d[:, 1],
    "z": X_3d[:, 2],
    "cluster": labels,
    "topic_name": topic_names,
    "text": corpus
})

# ------------------------------------------------------------------
# 4. Compute centroid for each cluster for labeling
# ------------------------------------------------------------------
centroids = (
    plot_df.groupby("cluster")[["x", "y", "z"]]
    .mean()
    .reset_index()
)
centroids["topic_name"] = centroids["cluster"].map(
    lambda cid: cluster_labels.get(cid, f"Cluster {cid}")
)

# ------------------------------------------------------------------
# 5. Plotly 3D scatter with topic labels
# ------------------------------------------------------------------
fig = px.scatter_3d(
    plot_df,
    x="x",
    y="y",
    z="z",
    color="topic_name",
    hover_data={"text": True, "cluster": True},
    title="Documents in 3D factorial space",
    opacity=0.75
)

# Add centroid labels
fig.add_trace(
    go.Scatter3d(
        x=centroids["x"],
        y=centroids["y"],
        z=centroids["z"],
        mode="text",
        text=centroids["topic_name"],
        textposition="top center",
        textfont=dict(size=14, color="black"),
        showlegend=False
    )
)

fig.update_layout(
    scene=dict(
        xaxis_title="Factor 1",
        yaxis_title="Factor 2",
        zaxis_title="Factor 3"
    ),
    legend_title="Topic"
)

fig.update_layout(
    height=900
)

fig.show()

Essayer clustering `DBSCAN` et `t-SNE` pour reduction de dimenssion

## Other approach
- TruncateSVD to reduce to 50 dimensions
- Kmeans clustering
- t-SNE to projet on 2 dimensions


#### Reduce to 50 dimensions

In [319]:
svd = TruncatedSVD(n_components=50, random_state=0)
normalizer = Normalizer(copy=False)

X_reduced = normalizer.fit_transform(svd.fit_transform(X))

k = 10  # k = min_samples
nbrs = NearestNeighbors(n_neighbors=k, metric='cosine').fit(X_reduced)
distances, _ = nbrs.kneighbors(X_reduced)
k_dist = np.sort(distances[:, -1])  # distance to the k-th neighbor

#### K-means clustering

In [320]:
kmeans = KMeans(n_clusters=10, random_state=42)
best_labels = kmeans.fit_predict(X_reduced)

#### Project on 2d space

In [321]:
tsne = TSNE(n_components=2, metric='cosine', init='pca', random_state=0, perplexity=30)
emb = tsne.fit_transform(X_reduced)

In [322]:
fig = px.scatter(
    x=emb[:,0], y=emb[:,1],
    color=best_labels.astype(str),
    title="t-SNE projection colored by DBSCAN clusters",
    labels={'x': 't-SNE 1', 'y': 't-SNE 2'}
)
fig.show()

In [323]:
tsne = TSNE(
    n_components=3,
    metric='cosine',
    init='pca',
    random_state=0,
    perplexity=30
)
emb_3d = tsne.fit_transform(X_reduced)

In [324]:
fig = px.scatter_3d(
    x=emb_3d[:, 0],
    y=emb_3d[:, 1],
    z=emb_3d[:, 2],
    color=best_labels.astype(str),
    symbol=np.where(best_labels == -1, 'circle-open', 'circle'),
    title="3D t-SNE projection colored by DBSCAN clusters",
    labels={'x': 't-SNE 1', 'y': 't-SNE 2', 'z': 't-SNE 3'},
    opacity=0.8
)
fig.update_traces(marker=dict(size=4))

fig.update_layout(
    height=900
)

fig.show()