# Imports

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import spacy
from sklearn.preprocessing import normalize
from collections import Counter
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap

import plotly.express as px

In [2]:
# Read the JSON export into a DataFrame, then show its dimensions and column labels.
data = pd.read_json("data/response_NTNE.json")
print(data.shape, data.columns, sep="\n")

(1052, 48)
Index(['score', 'temperature', 'id', 'lastModification', 'publicationDate',
       'mainJob', 'jobs', 'contentLanguage', 'locations', 'contractTypes',
       'jobType', 'skills', 'title', 'highlight', 'description', 'metaTitle',
       'metaDescription', 'profileDescription', 'reference',
       'externalReference', 'companyDescription', 'permanent', 'mobility',
       'drivingLicence', 'company', 'locality', 'status', 'pushedContractType',
       'experienceInJobRequired', 'recruiter', 'publisher', 'anonymous', 'url',
       'salary', 'labels', 'focus', 'freeOffer', 'offerPremium',
       'jobCategories', 'language', 'top', 'benefits', 'unknownJob',
       'contractPeriod', 'availabilityDate', 'availabilityEndDate',
       'organizationUnit', 'publicationPartners'],
      dtype='object')


In [3]:
def clean_html(html_text: str) -> str:
    """Strip HTML markup from a string and collapse runs of whitespace.

    Parameters
    ----------
    html_text : str
        HTML fragment to clean. ``None`` and float ``NaN`` (the markers pandas
        uses for missing values) are accepted and treated as empty text.

    Returns
    -------
    str
        The text content with tags removed, text nodes joined by a single
        space and all whitespace runs reduced to one space. ``""`` for a
        missing value.
    """
    # Missing value guard: NaN is the only float that is not equal to itself.
    # Without it, BeautifulSoup raises on None/NaN and aborts the whole `.apply`.
    if html_text is None or (isinstance(html_text, float) and html_text != html_text):
        return ""

    soup = BeautifulSoup(html_text, "html.parser")
    # A space separator keeps words from adjacent tags from being glued together.
    text = soup.get_text(separator=" ")
    # Normalize whitespace (newlines, tabs, repeated spaces -> single space).
    return " ".join(text.split())

# Build the text corpus: apply clean_html to every value of the "description" column.
corpus = data["description"].apply(clean_html)

In [4]:
# Load the `fr_core_news_sm` spaCy pipeline with the parser, NER and tagger
# components disabled by name.
# NOTE(review): presumably done for speed, since only token-level flags are read later — confirm.
nlp = spacy.load("fr_core_news_sm", disable=["parser", "ner", "tagger"])

In [18]:
def tokenize(doc):
    """Run ``doc`` through the module-level spaCy pipeline and return lowercased word tokens.

    A token is kept only when it is purely alphabetic, is not flagged as
    punctuation, whitespace or number-like, and is longer than one character.
    """
    kept = []
    for tok in nlp(doc):
        # Same filter as a single boolean chain, written as guard clauses.
        if not tok.is_alpha or tok.is_punct or tok.is_space or tok.like_num:
            continue
        if len(tok) <= 1:
            continue
        # Use tok.lemma_.lower() here instead to work on lemmas.
        kept.append(tok.text.lower())
    return kept

In [19]:
# Tokenize every document, then collect the distinct tokens into a sorted vocabulary.
docs_tokenized = list(map(tokenize, corpus))
lexicon = sorted({token for tokens in docs_tokenized for token in tokens})

# Term Frequency (TF)

In [20]:
# Term-frequency matrix: one row per document, one column per lexicon term.
# Values are written into a plain NumPy array and wrapped in a DataFrame once;
# this avoids one pandas scalar write (`.at`) per (document, term) pair.
term_to_col = {term: col for col, term in enumerate(lexicon)}
tf_matrix = np.zeros((len(corpus), len(lexicon)), dtype=float)

for idx, tokens in enumerate(docs_tokenized):
    bag = Counter(tokens)
    doc_len = len(tokens)
    # A document without tokens has an empty bag, so the division below never sees doc_len == 0.
    for term, count in bag.items():
        tf_matrix[idx, term_to_col[term]] = count / doc_len  # classic TF = count / document length

df_tf = pd.DataFrame(tf_matrix, index=range(len(corpus)), columns=lexicon)

In [21]:
# Sanity check: df_tf has one row per document and one column per lexicon term.
print(df_tf.shape)
df_tf.head()

(1052, 12657)


Unnamed: 0,aacsb,aad,aah,abb,ability,abiotiques,abl,abondement,abondée,abonnement,...,êtes,être,île,îlot,œil,œuvre,œuvrent,œuvres,œuvrez,œuvrons
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000918,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.001837,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Inverse Document Frequency (IDF)

In [22]:
num_documents = len(corpus)

# Document frequency: each term is counted at most once per document,
# because every token list is reduced to a set before counting.
doc_freq = Counter(
    term
    for doc_tokens in docs_tokenized
    for term in set(doc_tokens)
)

# Smoothed IDF: ln((1 + N) / (1 + df(t))) + 1, evaluated for every lexicon term.
idf_values = {
    term: 1 + np.log((num_documents + 1) / (doc_freq[term] + 1))
    for term in lexicon
}
df_idf = pd.Series(idf_values)

In [23]:
# Sanity check: df_idf holds one IDF value per lexicon term.
print(df_idf.shape)
df_idf.head()

(12657,)


aacsb      6.860786
aad        7.266251
aah        6.860786
abb        7.266251
ability    6.860786
dtype: float64

# TF-IDF matrix and L2 normalization

In [26]:
# Weight every term-frequency column by that term's IDF
# (the Series is aligned on the column labels).
df_tf_idf = df_tf.multiply(df_idf, axis=1)

# Scale each document row to unit Euclidean (L2) length.
# Values are kept at full precision: rounding the stored matrix would break
# the unit norm and turn every weight below 0.005 into 0. Round only when
# displaying, e.g. `df_tf_idf_norm.round(2).head()`.
df_tf_idf_norm = pd.DataFrame(
    data=normalize(df_tf_idf.to_numpy(), norm="l2"),
    index=df_tf_idf.index,      # keep the document index explicit
    columns=df_tf_idf.columns,  # labels taken from the matrix actually normalized
)

In [27]:
# Inspect the TF-IDF matrix before normalization (df_tf_idf);
# the L2-normalized version is stored separately in df_tf_idf_norm.
print(df_tf_idf.shape)
df_tf_idf.head()

(1052, 12657)


Unnamed: 0,aacsb,aad,aah,abb,ability,abiotiques,abl,abondement,abondée,abonnement,...,êtes,être,île,îlot,œil,œuvre,œuvrent,œuvres,œuvrez,œuvrons
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005831,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004492,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
