In [7]:
import sys
!{sys.executable} -m pip install sentence-transformers





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

Chargement des données

In [10]:
# Read the candidate profiles (CSV) and the job offers (Excel), then replace
# every missing cell with an empty string so each field can later be handled
# as text.
profiles_raw = pd.read_csv("Data/profile_sans_doublons_2.csv")
offers_raw = pd.read_excel("Data/df_clean_offres.xlsx")

profiles_df = profiles_raw.fillna("")
offers_df = offers_raw.fillna("")

Configuration des poids par champ : Pondération des champs (selon importance pour le matching)

In [11]:
# Weight given to each profile field when the per-field similarities are
# combined into a single matching score (keys are profile column names).
weights = {
    "Metier_regroupe": 0.35,
    "Points_forts": 0.25,
    "Compétence": 0.25,
    "Contrat": 0.05,
    "Expérience": 0.10
}

# The values above add up to 1, which keeps the weighted sum on the same scale
# as a single cosine similarity. Fail fast if a later edit breaks that.
assert abs(sum(weights.values()) - 1.0) < 1e-9, "field weights must sum to 1"

Correspondances entre colonnes des profils et des offres

In [12]:
# Profile column -> offer column it is compared against.
# Both "Points_forts" and "Compétence" are matched against the offers'
# "stack_technique" column.
field_map = dict([
    ("Metier_regroupe", "groupe_metier"),
    ("Points_forts", "stack_technique"),
    ("Compétence", "stack_technique"),
    ("Contrat", "Contrat"),
    ("Expérience", "Experience"),
])

Chargement du modèle SBERT : adapté pour les textes en français

In [13]:
# Name of the pretrained Sentence-Transformers checkpoint used to embed every
# text field of the profiles and the offers.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(MODEL_NAME)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Calcul des embeddings + similarités pondérées

In [14]:
# Final similarity matrix: one row per profile, one column per offer,
# initialised to zero before the per-field scores are accumulated into it.
n_profiles, n_offers = len(profiles_df), len(offers_df)
similarity_total = np.zeros((n_profiles, n_offers))

In [None]:
# Build the weighted profile x offer similarity matrix, one field at a time.
#
# The accumulator is (re)created in this cell so that running the cell again
# on its own does not add the weighted scores on top of a previous run.
similarity_total = np.zeros((len(profiles_df), len(offers_df)))

# Offer-side embeddings, keyed by offer column. field_map sends more than one
# profile field to the same offer column, so without this cache that column
# would be encoded once per profile field that points at it.
offer_embeddings_by_column = {}

# NOTE(review): missing values were replaced by "" when the data was loaded;
# two blank texts presumably get (near-)identical embeddings and therefore a
# similarity close to 1 - confirm whether blank fields should count as a match.
for profile_field, weight in weights.items():
    offer_field = field_map[profile_field]

    # Profile side: embed this field for every profile.
    profile_texts = profiles_df[profile_field].astype(str).tolist()
    profile_embeddings = model.encode(profile_texts, show_progress_bar=True)

    # Offer side: embed the matching column only the first time it is needed.
    if offer_field not in offer_embeddings_by_column:
        offer_texts = offers_df[offer_field].astype(str).tolist()
        offer_embeddings_by_column[offer_field] = model.encode(
            offer_texts, show_progress_bar=True
        )
    offer_embeddings = offer_embeddings_by_column[offer_field]

    # Pairwise cosine similarity between every profile and every offer.
    sim_matrix = cosine_similarity(profile_embeddings, offer_embeddings)

    # Add this field's weighted contribution to the final matrix.
    similarity_total += weight * sim_matrix


Batches: 100%|██████████| 22/22 [00:01<00:00, 12.15it/s]
Batches: 100%|██████████| 70/70 [00:04<00:00, 17.27it/s]
Batches: 100%|██████████| 22/22 [00:04<00:00,  5.46it/s]
Batches: 100%|██████████| 70/70 [00:29<00:00,  2.34it/s]
Batches: 100%|██████████| 22/22 [00:08<00:00,  2.53it/s]
Batches: 100%|██████████| 70/70 [00:28<00:00,  2.48it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 24.01it/s]
Batches: 100%|██████████| 70/70 [00:04<00:00, 14.27it/s]
Batches: 100%|██████████| 22/22 [00:03<00:00,  5.64it/s]
Batches: 100%|██████████| 70/70 [00:06<00:00, 11.58it/s]


Extraction du Top 50 des paires les plus similaires

Pour chaque profil, on ne garde que l'offre ayant le meilleur score, afin d'éviter la redondance.

In [19]:
# For every profile (row of the similarity matrix), keep only the offer with
# the highest combined score, as a (profile position, offer position, score)
# tuple.
best_offer_positions = [np.argmax(scores) for scores in similarity_total]
best_matches = [
    (profile_pos, offer_pos, similarity_total[profile_pos, offer_pos])
    for profile_pos, offer_pos in enumerate(best_offer_positions)
]

# Rank the per-profile best matches by descending score and keep the first 50.
ranked_matches = sorted(best_matches, key=lambda match: match[2], reverse=True)
top_50_unique = ranked_matches[:50]


Affichage des résultats

In [20]:
# Print the ten highest-scoring (profile, offer) pairs. The profile title is
# cut to its first 60 characters and always followed by "...".
print("\nTop 10 Paires Offre/Profil (SBERT Pondéré, sans doublon profil) :\n")
for profile_pos, offer_pos, weighted_score in top_50_unique[:10]:
    profile_title = profiles_df.iloc[profile_pos]['Profil'][:60]
    offer_title = offers_df.iloc[offer_pos]['Nom_poste']
    print(
        f"Profil : {profile_title}...",
        f"Offre  : {offer_title}",
        f"Score pondéré SBERT : {round(weighted_score, 3)}",
        "---",
        sep="\n",
    )


Top 10 Paires Offre/Profil (SBERT Pondéré, sans doublon profil) :

Profil : DATA ANALYST...
Offre  : Data analyst (H/F)
Score pondéré SBERT : 0.93
---
Profil : chargé d'étude statistique/Data scientist...
Offre  : Data Scientist H/F
Score pondéré SBERT : 0.817
---
Profil : Data Analyst / Développeur BI...
Offre  : Data analyste informatique (IT) / Freelance (H/F)
Score pondéré SBERT : 0.8
---
Profil : Data Analyst...
Offre  : Data Analyst F/H - Système, réseaux, données (H/F)
Score pondéré SBERT : 0.787
---
Profil : A la recherche d'un emploi  CDI en Actuariat -Data Science...
Offre  : INGENIEUR GENIE ELECTRIQUE CFA - ACTIVITE DATA CENTER (H/F)
Score pondéré SBERT : 0.78
---
Profil : DATA SCIENTIST...
Offre  : Data Scientist H/F
Score pondéré SBERT : 0.777
---
Profil : Data analyst/ Data Engineer...
Offre  : Data analyst (H/F)
Score pondéré SBERT : 0.776
---
Profil : DATA SCIENTIST...
Offre  : #SALONDEMANDELIEU2024 : Machine Learning Engineer  (H/F)
Score pondéré SBERT : 0.775
---
Pro

Export des Top 50 correspondances complètes

In [21]:
# Export the top matches to Excel, one row per (profile, offer) pair, with the
# compared fields from both sides and the rounded combined score.
profile_columns = ['Profil', 'Metier_regroupe', 'Points_forts', 'Compétence', 'Contrat', 'Expérience']
offer_columns = ['Nom_poste', 'groupe_metier', 'stack_technique', 'Contrat', 'Experience']

# Column names present on both sides. Merging the two dicts as-is would let the
# offer value overwrite the profile value for these keys, so they are exported
# with a "_profil" / "_offre" suffix instead.
shared_columns = set(profile_columns) & set(offer_columns)

export_rows = []
for i, j, score in top_50_unique:
    # i and j are row positions in the similarity matrix, hence .iloc (the same
    # positional lookup the display cell uses) rather than label-based .loc.
    profile_data = profiles_df.iloc[i][profile_columns].to_dict()
    offer_data = offers_df.iloc[j][offer_columns].to_dict()

    row = {}
    for column, value in profile_data.items():
        row[f"{column}_profil" if column in shared_columns else column] = value
    for column, value in offer_data.items():
        row[f"{column}_offre" if column in shared_columns else column] = value
    row["Score_SBERT"] = round(score, 3)
    export_rows.append(row)

result_df = pd.DataFrame(export_rows)
result_df.to_excel("Data/Top_50_Match_Profils_Offres_Uniques_SBERT.xlsx", index=False)