In [1]:
import sys
!{sys.executable} -m pip install sentence-transformers



In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
profiles_df = pd.read_csv("Data/profile_model.csv").fillna("")
offers_df = pd.read_csv("Data/offres_model.csv").fillna("")

In [None]:
df_offres = offers_df.copy()
df_offres['offre_id'] = df_offres.index  


In [5]:
df_candidats = profiles_df.copy()
df_candidats['candidat_id'] = df_candidats.index  


In [6]:
weights = {
    "Metier_regroupe": 0.20,
    "Points_forts": 0.15,
    "Compétence": 0.15,
    "Contrat": 0.15,
    "Experience_mois": 0.20,
    "Departement": 0.15
}

In [8]:
field_map = {
    "Metier_regroupe": "groupe_metier",
    "Points_forts": "stack_technique",
    "Compétence": "stack_technique",
    "Contrat": "Contrat",
    "Experience_mois": "experience_mois",
    "Departement": "Departement",
}


In [9]:
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [10]:
similarity_total = np.zeros((len(profiles_df), len(offers_df)))

In [11]:
for profile_field, weight in weights.items():
    
    offer_field = field_map[profile_field]
    profile_texts = profiles_df[profile_field].astype(str).tolist()
    offer_texts = offers_df[offer_field].astype(str).tolist()

    profile_embeddings = model.encode(profile_texts, show_progress_bar=True)
    offer_embeddings = model.encode(offer_texts, show_progress_bar=True)

    sim_matrix = cosine_similarity(profile_embeddings, offer_embeddings)

    similarity_total += weight * sim_matrix


Batches: 100%|██████████| 22/22 [00:02<00:00,  8.13it/s]
Batches: 100%|██████████| 70/70 [00:05<00:00, 13.42it/s]
Batches: 100%|██████████| 22/22 [00:07<00:00,  2.94it/s]
Batches: 100%|██████████| 70/70 [00:58<00:00,  1.20it/s]
Batches: 100%|██████████| 22/22 [00:18<00:00,  1.21it/s]
Batches: 100%|██████████| 70/70 [00:58<00:00,  1.19it/s]
Batches: 100%|██████████| 22/22 [00:01<00:00, 13.58it/s]
Batches: 100%|██████████| 70/70 [00:04<00:00, 15.70it/s]
Batches: 100%|██████████| 22/22 [00:01<00:00, 13.52it/s]
Batches: 100%|██████████| 70/70 [00:04<00:00, 16.47it/s]
Batches: 100%|██████████| 22/22 [00:01<00:00, 20.41it/s]
Batches: 100%|██████████| 70/70 [00:09<00:00,  7.18it/s]


In [12]:
best_matches = []
for i in range(similarity_total.shape[0]):
    best_offer_index = np.argmax(similarity_total[i])
    best_score = similarity_total[i, best_offer_index]
    best_matches.append((i, best_offer_index, best_score))

df_scores = best_matches

In [18]:
print(df_scores)
print(type(df_scores))  # Debug


       0     1         2
0      0  1016  0.805999
1      1   996  0.766454
2      2  1050  0.769109
3      3   860  0.785300
4      4  1226  0.739943
..   ...   ...       ...
684  684  1044  0.802468
685  685  2072  0.858333
686  686  1591  0.816042
687  687  1813  0.655739
688  688  1680  0.777547

[689 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>


In [26]:
df_scores = pd.DataFrame(best_matches, columns=['candidat_id', 'offre_id', 'score'])


In [28]:
df_scores = df_scores.merge(df_candidats, left_on='candidat_id', right_on='candidat_id', how='left')
df_scores = df_scores.merge(df_offres, on='offre_id', how='left', suffixes=('_candidat', '_offre'))


In [30]:

candidats_meilleures_offres = (
    df_scores
    .sort_values(by=['candidat_id', 'score'], ascending=[True, False])
    .groupby('candidat_id')
    .head(10)
)

In [36]:
candidats_meilleures_offres = candidats_meilleures_offres[
    ['candidat_id', 'Profil', 'Points_forts', 'Compétence', 'Expérience', 'Nom_poste','Contrat_offre', 'Description',
       'Experience', 'Entreprise', 'score']
]


In [37]:
print(candidats_meilleures_offres)

     candidat_id                                             Profil  \
0              0                                     Data Scientist   
1              1  Information Security Management | Digital Tran...   
2              2                            Actuaire / Data Analyst   
3              3                                      Data engineer   
4              4                                     data scientist   
..           ...                                                ...   
684          684                                    Actuaire Junior   
685          685                             Consultante financière   
686          686                                    Energy Engineer   
687          687                        Chef de projet informatique   
688          688        Developper intelligence artificielle junior   

                                          Points_forts  \
0    Application des méthodes de machine learning d...   
1    Adapter les outils de trai

In [None]:
offres_meilleurs_candidats = (
    df_scores
    .sort_values(by=['offre_id', 'score'], ascending=[True, False])
    .groupby('offre_id')
    .head(5)
)


In [35]:
offres_meilleurs_candidats = offres_meilleurs_candidats[
    ['offre_id', 'Nom_poste','Contrat_offre', 'Description',
       'Experience', 'Entreprise', 'candidat_id', 'Profil', 'Points_forts', 'Compétence', 'Expérience', 'score']
]


In [39]:
offres_meilleurs_candidats.to_excel("Data/offres_meilleurs_candidats.xlsx", index=False)
candidats_meilleures_offres.to_excel("Data/candidats_meilleures_offres.xlsx", index=False)
