In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gradio as gr

In [12]:
# Load the job postings, keep the two columns used here, and drop incomplete rows.
fichier_csv = "fake_job_postings 2.csv"
df = pd.read_csv(fichier_csv)[['description', 'required_experience']].dropna()

# Encode the experience level as a categorical once, then reuse it for both the
# integer class codes and the code -> label lookup table.
experience_cat = df['required_experience'].astype('category')
df['etiquette'] = experience_cat.cat.codes
mapping_etiquettes = dict(enumerate(experience_cat.cat.categories))

In [13]:
# Display the integer-code -> experience-level lookup table.
mapping_etiquettes

{0: 'Associate',
 1: 'Director',
 2: 'Entry level',
 3: 'Executive',
 4: 'Internship',
 5: 'Mid-Senior level',
 6: 'Not Applicable'}

In [15]:
# BERT tokenization: load the pre-trained 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ExperienceDataset(Dataset):
    """Torch dataset pairing tokenized job descriptions with integer labels.

    All texts are tokenized once, in the constructor; ``__getitem__`` only
    wraps the pre-computed encodings in tensors.

    Args:
        textes: Job-description strings, passed to ``tokenizer`` as one batch.
        etiquettes: One integer class label per text (indexable, supports ``len``).
        tokenizer: Callable invoked as ``tokenizer(textes, truncation=True,
            padding=True, max_length=longueur_max)``; its result must expose
            ``'input_ids'`` and ``'attention_mask'``, each indexable by sample.
        longueur_max: ``max_length`` value handed to the tokenizer.

    Raises:
        ValueError: If the number of encoded texts differs from the number of labels.
    """

    def __init__(self, textes, etiquettes, tokenizer, longueur_max=256):
        self.encodages = tokenizer(textes, truncation=True, padding=True, max_length=longueur_max)
        self.etiquettes = etiquettes

        # Fail early on misaligned inputs instead of raising an IndexError (or
        # silently dropping samples) in the middle of training.
        nb_textes = len(self.encodages['input_ids'])
        if nb_textes != len(self.etiquettes):
            raise ValueError(
                f"Nombre de textes encodés ({nb_textes}) différent du nombre "
                f"d'étiquettes ({len(self.etiquettes)})."
            )

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` under the keys the model call expects."""
        return {
            'input_ids': torch.tensor(self.encodages['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodages['attention_mask'][idx]),
            # Force int64: without an explicit dtype the tensor inherits the
            # label's own type (e.g. int8 from numpy codes), which the
            # classification loss does not accept as class indices.
            'labels': torch.tensor(self.etiquettes[idx], dtype=torch.long)
        }

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.etiquettes)

In [16]:
# Stratified train/test split (20% held out) with a fixed random seed.
(textes_train, textes_test,
 etiquettes_train, etiquettes_test) = train_test_split(
    df['description'].tolist(),
    df['etiquette'].tolist(),
    stratify=df['etiquette'],
    test_size=0.2,
    random_state=42,
)

# Training pipeline: tokenized dataset served in shuffled mini-batches of 8.
jeu_entraînement = ExperienceDataset(textes_train, etiquettes_train, tokenizer)
chargeur_entraînement = DataLoader(jeu_entraînement, shuffle=True, batch_size=8)

# Evaluation pipeline: same tokenization, mini-batches of 2, no shuffling requested.
jeu_test = ExperienceDataset(textes_test, etiquettes_test, tokenizer)
chargeur_test = DataLoader(jeu_test, batch_size=2)

In [17]:
# Load the pre-trained BERT classifier with one output per experience level.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
modele = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mapping_etiquettes))

# Move the model to its device BEFORE building the optimizer, so the optimizer
# is constructed over the parameters in their final location (the ordering
# recommended by the torch.optim documentation).
modele = modele.to(device)
optimiseur = AdamW(modele.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Model training: put the model in training mode before the training loop runs.
modele.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [19]:
# Fine-tune the classifier for 3 passes over the training loader.
for epoch in range(3):
    # (Re-)enter training mode here rather than relying on an earlier cell:
    # the evaluation cell calls modele.eval(), so re-running this loop after it
    # would otherwise train with the model still in eval mode.
    modele.train()
    for lot in chargeur_entraînement:
        # Move the batch to the model's device once, up front.
        input_ids = lot['input_ids'].to(device)
        attention_mask = lot['attention_mask'].to(device)
        labels = lot['labels'].to(device)

        optimiseur.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        sorties = modele(input_ids=input_ids,
                         attention_mask=attention_mask,
                         labels=labels)
        perte = sorties.loss
        perte.backward()
        optimiseur.step()
    print(f"Époque {epoch+1} terminée.")

Époque 1 terminée.
Époque 2 terminée.
Époque 3 terminée.


In [20]:
# Evaluate on the held-out split: eval mode, gradient tracking disabled.
modele.eval()
predictions, vrais = [], []
with torch.no_grad():
    for lot in chargeur_test:
        ids = lot['input_ids'].to(device)
        masque = lot['attention_mask'].to(device)
        sortie = modele(input_ids=ids, attention_mask=masque)
        # Predicted class = index of the largest logit for each sample.
        pred = sortie.logits.argmax(dim=1)
        predictions += pred.tolist()
        vrais += lot['labels'].tolist()

print("Précision sur l'ensemble de test :", accuracy_score(vrais, predictions))

Précision sur l'ensemble de test : 0.6412742382271468


In [23]:
def predire_experience(description):
    """Predict the required experience level for one job description.

    Args:
        description: Job-description text; tokenized with truncation at
            ``max_length=256``.

    Returns:
        The entry of ``mapping_etiquettes`` for the class with the highest logit.
    """
    # Do not depend on which cell ran last: force eval mode so the prediction
    # is made with the model out of training mode.
    modele.eval()
    encodage = tokenizer(description, return_tensors="pt", truncation=True, padding=True, max_length=256)
    encodage = {k: v.to(device) for k, v in encodage.items()}
    # Prediction needs no gradients; disabling them avoids building an autograd
    # graph on every call (this function also serves the Gradio interface).
    with torch.no_grad():
        sortie = modele(**encodage)
    prediction = torch.argmax(sortie.logits, dim=1).item()
    return mapping_etiquettes[prediction]


In [24]:
# Example job description used as a quick manual check of the predictor.
texte_test = """
We are looking for a passionate software engineer to work on web applications using Python and React. 
You will collaborate with a team of experienced developers.
"""

# Run the prediction on the example text.
niveau = predire_experience(texte_test)

print("➡️ Niveau d'expérience prédit :", niveau)

➡️ Niveau d'expérience prédit : Mid-Senior level


In [25]:
# Gradio front end: a multi-line text box in, the predicted label out as text.
zone_description = gr.Textbox(
    placeholder="Entrez la description du poste ici...",
    lines=8,
)
interface = gr.Interface(
    fn=predire_experience,
    inputs=zone_description,
    outputs="text",
    title="Prédicteur d'expérience requise (BERT)",
    description="Cette application prédit le niveau d'expérience requis à partir d'une description de poste.",
)

# Start the demo web app.
interface.launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


