<a href="https://colab.research.google.com/github/pamslover/pamellaPfe/blob/main/Fine_tuningModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Packages

In [None]:
!pip install langid

In [None]:
!pip install gensim pyLDAvis


In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers[torch]

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data used by this notebook (tokenizer models, stopword lists,
# WordNet). 'punkt_tab' is the tokenizer resource that newer NLTK releases look
# up for word_tokenize(); older releases still use 'punkt', so both are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize


In [None]:
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import torch

# LDA sur les CVs

In [None]:
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

# Load the cleaned resumes.
data_cv = pd.read_csv("/content/drive/MyDrive/projet pfe/CvClean.csv")

# Whitespace-tokenise every resume. Missing values are dropped first because
# pandas represents empty cells as float NaN, which has no .split() method.
texts = [text.split() for text in data_cv['Resume'].dropna().astype(str)]

# Build the token <-> id dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit LDA with 5 topics. random_state is fixed so that re-running the cell
# yields the same topics (the second LDA cell of this notebook also uses 42).
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=42)

# Print the 4 highest-weighted words of every topic.
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the pyLDAvis data for the fitted LDA model and render it inline.
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(vis)


# Word2Vec et BERT

## Fine-tuning de word2vec

In [None]:
# Load the cleaned resumes (df) and the job postings (jd).
# NOTE(review): nothing in this notebook mounts /content/drive - confirm the
# drive is mounted before this cell runs.
df, jd = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/projet pfe/CvClean.csv',
        "/content/drive/MyDrive/projet pfe/job.csv",
    )
)


In [None]:
from langid.langid import LanguageIdentifier, model

# Language detection for a piece of text.
class LanguageCheck:
    """Detect the language of a text with langid.

    The langid identifier is built once, on first use, and shared by every
    instance instead of being rebuilt on each call.
    """

    # Shared identifier, created lazily by _get_identifier().
    _identifier = None

    def __init__(self):
        self.text = None      # last text passed to override()
        self.language = None  # language code returned for that text

    @classmethod
    def _get_identifier(cls):
        """Return the shared langid identifier, creating it on first use."""
        if cls._identifier is None:
            # Imported locally under an alias: this notebook later rebinds the
            # global name `model` (Word2Vec / BERT models), which would
            # otherwise be picked up here instead of langid's model string.
            from langid.langid import LanguageIdentifier, model as langid_model
            cls._identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True)
        return cls._identifier

    def override(self, text=None):
        """Classify `text`, remember it and its language, and return the language.

        Args:
            text: the text to classify.

        Returns:
            The first element of the (language, score) pair returned by the
            identifier's classify().
        """
        self.text = text
        self.language, _ = self._get_identifier().classify(self.text)

        return self.language

In [None]:
# Clean every resume: lowercase, tokenise, keep alphabetic non-stopword tokens,
# and join the survivors back into one space-separated string.
# The stopword list is loaded once into a set; the previous code called
# stopwords.words('english') again for every single token.
english_stopwords = set(stopwords.words('english'))

cvs = []
# Iterate over the values directly so the loop does not depend on the
# DataFrame having a 0..n-1 index.
for resume in tqdm(df['Resume']):
    text = str(resume).strip().lower()
    tokens = word_tokenize(text)
    kept = [token for token in tokens if token.isalpha() and token not in english_stopwords]
    cvs.append(' '.join(kept))

In [None]:

# Clean every job posting: lowercase, detect the language, tokenise, then keep
# alphabetic tokens that are not stopwords of the detected language.
# Stopword sets are built once; 'fr' postings use the French list and every
# other detected language falls back to the English list.
stopwords_by_lang = {'fr': set(stopwords.words('french'))}
default_stopwords = set(stopwords.words('english'))

# One detector is enough: override() resets its state on every call.
check = LanguageCheck()

jobs = []
for posting in tqdm(jd['job']):
    text = str(posting).strip().lower()
    lang = check.override(text)
    active_stopwords = stopwords_by_lang.get(lang, default_stopwords)
    tokens = word_tokenize(text)
    kept = [token for token in tokens if token.isalpha() and token not in active_stopwords]
    jobs.append(' '.join(kept))

In [None]:
# Word2Vec training corpus: cleaned resumes followed by cleaned job postings.
# Note: this rebinds `corpus`, which the first LDA cell also assigns.
corpus = [*cvs, *jobs]

In [None]:
# Split every document of the corpus into lowercase tokens made of \w characters.
tokenizer = RegexpTokenizer(r'\w+')
sentences_tokenized = [tokenizer.tokenize(str(document).lower()) for document in corpus]

In [None]:
# Pre-trained GoogleNews vectors, read below in binary word2vec format.
path = "/content/drive/MyDrive/projet pfe/word2vec/GoogleNews-vectors-negative300.bin"

# Build the vocabulary from our own corpus (every word kept: min_count=1).
model = Word2Vec(min_count=1, vector_size=300)
model.build_vocab(sentences_tokenized)

# One lock factor per vocabulary word, all set to 1.0.
# The array is created as float32 explicitly (np.ones defaults to float64).
# NOTE(review): gensim's optimized training code appears to read this array as
# float32, in which case a float64 array would be misinterpreted - confirm
# against the installed gensim version.
model.wv.vectors_lockf = np.ones(len(model.wv), dtype=np.float32)
total_exemple = model.corpus_count

# Overwrite the vectors of words that also exist in the pre-trained file, then
# continue training on our corpus.
model.wv.intersect_word2vec_format(path, binary=True, lockf=1.0)
model.train(sentences_tokenized, total_examples=total_exemple, epochs=5)

In [None]:
# Persist the full gensim model object with model.save().
model.save("/content/drive/MyDrive/projet pfe/word2vec/word2vec_gensim_Cv_job.model")

# Also export only the word vectors with save_word2vec_format().
# NOTE(review): this export uses a ".model" extension like the file above even
# though it is written by a different method - consider a distinct extension.
model.wv.save_word2vec_format("/content/drive/MyDrive/projet pfe/word2vec/word2vec_cv_job.model")

In [None]:
# The file was written by Word2Vec.save(), so it is loaded back with the
# matching Word2Vec.load(); the word vectors are then available as `w2v.wv`.
w2v = Word2Vec.load("/content/drive/MyDrive/projet pfe/word2vec/word2vec_gensim_Cv_job.model")

In [None]:
# Nearest neighbours of "nlp" in the fine-tuned vector space.
w2v.wv.most_similar(positive=["nlp"])

## Fine-tuning de BERT sur les CVs

In [None]:
# Resumes used to fine-tune BERT with the masked-language-modelling objective.
corpus_cv = df['Resume'].tolist()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Tokenise a LIST of strings, one entry per resume. Passing str(corpus_cv)
# turned the whole list into one string, i.e. a single training example
# truncated to 512 tokens. Missing values are skipped.
inputs = tokenizer(
    [str(resume) for resume in corpus_cv if pd.notna(resume)],
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)

# Collator configured for masked language modelling with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)


# Dataset wrapper around the tokenizer output.
class TextDataset(torch.utils.data.Dataset):
    """Expose column-wise encodings as one dict per example."""

    def __init__(self, inputs):
        # `inputs` is looked up by 'input_ids' and iterated with .items() below.
        self.inputs = inputs

    def __len__(self):
        # Number of examples = number of rows under 'input_ids'.
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick row `idx` from every field of the encodings.
        example = {}
        for field, values in self.inputs.items():
            example[field] = values[idx]
        return example

# Wrap the encodings so the Trainer can fetch one example at a time.
dataset = TextDataset(inputs=inputs)

# Training configuration.
# NOTE(review): the job-posting fine-tuning cell uses the same './results'
# directory with overwrite_output_dir=True.
training_args = TrainingArguments(
    num_train_epochs=3,             # number of training epochs
    per_device_train_batch_size=8,  # training batch size per device
    save_steps=10_000,              # save a checkpoint every 10,000 steps
    save_total_limit=2,             # keep only the 2 most recent checkpoints
    output_dir='./results',         # output directory
    overwrite_output_dir=True,      # overwrite the content of the output directory
)

# Build the trainer from the model, the arguments, the dataset and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run the fine-tuning.
trainer.train()

# Save the fine-tuned model and its tokenizer.
model.save_pretrained("/content/drive/MyDrive/projet pfe/BERT/model_bert_cv")
tokenizer.save_pretrained("/content/drive/MyDrive/projet pfe/BERT/token_bert_cv")


## Fine-tuning de BERT sur les offres

In [None]:
# Job postings used to fine-tune BERT with the masked-language-modelling objective.
corpus_jb = jd['job'].tolist()

# Load the pre-trained tokenizer and model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Tokenise a LIST of strings, one entry per job posting. Passing str(corpus_jb)
# turned the whole list into one string, i.e. a single training example
# truncated to 512 tokens. Missing values are skipped.
inputs = tokenizer(
    [str(posting) for posting in corpus_jb if pd.notna(posting)],
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)

# Collator configured for masked language modelling with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

class TextDataset(torch.utils.data.Dataset):
    """Per-example view over a mapping of column-wise encodings.

    Same definition as the TextDataset of the resume fine-tuning cell.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        # One example per row stored under 'input_ids'.
        rows = self.inputs['input_ids']
        return len(rows)

    def __getitem__(self, idx):
        # Collect (field, row idx) pairs and turn them into a dict.
        pairs = [(name, column[idx]) for name, column in self.inputs.items()]
        return dict(pairs)

# Wrap the encodings so the Trainer can fetch one example at a time.
dataset = TextDataset(inputs=inputs)

# Training configuration.
# NOTE(review): the resume fine-tuning cell uses the same './results'
# directory with overwrite_output_dir=True.
training_args = TrainingArguments(
    num_train_epochs=3,             # number of training epochs
    per_device_train_batch_size=8,  # training batch size per device
    save_steps=10_000,              # save a checkpoint every 10,000 steps
    save_total_limit=2,             # keep only the 2 most recent checkpoints
    output_dir='./results',         # output directory
    overwrite_output_dir=True,      # overwrite the content of the output directory
)

# Build the trainer from the model, the arguments, the dataset and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run the fine-tuning.
trainer.train()


# Save the fine-tuned model and its tokenizer.
model.save_pretrained("/content/drive/MyDrive/projet pfe/BERT/model_bert_job")
tokenizer.save_pretrained("/content/drive/MyDrive/projet pfe/BERT/token_bert_job")


# LDA sur le corpus combiné (CVs et offres)

In [None]:
import pandas as pd
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK tokenizer data and stopword lists.
nltk.download('punkt')
nltk.download('stopwords')

# Reload the resumes (df) and the job postings (jd); adjust the paths and
# column names to your data.
df, jd = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/projet pfe/CvClean.csv',
        "/content/drive/MyDrive/projet pfe/job.csv",
    )
)

# Raw documents: every resume followed by every job posting.
documents = df['Resume'].tolist()
documents.extend(jd['job'].tolist())

# Cleaned documents from the preprocessing cells: job postings first, then resumes.
corpuss = [*jobs, *cvs]


In [None]:
# `corpuss` holds one space-joined string per document, whereas gensim's
# Dictionary and doc2bow expect a list of tokens per document (a bare string
# is rejected). Split strings into tokens; leave token lists as they are.
tokenized_docs = [doc.split() if isinstance(doc, str) else list(doc) for doc in corpuss]

# Build the dictionary.
dictionary = corpora.Dictionary(tokenized_docs)

# Drop tokens present in fewer than 5 documents or in more than 50% of them.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Build the bag-of-words corpus.
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]


In [None]:
from gensim.models import LdaModel

# Number of topics to extract.
num_topics = 10

# Fit the LDA model on the bag-of-words corpus with a fixed random_state.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=15,
    random_state=42,
)


In [None]:
# Topic distributions for the resumes.
# `cvs` and `jobs` are the cleaned, space-joined documents the dictionary was
# built from. The names used here before (preprocess, df_cv, df_job and a
# 'JobDescription' column) are not defined anywhere in this notebook.
# minimum_probability=0.0 keeps low-probability topics instead of dropping
# them, so each row of the resulting matrix is a (near) complete distribution.
corpus_cv = [dictionary.doc2bow(doc.split()) for doc in cvs]
lda_cv = [lda_model.get_document_topics(bow, minimum_probability=0.0) for bow in corpus_cv]

# Topic distributions for the job postings.
corpus_job = [dictionary.doc2bow(doc.split()) for doc in jobs]
lda_job = [lda_model.get_document_topics(bow, minimum_probability=0.0) for bow in corpus_job]

# Convertir les distributions de thèmes en matrices numpy
import numpy as np

def topics_to_matrix(lda_topics, num_topics):
    """Convert per-document (topic, probability) pairs into a dense matrix.

    Args:
        lda_topics: sequence with one iterable of (topic, prob) pairs per document.
        num_topics: number of columns of the returned matrix.

    Returns:
        A zero-initialised array of shape (len(lda_topics), num_topics) in
        which entry [i, topic] holds the probability listed for document i.
    """
    dense = np.zeros((len(lda_topics), num_topics))
    # Each `row` is a view into `dense`, so writing to it fills the matrix.
    for row, doc_topics in zip(dense, lda_topics):
        for topic_id, weight in doc_topics:
            row[topic_id] = weight
    return dense

# Dense (documents x topics) matrices for the resumes and the job postings.
lda_topic_distributions_cv, lda_topic_distributions_job = (
    topics_to_matrix(doc_topics, num_topics) for doc_topics in (lda_cv, lda_job)
)

# Show the topic distribution of the first resume and of the first job posting.
first_cv_row = lda_topic_distributions_cv[0]
print(first_cv_row)
first_job_row = lda_topic_distributions_job[0]
print(first_job_row)


In [None]:
# PLACEHOLDER embeddings: random vectors standing in for real BERT or Word2Vec
# embeddings - replace them with your own.
# Row counts are taken from the topic matrices so that np.hstack always gets
# matching shapes (the df_cv / df_job names used before are not defined in this
# notebook). The generator is seeded so the placeholder values are repeatable.
rng = np.random.default_rng(42)
bert_embeddings_cv = rng.random((lda_topic_distributions_cv.shape[0], 768))
bert_embeddings_job = rng.random((lda_topic_distributions_job.shape[0], 768))

# Concatenate the embeddings with the topic proportions, column-wise.
combined_embeddings_cv = np.hstack((bert_embeddings_cv, lda_topic_distributions_cv))
combined_embeddings_job = np.hstack((bert_embeddings_job, lda_topic_distributions_job))


In [None]:
from sklearn.cluster import KMeans

# Cluster the LDA topic proportions with KMeans.
num_clusters = 10  # number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit once on resumes and job postings together, then assign each set to the
# shared centroids. Calling fit_predict() twice refitted the estimator on the
# job postings alone, so a cluster id meant different things for the two sets.
kmeans.fit(np.vstack((lda_topic_distributions_cv, lda_topic_distributions_job)))
clusters_cv = kmeans.predict(lda_topic_distributions_cv)
clusters_job = kmeans.predict(lda_topic_distributions_job)
