In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
df=pd.read_csv("JOB_IT.csv")

In [3]:
df.shape

(371522, 23)

In [4]:
def format_job_text(row):
    text = f"""{row['Job Title']}.
    Description du poste : {row['Job Description']}.
    Les responsabilités sont : {row['Responsibilities']}.
    Compétences requises : {row['skills']}"""
    
    return text.strip()

# Appliquer la transformation sur chaque ligne
df['full_job_text'] = df.apply(format_job_text, axis=1)

#### WORD2VEC

In [5]:
import gensim
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from gensim.models import FastText
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# Download necessary resources (only the first time)
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Prétraitement
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Suppression des caractères non alphabétiques
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if len(word) > 2 and len(word) < 20]
    return tokens

In [7]:
def train_fasttext_model(all_job_des, all_cv_texts, vector_size=100, window=5, min_count=1, workers=4):
   
    # Preprocess all job descriptions and CVs
    tokenized_jobs = [preprocess_text(text) for text in all_job_des]
    tokenized_cvs = [preprocess_text(text) for text in all_cv_texts]

    # Combine all tokenized texts into one corpus
    corpus = tokenized_jobs + tokenized_cvs

    # Train the FastText model
    model = FastText(sentences=corpus, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    
    return model

In [8]:
def get_average_word_vector(tokens, model):
    word_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)  # Return a zero vector if no valid words
    return np.mean(word_vectors, axis=0)

In [9]:

def jd_cv_match_fastText(cv_details, jd_text, model):

    # Tokenize the JD text
    tokenized_jd = word_tokenize(jd_text.lower())

    # Compute the average vector for the JD
    jd_vector = get_average_word_vector(tokenized_jd, model)

    # Compute similarity scores for each CV
    results = []
    for cv in cv_details:
        tokenized_cv = word_tokenize(cv['text'].lower())
        cv_vector = get_average_word_vector(tokenized_cv, model)
        similarity_score = cosine_similarity([jd_vector], [cv_vector])[0][0]

        results.append({
            'name': cv['name'],
            'path': cv['path'],
            'similarity_score': similarity_score
        })

    # Sort results by similarity score (descending)
    results.sort(key=lambda x: x['similarity_score'], reverse=True)

    # Add rank to results
    ranked_results = [{'rank': i + 1, 'name': result['name'], 'path': result['path'], 'similarity_score': result['similarity_score']} for i, result in enumerate(results)]

    return ranked_results


In [10]:
all_job_des=df['full_job_text']
cvs_df = pd.read_csv('UpdatedResumeDataSet.csv')
all_cv_texts = cvs_df.Resume


In [11]:
# Train Word2Vec model
fastText_model = train_fasttext_model(all_job_des, all_cv_texts)

In [12]:
cv_details_1 = [
    {
        'name': 'Amine B.',
        'path': '/path/to/amine_b_cv.pdf',
        'text': "Ingénieur en développement web avec expertise en React et Node.js. "
                "Expérience dans la création d'applications web modernes et scalables. "
                "Maîtrise des API REST, Redux et déploiement sur AWS."
    },
    {
        'name': 'Karim M.',
        'path': '/path/to/sarah_m_cv.pdf',
        'text': "Spécialiste en science des données avec expertise en machine learning. "
                "Expérience dans le développement de modèles prédictifs avec TensorFlow et PyTorch. "
                "Compétences en manipulation de bases de données SQL et NoSQL."
    },
    {
        'name': 'Omar K.',
        'path': '/path/to/omar_k_cv.pdf',
        'text': "Développeur full-stack avec expérience en React, Node.js et MongoDB. "
                "Travail sur des plateformes SaaS et e-commerce, intégration d'API tierces. "
                "Passionné par le code propre et les méthodologies Agile."
    }
]

# Exemple d’offre d'emploi (JD)
jd_text_1 = (
    "Nous recherchons un développeur web expérimenté en React et Node.js. "
    "Le candidat idéal doit maîtriser Redux, les API REST et avoir une expérience en déploiement cloud (AWS, GCP). "
    "Une connaissance des bases de données SQL et NoSQL est un plus."
)


In [13]:
#jd_text="We are looking for an experienced Agriculture nvironments"
ranked_cvs = jd_cv_match_fastText(cv_details_1, jd_text_1,fastText_model)

# Print the ranked results
for result in ranked_cvs:
    print(f"Name: {result['name']}, Similarity Score: {result['similarity_score']}")

Name: Amine B., Similarity Score: 0.697260320186615
Name: Omar K., Similarity Score: 0.5535682439804077
Name: Karim M., Similarity Score: 0.5198121070861816


#### Evalute 

In [14]:
!pip install rouge-score





[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_average_word_vector(tokens, model):
    word_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)  # Return a zero vector if no valid words
    return np.mean(word_vectors, axis=0)

def jd_cv_match_fastText(cvs_df, jd_texts, model):
    """
    Given 100 JDs (in `jd_texts`), find top 10 CVs for each JD using Word2Vec.
    """
    all_results = []

    for jd_text in jd_texts:
        # Tokenize the JD text
        tokenized_jd = word_tokenize(jd_text.lower())

        # Compute the average vector for the JD
        jd_vector = get_average_word_vector(tokenized_jd, model)

        # Compute similarity scores for each CV in the DataFrame
        results = []
        for index, row in cvs_df.iterrows():
            # Tokenize the resume
            tokenized_cv = word_tokenize(row['Resume'].lower())
            cv_vector = get_average_word_vector(tokenized_cv, model)

            # Compute cosine similarity
            similarity_score = cosine_similarity([jd_vector], [cv_vector])[0][0]

            results.append({
                'similarity_score': similarity_score,
                'resume_text': row['Resume']  # Store the resume text instead of name/path
            })

        # Sort results by similarity score (descending)
        results.sort(key=lambda x: x['similarity_score'], reverse=True)

        # Keep top 10 CVs
        top_10_results = [{'rank': i + 1, 'resume_text': result['resume_text'], 'similarity_score': result['similarity_score']} for i, result in enumerate(results[:10])]

        # Append to all_results
        all_results.append(top_10_results)

    return all_results


In [16]:
from rouge_score import rouge_scorer
import numpy as np

def compute_average_rouge_recall(jd_texts, ranked_results):
    """
    Compute the average recall values of ROUGE-1 for the top 10 matched resumes across 100 JDs.

    jd_texts: List of 100 Job Descriptions (JDs).
    ranked_results: List of lists, where each sublist contains top 10 CVs for a JD.

    Returns:
        Average ROUGE-1 recall score across 100 JDs.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    rouge_recall_values = []

    for jd_text, top_10_cvs in zip(jd_texts, ranked_results):
        jd_reference = jd_text  # JD is the reference
        recall_scores = []

        for cv in top_10_cvs:
            cv_text = cv['resume_text']  # Extract CV text
            rouge_scores = scorer.score(jd_reference, cv_text)
            recall_scores.append(rouge_scores['rouge1'].recall)  # Store recall score

        # Compute the average recall for this JD's top 10 resumes
        avg_recall_for_jd = np.mean(recall_scores)
        rouge_recall_values.append(avg_recall_for_jd)

    # Compute the overall average recall across all 100 JDs
    overall_avg_rouge_recall = np.mean(rouge_recall_values)
    
    return overall_avg_rouge_recall


In [17]:
import pandas as pd

def select_random_jds(df, num_samples=100, seed=42):
    
    sampled_jds = df.sample(n=num_samples, random_state=seed)  # Randomly select 100 JDs
    return sampled_jds['full_job_text'].tolist()  # Return as a list of text

# Example Usage
random_jds = select_random_jds(df)  # Select 100 random JDs


In [18]:
fastText_model

<gensim.models.fasttext.FastText at 0x1cf8c427020>

In [19]:
ranked_results=jd_cv_match_fastText(cvs_df, random_jds, fastText_model)

In [20]:
overall_avg_rouge_recall=compute_average_rouge_recall(random_jds, ranked_results)

In [21]:
overall_avg_rouge_recall

0.46877189748656956