In [1]:
from utils.utils import load_domain_map, prepare_to_annotate,read_file_content

In [2]:
# Charger la map des domaines similaires
domain_map = load_domain_map("utils/domain_map.json")

In [6]:
# # Appel de la fonction avec chemins relatifs corrects
# prepare_to_annotate(
#     offers_path="../scraping/offres",
#     cvs_path="../scraping/cvs",
#     domain_map=domain_map,
#     output_path="ready_to_annotate.csv",cvs_per_offer=5
# )

📂 Analyse des offres dans : ../scraping/offres

🔎 CV selection summary for domain 'AI_and_ML': 2 same, 1 similar, 2 different, 0 extra → Total: 5
📌 5 CVs added for offer 1 located at path: ../scraping/offres\AI_and_ML\AI Engineer_job_1.txt
🔎 CV selection summary for domain 'AI_and_ML': 2 same, 1 similar, 2 different, 0 extra → Total: 5
📌 5 CVs added for offer 2 located at path: ../scraping/offres\AI_and_ML\AI Researcher_job_1.txt
🔎 CV selection summary for domain 'AI_and_ML': 2 same, 1 similar, 2 different, 0 extra → Total: 5
📌 5 CVs added for offer 3 located at path: ../scraping/offres\AI_and_ML\Artificial Intelligence_job_1.txt
🔎 CV selection summary for domain 'AI_and_ML': 2 same, 1 similar, 2 different, 0 extra → Total: 5
📌 5 CVs added for offer 4 located at path: ../scraping/offres\AI_and_ML\Computer Vision_job_1.txt
🔎 CV selection summary for domain 'AI_and_ML': 2 same, 1 similar, 2 different, 0 extra → Total: 5
📌 5 CVs added for offer 5 located at path: ../scraping/offres\AI_and

In [3]:

import os
import csv
import time
import random
from utils.utils import read_file_content
from utils.deepseek import get_deepseek_score
from utils.groq import get_groq_score_with_retry
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()

True

In [4]:
def annotate_data(input_path, output_path, max_annotations=None, api="deepseek"):
    # Read already annotated rows
    existing_rows = set()
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as f_out:
            reader = csv.reader(f_out)
            next(reader, None)  # Skip header
            for row in reader:
                if row:
                    existing_rows.add((row[0], row[1]))  # (offer_path, cv_path)

    with open(input_path, 'r', encoding='utf-8') as f_in:
        reader = csv.reader(f_in)
        header = next(reader)  # Skip input header

        # Open output file in append mode
        with open(output_path, 'a', newline='', encoding='utf-8') as f_out:
            writer = csv.writer(f_out)

            # Write header only if file was empty
            if os.stat(output_path).st_size == 0:
                writer.writerow(['offer_path', 'cv_path', 'score'])

            counter = len(existing_rows)  # Start counter from already annotated rows

            for row in reader:
                if max_annotations is not None and counter >= max_annotations:
                    print(f"🛑 Maximum of {max_annotations} annotations reached. Stopping.")
                    break

                offer_path, cv_path = row

                if (offer_path, cv_path) in existing_rows:
                    print(f"⏩ Skipping already annotated: {offer_path}, {cv_path}")
                    continue

                print("\n🔎 Offer:", offer_path)
                print("📝 CV:", cv_path)

                offer_text = read_file_content(offer_path)
                cv_text = read_file_content(cv_path)

                if not offer_text or not cv_text:
                    print(f"⚠️ Impossible de lire les fichiers : {offer_path}, {cv_path}")
                    continue

                try:
                    score = None
                    if api == "deepseek":
                        # Call the DeepSeek API to get the score
                        score = get_deepseek_score(offer_text, cv_text)
                    elif api == "groq":
                        # Call the Groq API to get the score
                        score = get_groq_score_with_retry(offer_text, cv_text)
                    print(f"✅ Score obtenu : {score}")
                    writer.writerow([offer_path, cv_path, score])
                    counter += 1

                    # Save after every 10 successful annotations
                    if counter % 5 == 0:
                        f_out.flush()
                        os.fsync(f_out.fileno())
                        print("💾 Progress saved.")

                    # Sleep randomly between 5 and 10 seconds to avoid API limits
                    time.sleep(random.uniform(10, 15))

                except Exception as e:
                    print(f"❌ Erreur lors de l’annotation : {e}")

In [None]:
annotate_data(input_path="ready_to_annotate.csv",output_path="annotated_data.csv",max_annotations=500)

In [6]:
import re
def safe_extract_score(score):
    try:
        # Extract the last float from the string
        matches = re.findall(r"[-+]?\d*\.\d+|\d+", str(score))
        if matches:
            return float(matches[-1])  # Take the last number found
        else:
            return None
    except Exception as e:
        print(f"Error parsing score: {score} -> {e}")
        return None

In [7]:
def reannotate_missing_scores(csv_path,api="deepseek"):
    temp_path = csv_path + '.tmp'

    with open(csv_path, 'r', encoding='utf-8') as f_in, open(temp_path, 'w', newline='', encoding='utf-8') as f_out:
        reader = csv.DictReader(f_in)
        fieldnames = reader.fieldnames
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            offer_path = row['offer_path']
            cv_path = row['cv_path']
            score = row['score'].strip()

            # Handle missing score
            if not score or score.lower() == 'nan':
                print(f"🔄 Réannotation (score manquant) : {offer_path} + {cv_path}")

                offer_text = read_file_content(offer_path)
                cv_text = read_file_content(cv_path)

                if not offer_text or not cv_text:
                    print(f"⚠️ Impossible de lire les fichiers : {offer_path}, {cv_path}")
                    writer.writerow(row)  # Réécrit l'ancienne ligne inchangée
                    continue

                try:
                    new_score = None
                    if(api == "deepseek"):
                        # Call the DeepSeek API to get the score
                        new_score = get_deepseek_score(offer_text, cv_text)
                    elif(api == "groq"):
                        # Call the DeepSeek_2 API to get the score
                        new_score = get_groq_score(offer_text, cv_text)

                    print(f"✅ Nouveau score (via deepseek) : {new_score}")
                    row['score'] = new_score
                    time.sleep(random.uniform(2, 5))
                except Exception as e:
                    print(f"❌ Erreur lors de la réannotation : {e}")

            else:
                # Try to safely parse the score
                try:
                    float(score)  # Check if score is a valid float
                except ValueError:
                    print(f"⚙️ Nettoyage score invalide : {score} ({offer_path} + {cv_path})")
                    try:
                        cleaned_score = safe_extract_score(score)
                        print(f"✅ Score nettoyé : {cleaned_score}")
                        row['score'] = cleaned_score
                    except Exception as e:
                        print(f"❌ Erreur lors du nettoyage du score : {e}")
                        # Optional: you can decide to keep old score or mark as 0.0

            writer.writerow(row)

    # Remplacer l'ancien fichier par le nouveau
    os.replace(temp_path, csv_path)
    print("✅ Réannotation terminée et fichier mis à jour !")

In [8]:
reannotate_missing_scores(csv_path="annotated_data.csv")

✅ Réannotation terminée et fichier mis à jour !


openRouter free trial end  for deepseek !!!!

## test OLLAMA

In [13]:
from utils.ollama import get_mistral_score

In [None]:
offer = "Ingénieur IA spécialisé en vision par ordinateur et deep learning. Expérience en Python, PyTorch et traitement d’images requise."
cv = "Mohamed OUABBI, étudiant en ingénierie digitale, travaille sur l’OCR, YOLO, PyTorch et le deep learning."

score = get_mistral_score(offer, cv)

In [6]:
print("Score de similarité :", score)

Score de similarité : 0.95


ollama mistral give an amazing score but it's very slow ! 

## try Firworks :

In [9]:
from utils.groq import get_groq_score

In [10]:
offer = "Ingénieur IA spécialisé en vision par ordinateur et deep learning. Expérience en Python, PyTorch et traitement d’images requise."
cv = "Mohamed OUABBI, étudiant en ingénierie digitale, travaille sur l’OCR, YOLO, PyTorch et le deep learning."

score = get_groq_score(offer, cv)

In [11]:
score

'0.7'

Groq is much better. While Ollama takes around 40 seconds for one matching, Groq completes it in just 0.5 seconds.

#### annotate with Groq

In [5]:
annotate_data(input_path="ready_to_annotate.csv",output_path="annotated_data2.csv",max_annotations=1000,api="groq")

⏩ Skipping already annotated: ../scraping/offres\DevOps\Kubernetes_job_1.txt, ../scraping/cvs\Data_Engineering\Data_Engineering_cv_25.txt
⏩ Skipping already annotated: ../scraping/offres\DevOps\Ansible_job_1.txt, ../scraping/cvs\DevOps\DevOps_cv_134.txt
⏩ Skipping already annotated: ../scraping/offres\DevOps\DevOps_job_1.txt, ../scraping/cvs\DevOps\DevOps_cv_42.txt
⏩ Skipping already annotated: ../scraping/offres\Web_Development\RESTful API_job_1.txt, ../scraping/cvs\Web_Development\Web_Development_cv_25.txt
⏩ Skipping already annotated: ../scraping/offres\Web_Development\GraphQL_job_1.txt, ../scraping/cvs\Web_Development\Web_Development_cv_22.txt
⏩ Skipping already annotated: ../scraping/offres\Cybersecurity\Ransomware_job_1.txt, ../scraping/cvs\Web_Development\Web_Development_cv_14.txt
⏩ Skipping already annotated: ../scraping/offres\Networking\Network Administration_job_1.txt, ../scraping/cvs\Cloud_Computing\Cloud_Computing_cv_1.txt
⏩ Skipping already annotated: ../scraping/offres\A

KeyboardInterrupt: 

In [None]:
# 📚 1. Importations
import pandas as pd
from pathlib import Path

# 📂 2. Charger le CSV initial
df = pd.read_csv('annotated_data2.csv')  # Remplace par ton fichier existant

# 📄 3. Fonction pour lire un fichier texte
def read_text_file(path):
    try:
        with open(Path(path), 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Erreur lecture fichier {path}: {e}")
        return ""

# 🛠️ 4. Appliquer pour lire l'offre et le CV
df['offer_text'] = df['offer_path'].apply(read_text_file)
df['cv_text'] = df['cv_path'].apply(read_text_file)

# 🧹 5. Garder uniquement offer_text, cv_text, score
df_final = df[['offer_text', 'cv_text', 'score']]

# 💾 6. Sauvegarder dans data/data.csv
output_path = Path('../data/labeled_data.csv')
output_path.parent.mkdir(exist_ok=True)  # Créer dossier 'data' s'il n'existe pas
df_final.to_csv(output_path, index=False)

print("✅ Préparation terminée ! Fichier sauvegardé dans data/labled_data.csv")


Erreur lecture fichier ../scraping/cvs\machine_learning\cv_ml_3.txt: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Erreur lecture fichier ../scraping/cvs\machine_learning\cv_ml_10.txt: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
✅ Préparation terminée ! Fichier sauvegardé dans data/labled_data.csv


: 