In [None]:
!pip install gradio rake-nltk PyPDF2 pymupdf4llm nltk groq

In [None]:
import gradio as gr
import pandas as pd
import pymupdf4llm
import numpy as np
import re
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

nltk.download('stopwords')
nltk.download('punkt')

nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Read the job postings; a row without a description or a title cannot be
# matched against a CV, so such rows are dropped and the index is renumbered.
jobs_data = (
    pd.read_csv("/content/Jobs_data_new_python.csv")
    .dropna(subset=["Job_txt", "job-title"])
    .reset_index(drop=True)
)

# Text preprocessing function
def preprocess_text(text):
    """Normalize free text for TF-IDF: lowercase words made of letters only.

    Every run of non-letter characters (digits, punctuation, symbols,
    whitespace) acts as a single separator. Separating instead of deleting
    keeps "front-end/back-end" as "front end back end" rather than fusing it
    into one meaningless token. Letters are matched Unicode-aware, so accented
    words such as "Développeur" are kept whole.

    Args:
        text (str): Raw text to normalize.

    Returns:
        str: Space-separated lowercase words; "" when ``text`` has no letters.
    """
    # [^\W\d_] == "word character that is neither a digit nor an underscore",
    # i.e. a letter in any script.
    words = re.findall(r"[^\W\d_]+", text)
    return " ".join(words).lower()

# Cache the normalized descriptions on the frame, then expose descriptions and
# titles as plain Python lists for the TF-IDF helpers below.
jobs_data["Job_txt_cleaned"] = jobs_data["Job_txt"].map(preprocess_text)
job_titles = jobs_data["job-title"].tolist()
job_descriptions_cleaned = jobs_data["Job_txt_cleaned"].tolist()

# One shared RAKE extractor (default settings), used by extract_keywords()
rake = Rake()

def extract_keywords(text):
    """Feed ``text`` to the module-level ``rake`` extractor and return its ranked phrases."""
    rake.extract_keywords_from_text(text)
    ranked_phrases = rake.get_ranked_phrases()
    return ranked_phrases

def _tfidf_query_similarity(query_text, corpus):
    """Cosine similarity between one query string and every corpus document.

    A fresh TF-IDF model is fitted on the corpus plus the query so that both
    are expressed in the same vocabulary.

    Args:
        query_text (str): Text to compare against the corpus.
        corpus: Iterable of document strings.

    Returns:
        numpy.ndarray: 1-D float array, one score per corpus document, in
        corpus order. All zeros when the query is blank; empty when the
        corpus is empty.
    """
    documents = list(corpus)  # accept any iterable, not only a list
    if not documents:
        return np.zeros(0)
    if not query_text or not query_text.strip():
        # A blank query shares no term with any document.
        return np.zeros(len(documents))

    tfidf_matrix = TfidfVectorizer().fit_transform(documents + [query_text])
    # The last row is the query; compare it against all rows before it.
    return cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]


def calculate_base_similarity(cv_keywords, job_descriptions):
    """Score each job description against the CV's keyword phrases.

    Args:
        cv_keywords: Iterable of keyword phrases; joined with spaces into one query.
        job_descriptions: Iterable of job description strings.

    Returns:
        numpy.ndarray: One TF-IDF cosine score per job description.
    """
    return _tfidf_query_similarity(" ".join(cv_keywords), job_descriptions)


def calculate_interest_similarity(interest, job_titles):
    """Score each job title against the free-text interests string.

    Returns:
        numpy.ndarray: One TF-IDF cosine score per job title.
    """
    return _tfidf_query_similarity(interest, job_titles)


def calculate_soft_skill_similarity(soft_skills, job_descriptions):
    """Score each job description against the free-text soft-skills string.

    Returns:
        numpy.ndarray: One TF-IDF cosine score per job description.
    """
    return _tfidf_query_similarity(soft_skills, job_descriptions)

def combine_scores(base_scores, interest_scores, soft_skills_scores,
                   interest_weight=0.4, soft_skills_weight=0.2):
    """Blend the three score sets into one weighted sum.

    The base (CV) scores receive whatever weight is left after the interest
    and soft-skill weights are subtracted from 1.
    """
    base_weight = 1 - interest_weight - soft_skills_weight
    weighted_interest = interest_weight * interest_scores
    weighted_soft_skills = soft_skills_weight * soft_skills_scores
    weighted_base = base_weight * base_scores
    return weighted_interest + weighted_soft_skills + weighted_base

def process_inputs(cv_file, interests, soft_skills):
    """Rank the loaded job postings against a CV and return the ten best matches.

    The CV is converted to markdown, reduced to RAKE keyword phrases and
    compared with the cleaned job descriptions; interests are compared with
    job titles and soft skills with job descriptions. The three score sets
    are blended by ``combine_scores``.

    Args:
        cv_file: CV PDF, in any form ``pymupdf4llm.to_markdown`` accepts.
        interests (str | None): Free-text professional interests.
        soft_skills (str | None): Free-text soft skills; when blank or None
            this component contributes zero to every job.

    Returns:
        pandas.DataFrame: Up to ten rows with columns ``job-title``,
        ``company``, ``location`` and ``Match Score``, best match first.

    Raises:
        ValueError: If ``cv_file`` is None (nothing was uploaded).
    """
    if cv_file is None:
        raise ValueError("No CV provided: upload a PDF before requesting recommendations.")
    # UI text fields may arrive as None; treat that the same as an empty string.
    interests = interests or ""
    soft_skills = soft_skills or ""

    # Extract text from CV (one chunk per page, joined into a single string)
    page_chunks = pymupdf4llm.to_markdown(cv_file, page_chunks=True)
    cv_text = " ".join(chunk["text"] for chunk in page_chunks)

    # Extract keywords
    cv_keywords = extract_keywords(cv_text)

    # Calculate similarities
    base_scores = calculate_base_similarity(cv_keywords, job_descriptions_cleaned)
    interest_scores = calculate_interest_similarity(interests, job_titles)
    if soft_skills.strip():
        soft_skills_scores = calculate_soft_skill_similarity(soft_skills, job_descriptions_cleaned)
    else:
        soft_skills_scores = np.zeros_like(base_scores)

    # Combine scores
    combined_scores = combine_scores(base_scores, interest_scores, soft_skills_scores)

    # Attach the scores to a copy so the shared jobs_data frame is not mutated.
    results = jobs_data.copy()
    results["Match Score"] = combined_scores
    results = results.sort_values("Match Score", ascending=False).head(10)

    return results[["job-title", "company", "location", "Match Score"]]


In [None]:
cv_file="/content/Ismail-Oubah-EnglishCV.pdf"

In [None]:
interests="Machine Learning, Data Analysis, Cloud Computing"

In [None]:
soft_skills="Team Leadership, Public Speaking, Time Management"

In [None]:
result=process_inputs(cv_file, interests, soft_skills)

Processing /content/Ismail-Oubah-EnglishCV.pdf...
Base scores: [0.01420772 0.04080659 0.01419871 0.04896231 0.01819197 0.01503125
 0.01442724 0.01227876 0.05480118 0.00577657 0.03463781 0.01183512
 0.00809821 0.02057038 0.02845185 0.04673173 0.00299636 0.01385939
 0.02438939 0.01420772 0.04080659 0.01419871 0.04896231 0.01819197
 0.01503125 0.01442724 0.01227876 0.05480118 0.00577657 0.03463781
 0.01183512 0.00809821 0.02057038 0.02845185 0.04673173 0.00299636
 0.01385939 0.02438939 0.02286805 0.01694891 0.02269844 0.02205852
 0.02152584 0.03221842 0.0681525  0.01163456 0.06814935 0.06735825
 0.06736248 0.01898978 0.02685288 0.02817086 0.01239296 0.04000192
 0.05173441 0.01309528 0.01308111 0.00889626 0.08768648 0.01891137
 0.00631102 0.03593707 0.03613241 0.0239789  0.03061403 0.05211745
 0.00838116 0.02368204 0.01814852 0.01446327 0.09361828 0.01610436
 0.0059946  0.0215342  0.01960281 0.0681525  0.0681525  0.0681525
 0.0681525  0.07032398 0.02311389 0.01138298 0.02378224 0.16849664


In [None]:
# Create Gradio interface
# Display names for the results table, in the order process_inputs() returns its columns.
RESULT_HEADERS = ["Job Title", "Company", "Location", "Match Score"]


def _recommend_jobs(cv_file, interests, soft_skills):
    """Gradio callback: validate the upload, rank the jobs, relabel columns for display."""
    if cv_file is None:
        # gr.Error shows this message to the user instead of a generic failure.
        raise gr.Error("Please upload your CV as a PDF before searching.")
    matches = process_inputs(cv_file, interests or "", soft_skills or "")
    # process_inputs() uses the dataset's raw column names; map them to the
    # headers declared on the results table so the two agree.
    return matches.rename(
        columns={"job-title": "Job Title", "company": "Company", "location": "Location"}
    )


with gr.Blocks(theme=gr.themes.Soft()) as interface:
    # Office-worker emoji (person + zero-width joiner + briefcase), written with
    # escapes so the invisible joiner cannot be lost when the cell is copied.
    gr.Markdown("# \U0001F9D1\u200D\U0001F4BC Job Recommendation System")

    with gr.Row():
        with gr.Column():
            cv_upload = gr.File(label="Upload your CV (PDF)", file_types=[".pdf"])
            interests_input = gr.Textbox(
                label="Your Professional Interests",
                placeholder="e.g., Machine Learning, Data Analysis, Cloud Computing"
            )
            soft_skills_input = gr.Textbox(
                label="Your Soft Skills (Optional)",
                placeholder="e.g., Team Leadership, Public Speaking, Time Management"
            )
            submit_btn = gr.Button("Find Matching Jobs", variant="primary")

        with gr.Column():
            results_table = gr.DataFrame(
                label="Recommended Jobs",
                headers=RESULT_HEADERS,
                datatype=["str", "str", "str", "number"]
            )

    submit_btn.click(
        fn=_recommend_jobs,
        inputs=[cv_upload, interests_input, soft_skills_input],
        outputs=results_table
    )

# debug=True keeps the cell running so errors and logs stay visible in the notebook.
interface.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a6ba274a50a9c9ad43.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Processing /tmp/gradio/357f7f813d5bc7d71a03e37db89c8318ece06936054ebba7191fe84df76752d4/Ismail-Oubah-EnglishCV.pdf...
Base scores: [0.01420772 0.04080659 0.01419871 0.04896231 0.01819197 0.01503125
 0.01442724 0.01227876 0.05480118 0.00577657 0.03463781 0.01183512
 0.00809821 0.02057038 0.02845185 0.04673173 0.00299636 0.01385939
 0.02438939 0.01420772 0.04080659 0.01419871 0.04896231 0.01819197
 0.01503125 0.01442724 0.01227876 0.05480118 0.00577657 0.03463781
 0.01183512 0.00809821 0.02057038 0.02845185 0.04673173 0.00299636
 0.01385939 0.02438939 0.02286805 0.01694891 0.02269844 0.02205852
 0.02152584 0.03221842 0.0681525  0.01163456 0.06814935 0.06735825
 0.06736248 0.01898978 0.02685288 0.02817086 0.01239296 0.04000192
 0.05173441 0.01309528 0.01308111 0.00889626 0.08768648 0.01891137
 0.00631102 0.03593707 0.03613241 0.0239789  0.03061403 0.05211745
 0.00838116 0.02368204 0.01814852 0.01446327 0.09361828 0.01610436
 0.0059946  0.0215342  0.01960281 0.0681525  0.0681525  0.0681525

## Test with LLM

In [None]:
# Prompt used to turn raw CV text into a structured summary. `{content}` is the
# only placeholder; Summarize_job_text() fills it via prompt.format(content=...).
prompt_cv="""

You are a highly intelligent assistant tasked with analyzing CVs and creating concise, structured summaries optimized for comparing with job descriptions.

Your goal is to extract and organize key details into specific categories. Structure your response as follows:

1. **Professional Summary**: Provide a three-sentences max overview of the candidate’s career focus, expertise, and achievements.
2. **Key Skills and Expertise**: Enumerate the candidate's main skills, including technical skills (e.g., programming languages, tools,frameworks, services) and non-technical skills (e.g., leadership, communication).
3. **Work Experience**:
   - For each role, include:
     - Job Title.
     - Organization Name.
     - Duration of Employment.
     - Key responsibilities and accomplishments.
4. **Technologies and Tools**: List every software, programming languages, frameworks,API service, and tools the candidate has experience with.
5. **Education**: Summarize degrees, fields of study, and institutions attended.
6. **Certifications and Training**: Highlight any certifications, courses, or training programs completed.
7. **Languages**: Mention languages the candidate knows and their proficiency levels.
8. **Projects**: Include significant projects or achievements relevant to the candidate's profile.
9. **Keywords**: Extract important keywords that characterize the candidate’s expertise.

Focus on clarity and precision in each section to ensure a well-structured summary. Here's the CV content:

{content}

Respond in the requested structured format.

"""


In [None]:
import os
from getpass import getpass

from groq import Groq

# Credentials must not be stored in the notebook: read the key from the
# environment and fall back to an interactive prompt. The key that used to be
# hard-coded here has been exposed and should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

def Summarize_job_text(content, prompt, model="llama3-8b-8192", temperature=0.2, max_tokens=1024):
    """Fill ``prompt`` with ``content`` and return the Groq chat model's reply.

    Args:
        content (str): Text substituted for the ``{content}`` placeholder.
        prompt: Template exposing ``.format(content=...)`` that returns the
            final prompt text (a plain ``str`` works).
        model (str): Groq model identifier.
        temperature (float): Sampling temperature.
        max_tokens (int): Upper bound on the length of the reply.

    Returns:
        str: Message content of the first completion choice.
    """
    client = Groq(api_key=GROQ_API_KEY)
    user_message = prompt.format(content=content)

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "you are a helpful assistant."},
            {"role": "user", "content": user_message},
        ],
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
    )

    return chat_completion.choices[0].message.content

In [None]:
cv_file="/content/Ismail-Oubah-EnglishCV.pdf"
cv_markdown = pymupdf4llm.to_markdown(cv_file, page_chunks=True)



Processing /content/Ismail-Oubah-EnglishCV.pdf...


In [None]:
cv_text = " ".join([chunk['text'] for chunk in cv_markdown])

In [None]:
cv_text

'## AI developer\n\n[oubah.ismail.07@gmail.com](mailto:oubah.ismail.07@gmail.com)\n\n[+212628873435](tel:+212628873435)\n\n[https://www.fiverr.com/ismailobh](https://www.fiverr.com/ismailobh)\n\n[github.com/ismailox1000](https://github.com/ismailox1000)\n\nhttps://huggingface.co/obh07\n\n\n### EDUCATION\n\n**Master Degree in Computer Engineering,**\nIstanbul Aydin University\n2022 – 2024 | Istanbul, Turkey\n\n - hesis About "Advanced RAG Multilingual\nSemantic Retrieval across Document Types\nby Finetuning Transformer Based Language\nModels and OCR Integration”.\n\n**Bachelor in Maintenance of Embedded**\n**Electronic Systems, EST**\n2020 – 2021 | Salé, Morocco\n\n**Technical University degree, EST**\n2018 – 2020 | Salé, Morocco\n\n### TECHNICAL SKILLS\n\n**Programming Languages : Python,**\nJavaScript, SQL, HTML, CSS, Bootstrap\n**Data Analysis & Data Science : Pandas,**\nNumPy, Scikit-Learn, Matplotlib, Seaborn,\nOpenCV, Power BI\n**Machine Learning : Régression,**\nClassification, C

In [None]:
test2=Summarize_job_text(cv_text,prompt_cv)

In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.16-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.0-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import JinaEmbeddings, HuggingFaceEmbeddings
from pathlib import Path
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore

def create_vectorstore(model="jina", db_name="job_matching_db", jina_api_key=None):
    """
    Open (creating it if needed) a persistent Chroma vector store.

    No documents are added here; the store is returned empty unless the
    persistence directory already holds data.

    Args:
        model (str): Embedding model to use ('jina' or 'e5').
        db_name (str): Suffix used for the collection name and the
            persistence directory (``./chroma_data_<db_name>``).
        jina_api_key (str, optional): API key for Jina embeddings; required
            when ``model`` is 'jina'.

    Returns:
        Chroma: Initialized vectorstore.

    Raises:
        ValueError: If ``model`` is unknown, or is 'jina' without an API key.
    """
    # Define available embedding models
    embedding_models = {
        "e5": "obh07/multilingual-e5-base-dolly-15k",
        "jina": "jina-embeddings-v3"
    }

    # Validate first so each problem is reported with its own message.
    if model not in embedding_models:
        raise ValueError(f"Invalid model specified: {model!r}. Choose 'e5' or 'jina'.")
    if model == "jina" and not jina_api_key:
        raise ValueError("The 'jina' model requires a valid API key: pass jina_api_key.")

    # Select the embedding function
    if model == "e5":
        embedding_function = HuggingFaceEmbeddings(
            model_name=embedding_models["e5"],
            show_progress=True
        )
    else:
        embedding_function = JinaEmbeddings(
            jina_api_key=jina_api_key,
            model_name=embedding_models["jina"]
        )

    # Set up persistence directory for the vectorstore
    persist_directory = f"./chroma_data_{db_name}"
    Path(persist_directory).mkdir(parents=True, exist_ok=True)

    # Initialize the vectorstore
    vectorstore = Chroma(
        collection_name=f"job_documents_{db_name}",
        embedding_function=embedding_function,
        persist_directory=persist_directory,
    )

    return vectorstore

def create_retriever(vectorstore, k=10, id_key="doc_id"):
    """Wrap ``vectorstore`` in a MultiVectorRetriever backed by a new in-memory docstore.

    Args:
        vectorstore: Vector store that is searched for similar documents.
        k (int): Number of hits requested from the vector store per query.
        id_key (str): Metadata key that links a vector-store document to its
            docstore entry.

    Returns:
        MultiVectorRetriever: Retriever with an empty ``InMemoryStore`` docstore.
    """
    return MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=InMemoryStore(),
        id_key=id_key,
        search_kwargs={"k": k},
    )

In [None]:
jobs_data = pd.read_csv("/content/Jobs_data_new_python.csv")
jobs_data = jobs_data.dropna(subset=["Job_txt", "job-title"]).reset_index(drop=True)
json_data = jobs_data.to_dict(orient="records")


In [None]:
json_data[0]

{'Job_ID': 4041424674,
 'Job_txt': 'Internship for Webmaster (M/F) (Stage pour Webmaster (H/F))\nORIENTAL GROUP / Marrakech Casablanca, Casablanca-Settat, Morocco\n1 month ago 124 applicants\nSee who ORIENTAL GROUP / Marrakech has hired for this role\nApply\nJoin or sign in to find your next job\nJoin to apply for the Internship for Webmaster (M/F) (Stage pour Webmaster (H/F)) role at ORIENTAL GROUP / Marrakech\nNot you?\nRemove photo\nFirst name\nLast name\nEmail\nPassword (6+ characters)\nBy clicking Agree & Join, you agree to the LinkedIn User Agreement, Privacy Policy and Cookie Policy.\nContinue Agree & Join\nor\nApply on company website\nSecurity verification\nAlready on LinkedIn? Sign in\nSave\nSave job\nSave this job with your existing LinkedIn profile, or create a new one.\nYour job seeking activity is only visible to you.\nEmail\nContinue\nWelcome back\nSign in to save Internship for Webmaster (M/F) (Stage pour Webmaster (H/F)) at ORIENTAL GROUP / Marrakech.\nEmail or phone\n

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.11.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl.metadata (2.2 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  Downloading opentelemetry_instrumentation_fastapi-0.50b0-py3-none-any.whl.metadata (2.1 kB)
Collecting pypika>=0.48.9 (from chromadb)


In [None]:
import uuid
from langchain.schema.document import Document



import os
from getpass import getpass

# Credentials must not be stored in the notebook: read the key from the
# environment and fall back to an interactive prompt. The key that used to be
# hard-coded here has been exposed and should be revoked.
jina_api_key = os.environ.get("JINA_API_KEY") or getpass("Jina API key: ")

vectorstore = create_vectorstore(model="jina", db_name="job_matching_3", jina_api_key=jina_api_key)


In [None]:
retriever=create_retriever(vectorstore)

In [None]:
def add_documents_to_retriever(retriever, job_elements=None):
    """
    Index job postings in the retriever's vector store and keep the full
    records in its docstore under the same generated id.

    Elements that are not dicts, or that lack a 'Job_txt' entry, are skipped
    in BOTH stores, so the docstore never holds a record that cannot be
    retrieved.

    Args:
        retriever (MultiVectorRetriever): The retriever object.
        job_elements (list, optional): Job records; each is expected to be a
            dict whose 'Job_txt' value is the text to embed. All other keys
            become document metadata.
    """
    if not job_elements:
        return

    # Use the retriever's own linking key so metadata and docstore stay in sync.
    id_key = getattr(retriever, "id_key", "doc_id")

    job_documents = []
    docstore_entries = []
    for job in job_elements:
        if not (isinstance(job, dict) and "Job_txt" in job):
            continue
        doc_id = str(uuid.uuid4())
        # Everything except the embedded text travels as metadata.
        metadata = {key: value for key, value in job.items() if key != "Job_txt"}
        metadata[id_key] = doc_id
        job_documents.append(Document(page_content=job["Job_txt"], metadata=metadata))
        docstore_entries.append((doc_id, job))

    if not job_documents:
        return  # nothing valid to add; avoid sending empty batches to the stores

    # Add documents to retriever's vectorstore and docstore
    retriever.vectorstore.add_documents(job_documents)
    retriever.docstore.mset(docstore_entries)

In [None]:
len(json_data)

269

In [None]:
from tqdm import tqdm
 # Ensure json_data is defined as a list
json_data_sample = json_data[:150]
add_documents_to_retriever(retriever, job_elements=json_data_sample)

In [None]:
from langchain.embeddings import JinaEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Reuse the key already bound to `jina_api_key` in the vector-store setup cell
# instead of repeating the secret as a second literal.
embedding_model = JinaEmbeddings(
    jina_api_key=jina_api_key,
    model_name="jina-embeddings-v3"
)
cv_embedding = embedding_model.embed_query(test2)

In [None]:
test=json_data[100]['Job_txt']

job_embedding = embedding_model.embed_query(test)
# Compute similarity
similarity_score = cosine_similarity([job_embedding], [cv_embedding])[0][0]

print(f"Similarity Score: {similarity_score:.2f}")

Similarity Score: 0.64


In [None]:
#loop through retriever and calculate similarity to each data in it with the CV
def get_matching_jobs(cv_embedding, retriever):
    """Score every job stored in the retriever's Chroma collection against the CV.

    Args:
        cv_embedding: 1-D sequence of floats, the CV's embedding vector.
        retriever: Object whose ``vectorstore._collection`` is the Chroma
            collection holding the job embeddings.

    Returns:
        list[tuple]: ``(metadata, cosine_similarity)`` pairs, one per stored
        job, in the order the collection returns them (not sorted).
    """
    # Collection.get() returns a dict of parallel lists and leaves embeddings
    # out unless they are requested explicitly.
    stored = retriever.vectorstore._collection.get(include=["embeddings", "metadatas"])
    embeddings = stored.get("embeddings")
    if embeddings is None or len(embeddings) == 0:
        return []
    metadatas = stored.get("metadatas") or [None] * len(embeddings)

    job_matrix = np.asarray(embeddings, dtype=float)
    cv_vector = np.asarray(cv_embedding, dtype=float)
    norms = np.linalg.norm(job_matrix, axis=1) * np.linalg.norm(cv_vector)
    norms[norms == 0] = 1.0  # a zero vector scores 0 instead of dividing by zero
    scores = job_matrix @ cv_vector / norms

    return [(metadata, float(score)) for metadata, score in zip(metadatas, scores)]

In [None]:
retriever.vectorstore._collection.get()

In [None]:
json_data[0]

{'Job_ID': 4041424674,
 'Job_txt': 'Internship for Webmaster (M/F) (Stage pour Webmaster (H/F))\nORIENTAL GROUP / Marrakech Casablanca, Casablanca-Settat, Morocco\n1 month ago 124 applicants\nSee who ORIENTAL GROUP / Marrakech has hired for this role\nApply\nJoin or sign in to find your next job\nJoin to apply for the Internship for Webmaster (M/F) (Stage pour Webmaster (H/F)) role at ORIENTAL GROUP / Marrakech\nNot you?\nRemove photo\nFirst name\nLast name\nEmail\nPassword (6+ characters)\nBy clicking Agree & Join, you agree to the LinkedIn User Agreement, Privacy Policy and Cookie Policy.\nContinue Agree & Join\nor\nApply on company website\nSecurity verification\nAlready on LinkedIn? Sign in\nSave\nSave job\nSave this job with your existing LinkedIn profile, or create a new one.\nYour job seeking activity is only visible to you.\nEmail\nContinue\nWelcome back\nSign in to save Internship for Webmaster (M/F) (Stage pour Webmaster (H/F)) at ORIENTAL GROUP / Marrakech.\nEmail or phone\n

In [None]:
import uuid
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import JinaEmbeddings
from langchain.schema.document import Document
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.prompts import PromptTemplate

# Enhanced CV analysis prompt.
# Rebinds `prompt_cv` (a plain str earlier in this notebook) to a PromptTemplate;
# `{content}` is its only variable. job_matching_pipeline() passes this object to
# Summarize_job_text(), which fills it via prompt.format(content=...).
prompt_cv = PromptTemplate.from_template("""
You are an expert CV analyst. Extract and structure key information from this CV for optimal job matching:

1. **Professional Summary**: [3-sentence career overview with quantifiable achievements]
2. **Core Competencies**:
   - Technical: [Programming languages, frameworks, tools]
   - Soft Skills: [Leadership, communication, project management]
3. **Experience**:
   - For each role:
     - Title: [Job title]
     - Company: [Employer]
     - Duration: [Dates]
     - Achievements: [Quantified impact statements]
4. **Education**: [Degrees, institutions, relevant coursework]
5. **Certifications**: [Technical certifications with issuing bodies]
6. **Key Projects**: [3-5 projects with technologies used and business impact]
7. **Industry Keywords**: [15-20 technical terms and domain-specific concepts]

Format response in Markdown with clear section headers. Focus on concrete, measurable details.

CV Content:
{content}
""")

# Enhanced Vector Store Setup
def create_enhanced_vectorstore(job_data, embedding_model, persist_dir="job_chroma_db"):
    """Embed job postings into a persisted Chroma collection that uses cosine distance.

    Args:
        job_data: Iterable of dict records. 'Job_txt' (the text to embed) and
            'job-title' are required; 'company', 'location', 'industry' and
            'posted-date' are optional and default to "".
        embedding_model: Embeddings object handed to Chroma.
        persist_dir (str): Directory in which Chroma stores the collection.

    Returns:
        Chroma: Vector store populated with one document per job record.
    """
    # NOTE(review): every call assigns fresh random job_ids, so re-running this
    # against the same persist_dir presumably appends duplicates - confirm.
    documents = [
        Document(
            page_content=job["Job_txt"],
            metadata={
                "job_id": str(uuid.uuid4()),
                "title": job["job-title"],
                "company": job.get("company", ""),
                "location": job.get("location", ""),
                "industry": job.get("industry", ""),
                "posted_date": job.get("posted-date", "")
            }
        )
        for job in job_data
    ]

    return Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_dir,
        collection_metadata={"hnsw:space": "cosine"}  # cosine distance for the HNSW index
    )

# Enhanced Retrieval System
def create_retrieval_chain(vectorstore, llm, k=30):
    """Build a similarity retriever whose hits are post-processed by an LLM extractor.

    Args:
        vectorstore: Vector store to search.
        llm: Chat/LLM object used by ``LLMChainExtractor`` to rewrite each hit.
        k (int): Number of documents fetched from the vector store per query.

    Returns:
        ContextualCompressionRetriever: Retriever wrapping the similarity
        search with the LLM-based compressor.
    """
    base_retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k}
    )

    # NOTE(review): the recorded run of this notebook shows summaries that start
    # with "NO_OUTPUT"; the extractor receives a whole CV summary as its query -
    # confirm that an LLM extractor is the right compressor for this use.
    compressor = LLMChainExtractor.from_llm(llm)

    # ContextualCompressionRetriever has no `search_kwargs` field, so the
    # {"k": 20} that used to be passed here had no effect and is dropped.
    # The size of the final list is capped in process_results().
    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=base_retriever,
    )

# Job Matching Pipeline
def job_matching_pipeline(cv_text, retriever, embedding_model):
    """Summarize a CV, retrieve matching jobs and rank them by embedding similarity.

    Args:
        cv_text (str): Raw CV text.
        retriever: Retriever queried (via ``invoke``) with the CV summary.
        embedding_model: Embeddings object providing ``embed_query`` and
            ``embed_documents``; used to score each retrieved job.

    Returns:
        list[dict]: Output of ``process_results`` for the retrieved jobs.
    """
    # Generate the structured CV summary that serves as the search query
    cv_summary = Summarize_job_text(cv_text, prompt_cv)

    results = retriever.invoke(cv_summary)
    if not results:
        return []

    # Score every hit against the CV so that "match_score" carries real
    # information; the retriever itself attaches no score to its documents.
    cv_embedding = embedding_model.embed_query(cv_summary)
    doc_embeddings = embedding_model.embed_documents([doc.page_content for doc in results])
    scores = cosine_similarity([cv_embedding], doc_embeddings)[0]
    for doc, score in zip(results, scores):
        doc.metadata["similarity_score"] = float(score)

    return process_results(results)

def process_results(results, top_n=20, summary_chars=500):
    """Flatten retrieved documents into dicts, best match first.

    Args:
        results: Iterable of documents exposing ``page_content`` (str) and
            ``metadata`` (dict).
        top_n (int): Maximum number of entries returned.
        summary_chars (int): Maximum summary length before truncation.

    Returns:
        list[dict]: Entries with keys title, company, location, match_score,
        summary and job_id, sorted by match_score in descending order.
        Missing text metadata defaults to "" and a missing score to 0.
    """
    processed = []
    for doc in results:
        metadata = doc.metadata
        content = doc.page_content
        # Add the ellipsis only when the text was actually cut.
        if len(content) > summary_chars:
            summary = content[:summary_chars] + "..."
        else:
            summary = content
        processed.append({
            "title": metadata.get("title", ""),
            "company": metadata.get("company", ""),
            "location": metadata.get("location", ""),
            "match_score": metadata.get("similarity_score", 0),
            "summary": summary,
            "job_id": metadata.get("job_id", "")
        })
    return sorted(processed, key=lambda item: item["match_score"], reverse=True)[:top_n]



In [None]:
# Usage Example

    # Initialize models
# Reuse the key already bound to `jina_api_key` in the vector-store setup cell
# instead of repeating the secret as another literal.
embedding_model = JinaEmbeddings(
    jina_api_key=jina_api_key,
    model_name="jina-embeddings-v3"
)

# Load data
# NOTE(review): a bare dropna() discards rows with a missing value in ANY column,
# whereas the earlier load only required Job_txt and job-title - confirm intended.
jobs_df = pd.read_csv("/content/Jobs_data_new_python.csv").dropna()

# Create vectorstore
vectorstore = create_enhanced_vectorstore(
    jobs_df.to_dict("records"),
    embedding_model
)



In [None]:
!pip install langchain-groq

In [None]:
!pip install langchain-community

In [None]:
from langchain_groq import ChatGroq
# Initialize Groq LLM
# Reuse the GROQ_API_KEY bound in the earlier Groq client cell rather than
# embedding the secret in the notebook a second time.
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model_name="llama3-8b-8192",
    temperature=0.3,
    max_tokens=4000
)

In [None]:
    # Create retrieval chain
retriever = create_retrieval_chain(vectorstore, llm)

# Process CV
recommendations = job_matching_pipeline(cv_text, retriever, embedding_model)

In [None]:
recommendations

[{'title': 'C# Developer',
  'company': 'YO IT CONSULTING',
  'location': 'Morocco',
  'match_score': 0,
  'summary': 'NO_OUTPUT....',
  'job_id': 'fb132617-c0bc-4a81-887f-dc4a18d87e09'},
 {'title': 'Python Full Stack Developer',
  'company': 'YO IT CONSULTING',
  'location': 'Morocco',
  'match_score': 0,
  'summary': 'NO_OUTPUT\n\nThe provided context does not contain any relevant information to answer the question. The question appears to be asking about a job posting for a Python Full Stack Developer position, but the provided context is a CV of an individual with expertise in AI development, machine learning, and natural language processing....',
  'job_id': '19123c92-60b2-44df-a057-8495ce13b8ac'},
 {'title': 'Développeur Backend - PHP',
  'company': 'Elavi Agency',
  'location': 'Casablanca, Casablanca-Settat, Morocco',
  'match_score': 0,
  'summary': 'NO_OUTPUT\n\nThe provided context does not seem to be relevant to the question. The question is about a CV, and the context appe