# 1. Setup, Library Installation, and Data Preparation
We load the necessary libraries and segment the dataset to create our knowledge base of real jobs.

In [None]:
!pip install -U langchain langchain-community sentence-transformers faiss-cpu -qq


In [None]:


import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document


In [None]:
# 1. Load the raw dataset
# Missing values are replaced with empty strings right after loading
# (presumably so the text columns can be concatenated later without "nan" in the output).
df = pd.read_csv(
    '/content/drive/MyDrive/Fake_Job_Posting_Detection/data/raw/fake_job_postings.csv'
).fillna('')
df.head(3)

In [None]:
# 2. Setup Embedding Model
# The same model name is used for both objects created below.
model_name = 'all-MiniLM-L6-v2'
# Direct sentence-transformers handle on the model.
# NOTE(review): `model` is not referenced by the other cells visible in this notebook — confirm it is still needed.
model = SentenceTransformer(model_name)
# LangChain embedding wrapper; this is the object handed to FAISS when the vector store is built.
embedding_function = SentenceTransformerEmbeddings(model_name=model_name)

print(f"Sentence Transformer Model ({model_name}) loaded.")

In [None]:
# 3. Create the Knowledge Base Source (Real Job Postings)
# Keep only the rows labelled as real postings (fraudulent == 0), then draw a
# seeded sample of 300 of them so the knowledge base stays small and repeatable.
real_jobs_df = df.loc[df['fraudulent'].eq(0)].sample(n=300, random_state=42)

print(f"Knowledge Base Source created with {len(real_jobs_df)} verified real job postings.")

# 2. Build the Vector Store (FAISS)
We embed the text of the real job postings and store the embeddings in FAISS for fast retrieval.

In [None]:
# Embed Real Postings and Build FAISS Vector Store

# Function to combine relevant text fields for the knowledge base
def combine_job_text(row):
    """Merge the main text fields of one job posting into a single string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'title', 'company_profile', 'description' and 'requirements'.

    Returns:
        One string with the four fields labelled and concatenated in that order.
    """
    title = row['title']
    company = row['company_profile']
    description = row['description']
    requirements = row['requirements']
    labelled_parts = [
        f"Title: {title}. ",
        f"Company: {company}. ",
        f"Description: {description}. ",
        f"Requirements: {requirements}",
    ]
    return "".join(labelled_parts)

# 1. Prepare Documents
# One Document per sampled real posting: the combined text is the searchable
# content, while job_id and title are carried along as metadata.
real_job_documents = [
    Document(
        page_content=combine_job_text(job_row),
        metadata={"job_id": job_row['job_id'], "title": job_row['title']},
    )
    for _, job_row in real_jobs_df.iterrows()
]



In [None]:
# 2. Create FAISS Vector Store
# This process embeds the text content of all 300 real jobs.
# The prepared Document list is passed together with `embedding_function`;
# the resulting store (`real_job_kb`) is what the retrieval function queries
# through `similarity_search`.
real_job_kb = FAISS.from_documents(real_job_documents, embedding_function)
print(f"FAISS Vector Store built with embeddings from {len(real_job_documents)} real job postings.")

# 3. Implement the Advanced Retrieval Function
This function retrieves the most similar real job postings and uses them to explain why a suspicious post differs from the norm.

In [None]:
#  Retrieval and Generation Function (Similarity-Based Explanation)

def retrieve_and_explain_similarity(suspicious_job_text, real_job_kb, top_k=3):
    """Retrieve similar real jobs and build a contrastive (simulated) explanation.

    Args:
        suspicious_job_text: Text of the posting under review; used as the
            similarity-search query.
        real_job_kb: Vector store exposing ``similarity_search(query, k=...)``.
            Each returned item must provide ``page_content`` and
            ``metadata['title']``.
        top_k: Maximum number of real postings to retrieve.

    Returns:
        A Markdown-formatted string: a fixed, templated explanation with the
        retrieved examples embedded as context. No LLM is called; the
        explanation text is simulated.
    """

    # 1. Retrieval: Find top_k most similar (legitimate) job postings
    retrieved_docs = real_job_kb.similarity_search(suspicious_job_text, k=top_k)

    # 2. Construct the Prompt (Simulated LLM)
    context_list = []
    for i, doc in enumerate(retrieved_docs):
        # Limit content length for cleaner output
        content_snippet = doc.page_content[:150].replace('\n', ' ') + '...'
        context_list.append(f"Example {i+1} (Title: {doc.metadata['title']}): {content_snippet}")

    context_str = "\n".join(context_list)

    # 3. Generation (Simulated for this project)
    # The actual LLM prompt would ask for a contrastive explanation.
    # The count in the heading reflects how many examples were really
    # retrieved, so it stays correct when top_k != 3 or the store is small.
    simulated_explanation = (
        "**Conclusion: The job is suspicious because it deviates significantly "
        "from typical, real job postings for similar roles.**\n\n"
        f"**Analysis based on top {len(retrieved_docs)} closest legitimate examples:**\n"
        "The following real job postings were retrieved as highly similar to the suspicious posting:\n\n"
        f"{context_str}\n\n"
        "**Key Differences (LLM Explanation):**\n"
        "While the retrieved examples feature detailed company profiles, specific contact details, "
        "and balanced language, the suspicious posting is likely missing one or more of these elements. "
        "This type of explanation confirms the suspicious nature by showing it is an 'outlier' "
        "compared to verified postings."
    )

    return simulated_explanation

# 4. Demonstration with a New Posting
We demonstrate this powerful RAG system using a known fake job.

In [None]:
#  RAG Demonstration (Using a Known Fake Job)

# Take the first posting labelled fraudulent as the demo input
fake_post = df.loc[df['fraudulent'].eq(1)].iloc[0]

# Build the query text with the same field layout used for the knowledge base
job_text_query = combine_job_text(fake_post)

print("--- FAKE JOB POSTING (Actual Label: 1) ---")
print(
    f"Title: {fake_post['title']}",
    f"Description Snippet: {fake_post['description'][:200]}...",
    sep="\n",
)
print("\n" + "=" * 50 + "\n")

# Retrieve similar real postings and produce the contrastive explanation
explanation = retrieve_and_explain_similarity(job_text_query, real_job_kb)

print("--- RAG-GENERATED EXPLANATION (Similarity-Based) ---")
print(explanation)

# 5. Pushing to GitHub

In [None]:
# Change the notebook's working directory to the project folder so the
# shell commands in the following cells run from there.
%cd /content/drive/MyDrive/Fake_Job_Posting_Detection


In [None]:
# List every entry in the current directory, including hidden ones.
!ls -a


In [None]:
# Show the working-tree state (staged, modified and untracked files) before committing.
!git status


In [None]:
# Recursively list the directory tree under the current directory.
!ls -R


In [None]:
# Stage only this notebook file for the next commit.
!git add notebooks/04_RAG_Implementation.ipynb

In [None]:
# Set the author identity git records on commits (written to the global git config).
# NOTE(review): a personal e-mail address is hard-coded here and will be published with the notebook — confirm this is intended.
!git config --global user.email "muhammadriaz8685@gmail.com"
!git config --global user.name "mriaz72"

In [None]:
# Commit the staged changes with a message describing this notebook's work.
!git commit -m " Implemented advanced RAG using real job postings as context for similarity-based explanations."

In [None]:
# Push the current branch to its configured upstream remote.
!git push

In [None]:
!git push --force origin main