# 1. Library Imports

In [None]:

import os
import json
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document



# 2. Define paths

In [None]:
# Every path below is relative to the project folder, so the notebook must be
# run from the directory that contains it.
PROJECT_ROOT = "Fake_Job_Posting_Detection"

# Input: the raw CSV of job postings.
RAW_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "fake_job_postings.csv")

# Outputs: the RAG store folder and the two artifacts written inside it.
RAG_STORE_PATH = os.path.join(PROJECT_ROOT, "rag_store")
FAISS_INDEX_PATH, EMBEDDING_CONFIG_PATH = (
    os.path.join(RAG_STORE_PATH, leaf)
    for leaf in ("faiss_index", "embedding_config.json")
)

# Create the output folder up front; exist_ok keeps the cell re-runnable.
os.makedirs(RAG_STORE_PATH, exist_ok=True)


# 3. Load RAW dataset

In [None]:
# Read the raw postings CSV into memory.
df_raw = pd.read_csv(RAW_DATA_PATH)

# Columns the later cells read: the four text fields used to build each
# document, plus the label used to keep only legitimate postings.
required_cols = [
    "title", "company_profile", "description", "requirements", "fraudulent"
]

# Fail early, and name the offending columns, if the CSV schema is not the
# expected one.
missing_cols = [col for col in required_cols if col not in df_raw.columns]
assert not missing_cols, f"Raw dataset is missing required columns: {missing_cols}"

print(f"Raw dataset loaded: {df_raw.shape}")


Raw dataset loaded: (17880, 18)


# 4. Filter legitimate job postings ONLY
**Design rule:**
RAG explanations must be grounded only in verified legitimate (non-fraudulent) job postings, so fraudulent rows are excluded before the index is built.

In [None]:
# Upper bound on how many legitimate postings are indexed in the RAG store.
N_REAL_JOBS = 300

# Keep only postings labelled as legitimate (fraudulent == 0).
legit_jobs = df_raw[df_raw["fraudulent"] == 0]

# DataFrame.sample raises when asked for more rows than exist (without
# replacement), so cap the request at the number of legitimate postings.
# The fixed random_state keeps the selection reproducible across runs.
real_jobs_df = (
    legit_jobs
    .sample(n=min(N_REAL_JOBS, len(legit_jobs)), random_state=42)
    .reset_index(drop=True)
)

print(f"Real jobs selected: {len(real_jobs_df)}")


Real jobs selected: 300


# 5. Hugging Face embedding model

In [None]:
# Hugging Face model identifier; the same name is stored in the embedding
# config written later in the notebook.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# NOTE(review): the captured cell output echoes this call, which looks like a
# warning being raised here — presumably a deprecation notice; confirm.
embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("Embedding model configured: " + EMBEDDING_MODEL_NAME)



  embedding_function = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model configured: sentence-transformers/all-MiniLM-L6-v2


# 6. Document construction

In [None]:
def build_job_document(row):
    """Render one job posting as a single labelled text passage for embedding.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``title``, ``company_profile``, ``description`` and
            ``requirements``.

    Returns:
        str: One ``"<Label>: <value>."`` sentence per populated field, joined
        by single spaces, in the order title, company profile, job
        description, requirements. Fields whose value is missing
        (``None`` or NaN) are omitted.

    Raises:
        KeyError: If ``row`` lacks one of the four keys.
    """
    sections = (
        ("Title", "title"),
        ("Company Profile", "company_profile"),
        ("Job Description", "description"),
        ("Requirements", "requirements"),
    )

    parts = []
    for label, key in sections:
        value = row[key]
        # Formatting a missing value directly would put the literal word
        # "nan"/"None" into the text that gets embedded, so skip the field.
        if value is None or pd.isna(value):
            continue
        parts.append(f"{label}: {value}.")

    return " ".join(parts)


# 7. Create LangChain documents

In [None]:
# One LangChain Document per selected posting: the combined text is the
# content that gets embedded, and the title is kept as metadata.
documents = []
for _, job in real_jobs_df.iterrows():
    doc = Document(
        page_content=build_job_document(job),
        metadata={"title": job["title"]},
    )
    documents.append(doc)

print(f"Documents created: {len(documents)}")


Documents created: 300


# 8. Build & persist FAISS index

In [None]:
# Embed every document with the configured model and build the vector store.
faiss_store = FAISS.from_documents(
    documents=documents,
    embedding=embedding_function,
)

# Write the index under FAISS_INDEX_PATH so it can be loaded without re-embedding.
faiss_store.save_local(folder_path=FAISS_INDEX_PATH)

print("FAISS index saved successfully")


FAISS index saved successfully


# 9. Save embedding metadata

In [None]:
# Provenance record stored next to the index: which model produced the
# vectors, how many documents were embedded, and where they came from.
embedding_config = {
    "embedding_model": EMBEDDING_MODEL_NAME,
    "num_documents": len(documents),
    # Derived from RAW_DATA_PATH so the recorded source always matches the
    # file that was actually read (the previous hard-coded value named a
    # file that does not match that path).
    "data_source": os.path.basename(RAW_DATA_PATH),
    "document_policy": "real_jobs_only"
}

# Explicit encoding keeps the written file independent of the platform default.
with open(EMBEDDING_CONFIG_PATH, "w", encoding="utf-8") as f:
    json.dump(embedding_config, f, indent=4)

print("Embedding configuration saved")


Embedding configuration saved


In [None]:
# Final sanity check: both artifacts written above must exist on disk.
for artifact_path in (FAISS_INDEX_PATH, EMBEDDING_CONFIG_PATH):
    assert os.path.exists(artifact_path)

print("RAG artifacts built successfully")

RAG artifacts built successfully
