# Implementation of a pipeline converting relational models to vector embeddings in a vector database

## Imports

In [None]:
!pip install chromadb langchain llama-index

In [None]:
# File-format parsers required for reading the Word (.docx) and PDF (.pdf) files
# that are loaded into the vector store later in this notebook.
!pip install docx2txt
!pip install pypdf

In [None]:
from dataclasses import dataclass, asdict
import uuid
import datetime
import random
import json

import os
import glob
from google.colab import drive

import chromadb
# from sentence_transformers import SentenceTransformer

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, load_index_from_storage
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.embeddings.langchain import LangchainEmbedding

from llama_index.embeddings import HuggingFaceEmbedding
from IPython.display import Markdown, display


In [None]:
# Mount Google Drive at /content/drive; the dataset paths and the vector DB path
# used in the cells below all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## Utilities

### Models

In [None]:
# Simple model classes for a company/organization
@dataclass
class Company:
  """ Basic profile information for a single company/organization. """
  # Unique identifier (the synthesis cell uses a uuid4 hex string)
  id: str
  # Display name of the company
  name: str
  # Founding date as a string (the sample data uses "YYYY-MM-DD")
  date_founded: str
  # Free-text mission statement
  mission_statement: str
  # Free-text vision statement
  vision: str
  # Free-text description of the company culture
  company_culture_statement: str
  # Free-text address / location
  address: str

@dataclass
class JobDescription:
  """ A job description document defined for a specific company. """
  # Unique identifier of this job description
  id: str
  # Id of the owning Company. Not required for this model; a SaaS/PaaS
  # (multi-company) solution may require it.
  company_id: str
  # Human-readable job title
  job_title: str
  # Path to the file that holds the job description text
  job_description_file: str

@dataclass
class JobPost:
  """ A job posting that advertises a position based on a job description. """
  # Unique identifier of this job post
  id: str
  # Creation date as a string
  created_date: str
  # Title shown for the post
  title: str
  # Id of the JobDescription this post is based on
  job_description_id: str
  # Whether the post is currently active
  active: bool
  # Salary range as display text, e.g. "$60,000 - $80,000"
  salary_range: str


@dataclass
class JobApplication:
  """ A candidate's application (with resume) submitted against a job post. """
  # Unique identifier of this application
  id: str
  # Candidate's full name
  candidate_name: str
  # Candidate's contact email
  candidate_email: str
  # Creation date as a string
  created_date: str
  # Id of the JobPost the candidate applied to
  job_post_id: str
  # Path to the candidate's resume file
  resume_link: str
  # Whether the application is currently active
  active: bool


### Functions

In [None]:
# Default date window used when generating random dates for synthesised records
env_start_date = datetime.datetime(year=2023, month=1, day=1)
env_end_date = datetime.datetime(year=2023, month=12, day=31)


def generate_random_date(start_date, end_date):
  """
  Generate a random date between start_date and end_date.

  The result is start_date plus a whole number of days drawn uniformly from
  [0, (end_date - start_date).days), so end_date itself is never returned
  unless the range spans zero days.

  Args:
      start_date (datetime.datetime): Inclusive lower bound.
      end_date (datetime.datetime): Exclusive upper bound.

  Returns:
      str: The chosen date formatted as "YYYY-MM-DD".

  Raises:
      ValueError: If end_date is earlier than start_date.
  """
  span_days = (end_date - start_date).days
  if span_days < 0:
    raise ValueError("end_date must not be earlier than start_date")

  # random.randrange(0) raises, so a range shorter than one day maps to start_date itself
  offset_days = random.randrange(span_days) if span_days > 0 else 0

  random_date = start_date + datetime.timedelta(days=offset_days)
  return random_date.strftime("%Y-%m-%d")


def get_files(file_path, file_type = "*.pdf"):
    """
    List the entries directly inside a directory that match a glob pattern.

    Args:
        file_path (str): The directory path to search.
        file_type (str): Glob pattern matched inside the directory
            (defaults to "*.pdf").

    Returns:
        list: Paths of all matching entries, or an empty list when
            file_path does not exist.
    """
    if not os.path.exists(file_path):
        return []

    pattern = os.path.join(file_path, file_type)
    return glob.glob(pattern)

def extract_filename(filepath):
    """
    Return the bare file name from a '/'-separated path.

    The last path segment is taken and everything from its first '.'
    onwards is dropped, e.g. "/a/b/java_developer.docx" -> "java_developer".
    """
    last_segment = filepath.split('/')[-1]

    # Keeping only the text before the first dot also covers names without a dot
    return last_segment.split('.')[0]

### Data Synthesis: Deprecated (New datasource from data in DB)


In [None]:
# Configuration for the synthesised data set.
#
# This cell defines:
#   - the directories holding the job description and resume files
#   - one company profile
#   - the salary ranges that job posts are sampled from
#   - lookup tables from file name (without extension) to job title / applicant name
#
# The next cell derives the job descriptions, job posts and job applications from
# the files found in these directories, so the number of records depends on the
# directory contents.

# Path to docx documents containing job descriptions
job_description_path="/content/drive/MyDrive/LLM_Training/dataset/job_description/test_case_1"

# Path to pdf documents containing applicant resumes
application_resume_path="/content/drive/MyDrive/LLM_Training/dataset/resume/test_case_1"

# The single company profile; the id is a fresh random uuid on every run
company = Company(
    id=uuid.uuid4().hex,
    name="Company One",
    date_founded="1998-09-04",
    mission_statement="To organize the world's information and make it universally accessible and useful",
    vision="To provide access to the world's information in one click",
    company_culture_statement="A problem isn't truly solved until it's solved for all. Googlers build products that help create opportunities for everyone, whether down the street or across the globe. Bring your insight, imagination and a healthy disregard for the impossible. Bring everything that makes you unique. Together, we can build for everyone",
    address="Mountain View, California, USA",
)

# Display strings a job post's salary_range is picked from
salary_ranges = ["$60,000 - $80,000", "$80,000 - $100,000", "$100,000 - $120,000", "$120,000 - $140,000", "$140,000+"]

# Job description file name (without extension) -> job title
# NOTE(review): "senior_management_accountant" maps to "... Consultant" - confirm this is intended
job_title_map = {
    "java_developer": "Java Developer",
    "senior_devops_engineer": "Senior Devops Engineer",
    "senior_management_accountant": "Senior Management Consultant",
}

# Resume file name (without extension) -> applicant name
resume_applicant_map = {
    "accountant_1": "David Doe",
    "it_professional_1": "John Black",
    "oluwatobi_alao": "Oluwatobi Alao",
}

In [None]:
# Generate Job Descriptions, Job Posts and Job Applications from the files on disk

job_description_files = get_files(job_description_path, "*.docx")

dep_job_descriptions = []
dep_job_posts = []

for index, jd in enumerate(job_description_files):
  # .get() so that a file without a mapping falls back to a generated title
  # instead of aborting the cell with a KeyError
  job_title = job_title_map.get(extract_filename(jd)) or f"Job {index}"

  job_description = JobDescription(
      id=uuid.uuid4().hex,
      company_id=company.id,
      job_title=job_title,
      job_description_file=jd)

  job_post = JobPost(
      # ids are declared as str on the dataclasses; the enumeration index is
      # used so that applications below can reference posts by position
      id=str(index),
      created_date=generate_random_date(env_start_date, env_end_date),
      job_description_id=job_description.id,
      title=job_title,
      active=True,
      salary_range=random.choice(salary_ranges)
      )

  dep_job_descriptions.append(job_description)
  dep_job_posts.append(job_post)


resume_files = get_files(application_resume_path, "*.pdf")
dep_job_applications = []
for index, resume in enumerate(resume_files):
  dep_job_applications.append(
      JobApplication(
          id=uuid.uuid4().hex,
          created_date=generate_random_date(env_start_date, env_end_date),
          # Unknown resumes get a placeholder name rather than raising KeyError
          candidate_name=resume_applicant_map.get(extract_filename(resume), f"Candidate {index}"),
          candidate_email=f"candidate_{index}@test.com",
          # Pairs the n-th resume with the n-th job post (same str form as JobPost.id)
          job_post_id=str(index),
          resume_link=resume,
          # `active` has no default on JobApplication, so it must be supplied
          active=True
      )
  )

In [None]:
# Save synthesised data to json file

# data = {
#     "company": asdict(company),
#     "job_descriptions": list(map(lambda x: asdict(x), job_descriptions)),
#     "job_posts": list(map(lambda x: asdict(x), job_posts)),
#     "job_applications": list(map(lambda x: asdict(x), job_applications))
# }

# with open('/content/drive/MyDrive/LLM_Training/dataset/company-data.json', 'w') as f:
  # json.dump(data, f, indent=2)
  # f.close()



## Data Operations: Vectorization and Persistence

### Vector DB: Utilities

In [None]:
def get_collection_and_vector_store(db, collection_name: str) -> tuple:
  """
  Fetch (creating it if needed) a collection and wrap it in a ChromaVectorStore.

  Args:
      db: Client object exposing get_or_create_collection().
      collection_name (str): Name of the collection to fetch or create.

  Returns:
      tuple: (collection, vector store built on that collection).
  """
  chroma_collection = db.get_or_create_collection(collection_name)
  return chroma_collection, ChromaVectorStore(chroma_collection=chroma_collection)

def add_document_to_vector_store(store_index: VectorStoreIndex, file_path: str, entity_ref: str, ref_name: str):
  """
  Load a file, tag its content with reference metadata and insert it into an index.

  Every document the reader produces for the file is inserted, not just the
  first one, so no part of the file is silently dropped.

  Args:
      store_index (VectorStoreIndex): Index the document(s) are inserted into.
      file_path (str): Path of the file to load.
      entity_ref (str): Stored in the 'parent_obj_ref' metadata field.
      ref_name (str): Stored in the 'ref_name' metadata field.

  Raises:
      ValueError: If the reader returns no documents for file_path.
  """
  documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
  if not documents:
    raise ValueError(f"No readable content found in {file_path}")

  # A single file can come back as several documents - presumably one per page
  # for PDFs (confirm against the reader in use) - so tag and insert all of them.
  for document in documents:
    document.metadata['parent_obj_ref'] = entity_ref
    document.metadata['ref_name'] = ref_name
    store_index.insert(document)

def clean_vector_db(vector_db_index, collection):
  """
  Delete every referenced document found in a collection from the index.

  The 'ref_doc_id' of each stored entry's metadata is collected and
  delete_ref_doc() is called once per distinct id, in first-seen order.
  Entries without metadata or without a 'ref_doc_id' are skipped, because
  they cannot be addressed through delete_ref_doc().

  Args:
      vector_db_index: Index object exposing delete_ref_doc(ref_doc_id).
      collection: Collection object whose get() returns a dict with a
          "metadatas" list.
  """
  metadatas = collection.get()["metadatas"] or []

  # dict.fromkeys de-duplicates while preserving order; several stored entries
  # can carry the same ref_doc_id and one delete call per id is enough.
  ref_doc_ids = dict.fromkeys(
      metadata['ref_doc_id']
      for metadata in metadatas
      if metadata and 'ref_doc_id' in metadata
  )

  for ref_doc_id in ref_doc_ids:
    vector_db_index.delete_ref_doc(ref_doc_id)

def get_index(vector_store, service_context):
  """
  Build a VectorStoreIndex on top of an existing vector store.

  Args:
      vector_store: Vector store the index reads from and writes to.
      service_context: Service context passed through to the index.

  Returns:
      The index created by VectorStoreIndex.from_vector_store().
  """
  index = VectorStoreIndex.from_vector_store(
      vector_store,
      service_context=service_context,
  )
  return index

def showCollectionData(collection):
  """Print the list of 'ref_doc_id' values of all entries stored in a collection."""
  stored_metadata = collection.get()["metadatas"]
  ref_doc_ids = [entry['ref_doc_id'] for entry in stored_metadata]
  print(ref_doc_ids)

### Vector DB Initialization

In [None]:
# os.environ["OPENAI_API_KEY"] = "random"
# del os.environ["OPENAI_API_KEY"]

In [None]:
# Chroma DB client initialization.
# Sets up, in order: the persistent Chroma client, the embedding model, the
# service context, one collection + vector store per document type, and one
# index per vector store. Later cells use jd_index / resume_index and the
# jd_collection / resume_collection handles created here.

vector_db_path="/content/drive/MyDrive/LLM_Training/vector_db"

# initialize client, setting path to save data
db = chromadb.PersistentClient(path=vector_db_path)


# define embedding function
embed_model = HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2')

# llm=None disables the LLM (the cell output reports "Using MockLLM");
# only the embedding model is used here
service_context = ServiceContext.from_defaults(
    embed_model=embed_model, llm = None
)

# create/fetch one chroma collection per document type and wrap each in a vector store
jd_collection, jd_vector_store = get_collection_and_vector_store(db, "job_description")
resume_collection, resume_vector_store = get_collection_and_vector_store(db, "resume")

# NOTE(review): storage_context is not referenced by any later code in this
# notebook; the indexes below are built directly from the vector stores.
storage_context = StorageContext.from_defaults(vector_stores={
    'job_description': jd_vector_store,
    'resume': resume_vector_store
})

# Earlier single-index variant, kept for reference:
# index = VectorStoreIndex.from_vector_store(
#     storage_context=storage_context,
#     service_context=service_context
# )

# build one index per vector store from the vectors already persisted in it
jd_index = get_index(jd_vector_store, service_context)
resume_index = get_index(resume_vector_store, service_context)

LLM is explicitly disabled. Using MockLLM.


In [None]:
# Print the ref_doc_id of every entry currently stored in each collection
showCollectionData(jd_collection)
showCollectionData(resume_collection)

# Uncomment to delete all referenced documents of the job description collection:
# clean_vector_db(jd_index, jd_collection)

### Data Entry Simulation (Job Description and Job Application)

In [None]:

# Job description records as they would come from the relational database
job_descriptions = [
    {
      "id": "69453f3a-1df8-4520-9eea-8a98172593ad",
      "company_id": "845bfe0e-538d-4c38-897e-3b30b0d3458b",
      "job_title": "Senior Management Consultant",
      "job_description_file": "/content/drive/MyDrive/LLM_Training/dataset/job_description/test_case_1/senior_management_accountant.docx",
      "created_date": "2023-12-05T12:55:55.789+00:00"
    },
    {
      "id": "a52fd3b9-e6e8-4956-b716-df8fe9e307ca",
      "company_id": "845bfe0e-538d-4c38-897e-3b30b0d3458b",
      "job_title": "Senior Devops Engineer",
      "job_description_file": "/content/drive/MyDrive/LLM_Training/dataset/job_description/test_case_1/senior_devops_engineer.docx",
      "created_date": "2023-12-05T12:56:30.309+00:00"
    },
    {
      "id": "cbb45246-6225-42fb-910d-cba3786fc164",
      "company_id": "845bfe0e-538d-4c38-897e-3b30b0d3458b",
      "job_title": "Java Developer",
      "job_description_file": "/content/drive/MyDrive/LLM_Training/dataset/job_description/test_case_1/java_developer.docx",
      "created_date": "2023-12-05T12:56:55.629+00:00"
    }
]

# Add job description documents to the vector store.
# The collection is persistent, so re-running this cell would otherwise embed the
# same files again; records whose id is already stored as 'parent_obj_ref' are
# skipped. Run clean_vector_db() first to force a re-import.
for jd in job_descriptions:
  already_stored = jd_collection.get(where={"parent_obj_ref": jd['id']}, limit=1)["ids"]
  if already_stored:
    continue

  add_document_to_vector_store(
      jd_index,
      file_path=jd['job_description_file'],
      ref_name=jd['job_title'],
      entity_ref=jd['id'])


In [None]:

# Job application records as they would come from the relational database
job_applications = [
    {
      "id": "516273bb-ec25-4cf2-8fb6-582cca99d9c9",
      "candidate_email": "candidate_0@test.com",
      "candidate_name": "David Doe",
      "job_post_id": "065a1780-b015-496d-8cc1-5bb98a91f987",
      "job_post_title": "Senior Management Consultant",
      "active": True,
      "created_date": "2023-10-23T00:00:00.000+00:00",
      "resume_link": "/content/drive/MyDrive/LLM_Training/dataset/resume/test_case_1/accountant_1.pdf"
    },
    {
      "id": "c8d6d3ee-da5d-4d57-a3c6-18f5ca20737c",
      "candidate_email": "candidate_2@test.com",
      "candidate_name": "Oluwatobi Alao",
      "job_post_id": "897e04c5-42d7-4d4b-840a-335b1bf3a4e5",
      "job_post_title": "Java Developer",
      "active": True,
      "created_date": "2023-08-22T00:00:00.000+00:00",
      "resume_link": "/content/drive/MyDrive/LLM_Training/dataset/resume/test_case_1/oluwatobi_alao.pdf"
    }
]

# Add applicant resume documents to the vector store.
# The collection is persistent, so re-running this cell would otherwise embed the
# same resumes again; applications whose id is already stored as 'parent_obj_ref'
# are skipped. Run clean_vector_db() first to force a re-import.
for application in job_applications:
  already_stored = resume_collection.get(where={"parent_obj_ref": application['id']}, limit=1)["ids"]
  if already_stored:
    continue

  add_document_to_vector_store(
      resume_index,
      file_path=application['resume_link'],
      ref_name=f"{application['job_post_id']}_{application['candidate_name']}",
      entity_ref=application['id'])