#### Resume Scanner (Local only)

In [None]:
import os
import shutil
import torch
import langchain
import transformers

from InstructorEmbedding import INSTRUCTOR

from langchain import hub

from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPowerPointLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

from langchain.prompts import PromptTemplate

from langchain.vectorstores import Chroma

from chromadb.config import Settings

from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import GenerationConfig, pipeline
from transformers import AutoModelForQuestionAnswering

from torch import mps, cuda, bfloat16

from IPython.display import display, Markdown

In [None]:
# Cleanup guidance after occasional experiments -- last checked 11/24/2023
#
# NOTE(review): the check below reports the names from `imported_modules` that
# are absent from the current namespace (`dir()`), i.e. imports that were
# removed or whose import cell has not been run. A name that is imported but
# never referenced afterwards is NOT reported, so an empty list does not prove
# that every import is actually used.

imported_modules = ['os', 'shutil', 'torch', 'langchain', 'transformers', 'INSTRUCTOR', 'hub', 'CSVLoader', 'PDFMinerLoader', 'TextLoader', 'UnstructuredExcelLoader', 'Docx2txtLoader', 'UnstructuredFileLoader', 'UnstructuredMarkdownLoader', 'PyPDFLoader', 'UnstructuredPowerPointLoader', 'RecursiveCharacterTextSplitter', 'HuggingFaceInstructEmbeddings', 'HuggingFacePipeline', 'load_qa_chain', 'RetrievalQA', 'PromptTemplate', 'Chroma', 'Settings', 'AutoConfig', 'AutoModelForSeq2SeqLM', 'AutoModelForCausalLM', 'AutoTokenizer', 'GenerationConfig', 'pipeline', 'AutoModelForQuestionAnswering', 'mps', 'cuda', 'bfloat16', 'display', 'Markdown']

# Snapshot of every name currently defined in this scope
all_names = dir()

# Names listed above that are not defined in the namespace
unused_modules = [module for module in imported_modules if module not in all_names]

print("Unused modules/functions:", unused_modules)


In [None]:
# Constants

# Local folder and database
# NOTE(review): absolute, machine-specific path; every other project folder below
# is derived from it, so this is the one value to edit on another machine.
ROOT_DIRECTORY = "C:/Users/nilan/Documents/GenAI/GitHub/MyProjects/Doc-Reader"
# Document folder: the resume file named by the user is looked up here
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"
# Database folder: Chroma persists its collection here
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"

# Output directory and file that the generated summaries are appended to
SUMMARY_DIRECTORY = f"{ROOT_DIRECTORY}/SUMMARIES"
SUMMARY_DOCUMENT = "summaries.txt"

# Local models: parent folder that holds one sub-folder per downloaded model
MODELS_DIRECTORY = "C:/Users/nilan/Documents/GenAI/Tools/text-generation/text-generation-webui-310/models"

# Context Window and Max New Tokens
# The context window of a large language model (LLM) is the range of tokens the model can consider when generating
# a response to a prompt. It determines how far back in the input sequence the model looks to understand context
# and make predictions. LLMs with larger context windows can consider more preceding tokens, which helps
# with tasks that require long-range dependencies or understanding complex contexts.
CONTEXT_WINDOW_SIZE = 4096

# MAX_NEW_TOKENS specifies the maximum number of tokens to generate in the output sequence, ignoring the number of tokens in
# the input prompt. In other words, it determines the additional tokens beyond the prompt that the model generates.
# For example, if you set max_new_tokens to 50, the model will generate up to 50 new tokens after the input prompt,
# regardless of how long the prompt itself is.
# NOTE(review): set equal to the whole context window, which leaves no room for the prompt; the commented
# alternative (a quarter of the window) looks like the safer value -- confirm before using this constant.
MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE  # int(CONTEXT_WINDOW_SIZE/4)

## In case of a "not enough space in the buffer" error, reduce the values below.
## Start with half of the original values and keep halving the value until the error stops appearing.
# The n_gpu_layers option in llama.cpp specifies the number of transformer layers that should be offloaded to the GPU
# during inference, which accelerates the computation by leveraging the parallel processing capabilities of the GPU.
N_GPU_LAYERS = 100  # Llama-2-70B has 83 layers

# n_batch determines the number of prompt tokens processed in parallel during inference. Don't mess around with it unless you face errors.
N_BATCH = 512

# Threads refer to virtual slices of the workload your operating system maps to your CPU cores. The number of threads you configure
# affects how efficiently your CPU cores are utilized during inference. Recommended value: your number of physical cores. Useless for GPU-only inference.
# os.cpu_count() reports logical CPUs and returns None when the count cannot be determined, so fall back
# to 1 to keep this value (and the arithmetic below) usable.
# NOTE(review): this is the logical-core count, not the physical-core count the recommendation above asks for.
THREADS = os.cpu_count() or 1

# Threads_batch is the number of threads allocated for batch processing. Optimizing the number of threads is crucial for performance.
# The optimal thread count depends on your specific hardware configuration (number of CPU cores, hyperthreading, etc.).
# Recommended value: total number of CPU cores (physical + virtual). Useless for GPU-only inference.
# NOTE(review): the value is twice the logical-core count, i.e. double the recommendation above -- confirm this is intended.
THREADS_BATCH = THREADS * 2

# Temperature is a hyperparameter that regulates the randomness, or creativity, of the AI's responses.
# A higher temperature value typically makes the output more diverse and creative but might also
# increase its likelihood of straying from the context.

TEMPERATURE = 0.01      # do_sample=True to enable Sample Decoding - Top-K / Top-P sampling etc.

# Top-p, also known as nucleus sampling, controls the cumulative probability of the generated tokens.
# The model generates tokens until the cumulative probability exceeds the chosen threshold (p).
TOP_P = 0.9

# Top-k provides a controlled randomness by considering a fixed number of top probable tokens.
TOP_K = 50

# Instructor Model
# Add 2-5GB VRAM generally for embedding models.
EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl" # Uses 5 GB of VRAM (Great-ish accuracy)

# Loaders
# Maps a file extension (lower-case, including the leading dot) to the loader
# class used to read that kind of document.
# NOTE(review): ".doc" is mapped to the same loader as ".docx"; presumably that
# loader only understands the .docx format -- verify with a legacy .doc file.
DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".md": UnstructuredMarkdownLoader,
    ".py": TextLoader,
    ".pdf": PyPDFLoader,
    ".csv": CSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
    ".ppt": UnstructuredPowerPointLoader,
    ".pptx": UnstructuredPowerPointLoader
}

# Chroma settings: telemetry switched off, persistent (on-disk) storage switched on
CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,
    is_persistent=True,
)

In [None]:
# Settings

# Pick the device the models run on, in order of preference:
#   1. the current CUDA GPU, when CUDA is available;
#   2. the "mps" device (PyTorch's Metal Performance Shaders backend for Apple
#      GPUs), but only when PyTorch reports it as available;
#   3. the CPU, so the notebook still runs on machines without either accelerator.
# Note: the "mps" device name is unrelated to NVIDIA's Multi-Process Service.
if cuda.is_available():
    device_type = f'cuda:{cuda.current_device()}'
elif torch.backends.mps.is_available():
    device_type = 'mps'
else:
    device_type = 'cpu'

# Run-time switches; their values are echoed by the system-check cell
show_sources = True
use_history = True
save_qa = False

# Model
## (Full/Base MODELS)
# model_id is the "<organisation>/<name>" identifier; the local folder name is derived from it later
model_id = "meta-llama/Llama-2-7b-chat-hf"
model_basename = "Llama-2-7b-chat-hf"
model_type = "llama"
#model_id = "mistralai/Mistral-7B-Instruct-v0.1"
#model_basename = "Mistral-7B-Instruct-v0.1"
#model_type= "mistral"

In [None]:
# Utilitarian Functions

# Remove newline characters from strings, including strings nested in dicts and lists
def remove_newline_characters(data, replacement=''):
    """Return *data* with every newline character in its strings replaced.

    Strings are cleaned directly. Dicts and lists are rebuilt (the input is not
    mutated) with their values / items cleaned recursively; dict keys are left
    as they are. Any other type -- numbers, None, tuples, arbitrary objects
    such as document instances -- is returned unchanged.

    Args:
        data: A str, dict, list, or any other object.
        replacement: Text substituted for each newline. The default, an empty
            string, deletes the newline, which joins the last word of one line
            to the first word of the next; pass " " to keep them apart.

    Returns:
        The cleaned str / dict / list, or *data* itself for any other type.
    """
    if isinstance(data, str):
        return data.replace('\n', replacement)
    if isinstance(data, dict):
        return {key: remove_newline_characters(val, replacement) for key, val in data.items()}
    if isinstance(data, list):
        return [remove_newline_characters(item, replacement) for item in data]
    return data

# Empty the Chroma directory so old collections do not linger
def clear_directory(directory):
    """Delete everything inside *directory*, keeping the directory itself.

    Real sub-directories are removed recursively; regular files and symbolic
    links (including links that point at directories) are unlinked. A failure
    on one entry is printed and the remaining entries are still processed.

    Args:
        directory: Path of the folder to empty. It must exist.
    """
    for entry_name in os.listdir(directory):
        target = os.path.join(directory, entry_name)
        try:
            if os.path.isdir(target) and not os.path.islink(target):
                # a genuine folder: remove it together with its contents
                shutil.rmtree(target)
            elif os.path.isfile(target) or os.path.islink(target):
                # a plain file, or a link of any kind: remove just the entry
                os.unlink(target)
        except Exception as e:
            print(f'Failed to delete {target}. Reason: {e}')

# Write Resume Summaries to a file in append mode
def write_summary(question, result, summary_directory, summary_document):
    """Append *result* as one line to the summary file.

    The output folder is created when it does not exist yet, and the file is
    written as UTF-8 so that model output containing characters outside the
    platform's default code page (accents, bullets, dashes) cannot fail with
    a UnicodeEncodeError.

    Args:
        question: The question that produced the result. Accepted so callers
            can pass it along; it is not written to the file.
        result: The answer to store; converted with ``str()``.
        summary_directory: Folder that holds the summary file.
        summary_document: File name of the summary file.
    """
    # An empty directory means "current folder"; os.makedirs("") would raise.
    if summary_directory:
        os.makedirs(summary_directory, exist_ok=True)

    # Define the summary file path, normalised to forward slashes
    summary_file_path = os.path.join(summary_directory, summary_document).replace("\\", "/")

    # Open the file in append mode ('a') so earlier summaries are kept
    with open(summary_file_path, 'a', encoding='utf-8') as f:
        f.write(str(result) + '\n')

<div class="alert alert-block alert-danger">
<b>Enter the Resume File Name with Extension in the cell output below:</b> 
</div>

In [None]:
# Ask for the document to process

# Keep prompting until the user names a file that (a) has an extension,
# (b) has an extension with a loader registered in DOCUMENT_MAP, and
# (c) exists as a regular file inside SOURCE_DIRECTORY.
while True:
    # Ask user to input a file name; surrounding whitespace from copy/paste is dropped
    file_name = input("Please enter a file name: ").strip()
    entered_extension = os.path.splitext(file_name)[1]

    # Validate the file name
    if not entered_extension:
        print("Error: File name must have an extension.")
    elif entered_extension.lower() not in DOCUMENT_MAP:
        # Reject early: without a loader the document cannot be read later on
        print(f"Error: Unsupported file type {entered_extension}. "
              f"Supported extensions: {', '.join(sorted(DOCUMENT_MAP))}")
    elif not os.path.isfile(os.path.join(SOURCE_DIRECTORY, file_name)):
        # isfile (rather than exists) also rejects a folder whose name contains a dot
        print("Error: File does not exist in the source directory. Please try again.")
    else:
        print(f"File {file_name} is valid and exists in the source directory.")
        break  # exit the loop if the file is valid and exists
        
# file_name = "Duhamel, Jeff CV updated 21-Nov-2023 EN-GB.pdf"

In [None]:
# System Check

# Report the hardware / configuration this run will use
print(f"> CUDA Found: {cuda.is_available()}\n"
      f"> CUDA version: {torch.version.cuda}\n"
      f"> Chosen Model Type: {model_type}\n"
      f"> LLM will run on: {device_type}\n"
      f"> Display Source Documents set to: {show_sources}\n"
      f"> Use history set to: {use_history}")

# Create the persist directory on a first run; os.listdir would otherwise raise
# FileNotFoundError before anything could be cleared.
os.makedirs(PERSIST_DIRECTORY, exist_ok=True)

# check if the persist_directory is empty before calling the function to clear it
if not os.listdir(PERSIST_DIRECTORY):
    print(f"{PERSIST_DIRECTORY} is already empty")
else:
    clear_directory(PERSIST_DIRECTORY)

In [None]:
# Set up LLM paths for local inference

# Folder name of the model: the model id with "/" replaced by "_"
model_dir = model_id.replace("/","_")
# Full path of that folder under MODELS_DIRECTORY, normalised to forward slashes
model_path = os.path.join(MODELS_DIRECTORY, model_dir).replace("\\","/")
# Cache folder placed inside the model folder
cache_dir = f"{model_path}/cache"

print(f"> Local Model Directory: {model_dir}\n> Local Model Path: {model_path}\n> Local Cache Directory: {cache_dir}")

<div class="alert alert-block alert-danger">
Skip the cell below if LLM has already been loaded and ipykernel is hot
</div>

In [None]:
# Load a full Model (e.g., meta-llama/Llama-2-7b-chat-hf) 

# The bare string below is never assigned or used; it only records an example
# of the multi-turn [INST] ... [/INST] prompt layout.
"""
text = "<s>[INST] What is your favourite band? [/INST]"
"Well, I'm quite partial to Mazzy Star. There are very few voices in music that strike you like the gentleness of Hope Sandoval. As the frontwoman of the long-revered and much-talked-about Mazzy Star, Sandoval mixed her tender falsettos with David Roback's jangly guitars to deliver heartfelt tunes that range from intimate folk to experimental psychedelia.</s> "
"[INST] Tell me more about Mazzy Star and bands like Mazzy star in less than 100 words? [/INST]"
"""

# Read the model configuration from the local model folder only
# (local_files_only=True, matching the "Local only" purpose of the notebook).
model_config = transformers.AutoConfig.from_pretrained(
    model_path,
    local_files_only=True,
    trust_remote_code=True
)

# Load the causal-LM weights from the same folder in bfloat16 and let
# device_map="auto" decide where the layers are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    use_flash_attention_2=False,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    config=model_config,
    low_cpu_mem_usage=True,
    cache_dir=cache_dir,
    trust_remote_code=True,
    # The three commented-out options below are the ones to set if you are
    # using an NVIDIA GPU (presumably 4-bit quantised loading -- confirm).
    #load_in_4bit=True,
    #bnb_4bit_quant_type="nf4",
    #bnb_4bit_compute_dtype=torch.float16,
    max_memory={0: "16GB"},  # for multi-GPU system {0: "16GB", 1: "6GB"} etc.  
    local_files_only=True
    )

# Tokenizer that belongs to the model, read from the same local folder
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, local_files_only=True)

In [None]:
# Test out the LLM inference

# Three-turn chat: user, assistant, user. The first and last user turns carry
# the same text.
messages = [
    {"role": "user", "content": "Tell me more about Mazzy Star and bands like Mazzy star in less than 100 words?"},
    {"role": "assistant", "content": "Well, I'm quite partial to Mazzy Star. There are very few voices in music that strike you like the gentleness of Hope Sandoval. As the frontwoman of the long-revered and much-talked-about Mazzy Star, Sandoval mixed her tender falsettos with David Roback's jangly guitars to deliver heartfelt tunes that range from intimate folk to experimental psychedelia."},
    {"role": "user", "content": "Tell me more about Mazzy Star and bands like Mazzy star in less than 100 words?"}
]

# Render the chat with the tokenizer's own chat template and return it as a
# PyTorch tensor
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

# Move the prompt tensor to the device chosen in the Settings cell
model_inputs = encodeds.to(device_type)

# Switch the model to evaluation mode before generating
model.eval()
print(f"\n> Loaded Local full model: {model}, tokenizer: {tokenizer}, on: {device_type}")

# Sample up to 1024 new tokens, then decode the whole sequence (prompt plus
# completion) back to text; `decoded` is a list with one string per sequence.
generated_ids = model.generate(model_inputs, max_new_tokens=1024, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)

In [None]:
# Print the question and answer from Llama 2 LLM inference above
# ** may not work with other models that expect different format of prompts

# Each decoded string is expected to look like
#   ... [INST] question [/INST] answer ... [INST] question [/INST] completion</s>
# The prompt shown is the text of the FIRST [INST] block; the completion is
# everything after the LAST [/INST] tag. Searching for the tags themselves
# (instead of fixed character offsets and the last "]" anywhere) keeps the
# first character of each part and survives "]" characters in the completion.
for item in decoded:
    first_open = item.find("[INST]")
    first_close = item.find("[/INST]")
    last_close = item.rfind("[/INST]")

    if first_open == -1 or last_close == -1:
        raise ValueError("No [INST] ... [/INST] block found in the decoded text.")

    # Extract instruction
    instruction = item[first_open + len("[INST]"):first_close].strip()

    # Extract completion; drop the end-of-sequence marker before trimming so no
    # whitespace is left dangling where the marker used to be
    completion = item[last_close + len("[/INST]"):].replace("</s>", "").strip()

    # Print results
    print(f"\n>Prompt:\n{instruction}\n\n>Completion:\n{completion}")

#### Load the Resume to Process

In [None]:
# Load the resume to process and split it in chunks

# Built outside the try block so the error message below can always refer to it.
source_file_path = os.path.join(SOURCE_DIRECTORY, file_name).replace("\\","/")

try:
    # Lower-case the extension so "CV.PDF" selects the same loader as "CV.pdf".
    file_extension = os.path.splitext(source_file_path)[1].lower()
    loader_class = DOCUMENT_MAP.get(file_extension)

    if loader_class:
        loader = loader_class(source_file_path)
        # Overlapping chunks of at most 1000 characters
        doc_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        documents = loader.load()
        docs = doc_splitter.split_documents(documents)

        # The chunks are document objects rather than str/dict/list, so passing
        # them to remove_newline_characters returned them unchanged. Clean the
        # text of each chunk directly instead; a space (not an empty string) is
        # used so the words on either side of a line break stay separate.
        for doc in docs:
            doc.page_content = doc.page_content.replace("\n", " ")

        # A file without extractable text yields no chunks at all
        first_chunk = docs[0] if docs else "<no text could be extracted>"

        print(f"\n> Loader selected: {loader_class}"
              f"\n> Loading {os.path.basename(source_file_path)} from {source_file_path}"
              f"\n> Pages parsed: {len(documents)}"
              f"\n> Split into {len(docs)} chunks of text"
              f"\n> First chunk: {first_chunk}")
    else:
        print(f"\n> {source_file_path} document type is undefined.")
except Exception as e:
    # Best effort: report the problem and leave the notebook running
    print(f"\n> Loading error: {str(e)} -> {source_file_path}")


In [None]:
# Create embeddings

# Instructor models prepend an instruction to every text they embed.
# `embed_instruction` is applied to the documents being indexed and
# `query_instruction` to the questions asked at retrieval time, so the document
# side gets a document instruction and the query side a question instruction
# (these are the library's own defaults, spelled out for clarity).
embeddings = HuggingFaceInstructEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    embed_instruction="Represent the document for retrieval: ",
    query_instruction="Represent the question for retrieving supporting documents: ",
    model_kwargs={"device": device_type},
    encode_kwargs={'normalize_embeddings': True}
    )

# Change the embedding type if you are running into issues.
# These are much smaller embeddings and will work for most applications.
# If you use HuggingFaceEmbeddings, make sure to also use the same during retrieval
# (and import HuggingFaceEmbeddings first; it is not among the imports above).

# embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


In [None]:
# Add the documents to the database

# One id per chunk: "1", "2", ... in chunk order
ids = [str(i) for i in range(1, len(docs) + 1)]

# Embed every chunk and store it in a Chroma collection persisted under PERSIST_DIRECTORY
db = Chroma.from_documents(docs, embeddings, ids=ids, client_settings=CHROMA_SETTINGS, persist_directory=PERSIST_DIRECTORY)

# NOTE(review): `_collection` is a private attribute of the vector store; it is
# only read here to report how many chunks were stored.
print(f"\n> Collection Count: {db._collection.count()}\n> {db.embeddings}")


In [None]:
# Set up a text-gen Pipeline using transformers
#
# NOTE(review): `temperature` is passed while `do_sample=True` is commented out;
# the note next to TEMPERATURE says sampling has to be enabled for it to apply,
# so the value presumably has no effect here -- confirm.
# NOTE(review): `max_new_tokens` is hard-coded to 1024; the MAX_NEW_TOKENS
# constant is not used by this call.

textgen_pipeline = transformers.pipeline(
    model=model, 
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # model parameters
    temperature=TEMPERATURE, #do_sample=True, 
    max_new_tokens=1024,  
    repetition_penalty=1.15  # prevent output repetitions
)

#### Optional Step for fun and some (in)sanity check

In [None]:
# test the pipeline for text generation using LLM
#
# Sends one hand-written prompt in the [INST] ... [/INST] layout straight to
# the pipeline (no retrieval involved) and prints the generated text.

res = textgen_pipeline(
"""
<s>[INST] Simulate three brilliant, logical experts collaboratively answering a question. 
Each one carefully thinks and verbosely explains their thought process in real-time, considering the 
prior explanations of others and openly acknowledging mistakes. At each step, whenever possible, 
each expert refines and builds upon the thoughts of others, acknowledging their contributions. 
They continue until there is a definitive answer to the question. After all experts have provided 
their analysis, you then analyze all 3 analyses and provide either the consensus solution or 
your best guess solution. Keep your response in less than 400 words.[/INST] The question is...

Bob is in the living room.
He walks to the kitchen, carrying a cup.
He puts a ball in the cup and carries the cup to the bedroom.
He turns the cup upside down, then walks to the garden.
He puts the cup down in the garden, then walks to the garage.</s>
[INST]Where is the ball?[/INST]
"""
)

# The pipeline returns a list of results; print the text of the first one
print(res[0]["generated_text"])


In [None]:
# Please the Langchain Gods and wrap textgen pipeline with the LangChain HuggingFace wrapper. 
# The wrapped object is what the RetrievalQA chain receives as its `llm`.

llm = HuggingFacePipeline(pipeline=textgen_pipeline)

In [None]:
# RAG prompt
# NOTE(review): presumably this fetches the prompt from the LangChain Hub over
# the network, which would be the one step of this "local only" notebook that
# needs connectivity -- confirm.

prompt = hub.pull("rlm/rag-prompt-llama")

#### Llama Prompt for reference 

```python
[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> 
Question: {question} 
Context: {context} 
Answer: [/INST]
    
ChatPromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, template="[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> \nQuestion: {question} \nContext: {context} \nAnswer: [/INST]", template_format='f-string', validate_template=True), additional_kwargs={})])
```


In [None]:
# Set Up RetrievalQA Chain with Maximal Marginal Relevance (MMR) for text summarization
# and for reducing redundancy and increasing diversity in the search results.
#
# MMR first fetches a pool of `fetch_k` candidate chunks by similarity and then
# keeps the `k` chunks that are relevant yet different from one another. The
# pool therefore has to be larger than `k`; with a pool of only 3 every fetched
# chunk was returned and nothing was diversified. `k` is set to 3 so the chain
# still receives at most three chunks, now chosen from a pool of up to 12.

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # try other chain types as well. refine, map_reduce, map_rerank
    retriever=db.as_retriever(search_type="mmr", search_kwargs={'k': 3, 'fetch_k': 12}),
    return_source_documents=True,  # verbose=True,
    chain_type_kwargs={"prompt": prompt}
)

#### Cells below are for Q&A.
Just update the question, or create a new cell below and copy-paste the cell content into it, then run it.

In [None]:
# Sanity question: the answer is already stated in the question itself.
# The chain returns a dict; its "result" entry holds the answer text.
question = "The document is the resume of a Cloud FullStack developer named Jegan S. What is the name of the person in the document?" 
output = qa_chain({"query": question})
output["result"]

In [None]:
# Ask for the candidate's strongest skills.
# Note: unlike the neighbouring cells, this one binds the full chain response
# to `result` rather than `output`.
question = "What are Jegan's strongest skills?"
result = qa_chain({"query": question})
result["result"]

In [None]:
# Ask for a short free-form summary; the "result" entry of the response holds the answer text.
question = "Summarize Jegan's skills and experience in less than 200 words."
output = qa_chain({"query": question})
output["result"]

In [None]:
### Enter the question -- Intelligent Summarization -- worked

# Structured-extraction prompt: lists the fields to pull from the resume and
# the exact output layout expected from the model.
question = """
Extract the following details about Jegan Sakthivel from the document:

Name,
Email,
Phone Number,
Key Skills,
Certifications or Badges,
Summary of Work Experience

and provide these details in the following format in no more than 250 words

Name:
Email:
Phone Number:
Key Skills:
Certifications or Badges:
Summary of Work Experience:
"""

# Get the completion. Confirm it's not a hallucination by cross-checking with the resume.
output = qa_chain({"query": question})
print("\n> ", output["result"])


##### DON'T CHANGE or DELETE the function call below #####

# Use the write_summary function to write resume summaries every time a question is asked
# (the answer text is appended to SUMMARY_DOCUMENT inside SUMMARY_DIRECTORY)
write_summary(question, output["result"], SUMMARY_DIRECTORY, SUMMARY_DOCUMENT)


__Optional Step__

In [None]:
# Convert the latest answer to Markdown and display it.
# `output` holds the response of the most recent question and its "result"
# entry is the answer text. The name `result` is only bound by one of the
# earlier Q&A cells and refers to that cell's whole response dict, so rendering
# it would show a stale dict dump instead of the current answer.
display(Markdown(f"**Result:**\n\n{output['result']}"))

#### Extras below not needed right now

In [None]:
# mistral model for 128K context -- overkill for the job. not using
#
# NOTE(review): running this cell rebinds `model`, `tokenizer`, `prompt` and
# `output`, names the RAG cells above also use; re-run those cells afterwards
# before asking further questions.
# NOTE(review): no local_files_only flag is passed here, so presumably the
# model is fetched by name rather than read from the local models folder -- confirm.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Yarn-Mistral-7B-128k-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Plain prompt followed by a newline; no instruction tags are added
prompt = "Tell me about AI"
prompt_template=f'''{prompt}
'''

print("\n\n*** Generate:")

# Tokenise the prompt and move the ids to the GPU (`.cuda()` is called
# unconditionally, so this line needs a CUDA device)
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])