In [11]:
import joblib
import json
from typing import List
import sqlite3
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
import ollama
import unicodedata
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

In [12]:
class DataBaseLoader():
    """Read job postings from a SQLite table and wrap each row in a ``Document``.

    The connection is opened in ``__init__`` and stays open until ``close()``
    is called (or the instance is used as a context manager), so ``load()``
    can be called more than once.

    Args:
        file_path: Path handed to ``sqlite3.connect``.
        table_name: Table to read. It is interpolated directly into the SQL
            text (identifiers cannot be bound as parameters), so only pass
            trusted, hard-coded names.
    """

    def __init__(self,file_path,table_name):
        self.file_path = file_path
        self.table_name = table_name

        self.conn = sqlite3.connect(self.file_path)
        # Kept for backward compatibility; load() itself only needs self.conn.
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the with-block.
        return False

    def load(self) -> "List[Document]":
        """Return one ``Document`` per row that has a non-null ``jobDetails``.

        ``page_content`` is the row's ``jobDetails``; ``metadata`` carries the
        row's ``jobLink`` under ``"url"`` and ``jobTitle`` under ``"title"``.
        Rows whose ``jobDetails`` is NULL are skipped because they have no
        text to use as page content.

        Raises:
            KeyError: if the table lacks one of the three expected columns.
        """
        query = f"SELECT * FROM {self.table_name}"
        df = pd.read_sql_query(query, self.conn)

        rows = df.dropna(subset=['jobDetails'])
        # Column-wise zip avoids building a Series per row as iterrows() does.
        return [
            Document(page_content=details, metadata={"url": link, "title": title})
            for details, link, title in zip(
                rows['jobDetails'], rows['jobLink'], rows['jobTitle']
            )
        ]

    

# Load every job posting once, then release the SQLite handle so the database
# file is not held open for the rest of the kernel session.
dbLoader = DataBaseLoader('../data/db.db','job_data')
try:
    documents = dbLoader.load()
finally:
    dbLoader.conn.close()

In [17]:
class OllamaEmbeddings(Embeddings):
    """Embeddings adapter that forwards each text to ``ollama.embeddings``.

    Args:
        model: Name of the Ollama model passed with every request.
    """

    def __init__(self,model="mxbai-embed-large"):
        self.model=model

    def embed_documents(self,texts):
        """Embed each entry of ``texts`` in order and return the list of results."""
        vectors = []
        for chunk in texts:
            vectors.append(self.embed_query(chunk))
        return vectors

    def embed_query(self,text):
        """Embed one string and return the ``"embedding"`` field of the response."""
        # Non-breaking spaces are swapped for plain spaces before the request.
        prompt = text.replace(u'\u00a0',u' ')
        result = ollama.embeddings(model=self.model,prompt=prompt)
        return result["embedding"]

In [14]:
# Baseline split with the plain character splitter, kept under its own names.
# The recursive-splitter cell that follows binds `text_splitter` and `texts`,
# so reusing those names here would have this result silently overwritten.
char_splitter = CharacterTextSplitter(chunk_size=200,chunk_overlap=10)
char_texts = char_splitter.split_documents(documents)

In [15]:
# Chunking parameters for the recursive splitter; lengths are measured with len().
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20
# Separator candidates handed to the splitter: paragraph break, line break,
# single space, empty string.
SPLIT_SEPARATORS = ["\n\n","\n"," ",""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SPLIT_SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
# `texts` is the chunk list that the vector store is built from.
texts = text_splitter.split_documents(documents)

#### To use Ollama embeddings
- Install Ollama locally: https://github.com/ollama/ollama?tab=readme-ov-file
- Pull the embedding model locally: `ollama pull mxbai-embed-large`

In [19]:
# Embed every chunk through the Ollama adapter and index the vectors in FAISS.
embeddings = OllamaEmbeddings()

vector_store = FAISS.from_documents(documents=texts, embedding=embeddings)

In [20]:
# MMR retriever over the FAISS index, returning 10 chunks per query.
# (The earlier default `as_retriever()` assignment was overwritten immediately
# and never used, so it has been dropped.)
retriever = vector_store.as_retriever(search_type='mmr',search_kwargs={'k':10})

In [25]:
import PyPDF2
import re

In [30]:
def read_resume(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        The per-page texts joined with a newline, so the last word of one page
        and the first word of the next do not fuse. A page that yields no text
        (``None`` or an empty string) contributes an empty segment.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open.
        # `or ''` guards against a page for which extract_text() gives None.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return '\n'.join(page_texts)

def clean_text(text):
    """Strip everything except ASCII letters, digits and whitespace.

    Non-breaking spaces are converted to ordinary spaces first. ``\\s`` in the
    pattern matches U+00A0, so it survives the character filter; deleting it
    afterwards (as was done before) glued the neighbouring words together.

    Note: punctuation is removed without a replacement, so ``"node.js"``
    becomes ``"nodejs"``.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned string.
    """
    text = text.replace('\xa0', ' ')
    return re.sub(r'[^a-zA-Z0-9\s]','',text)

# Extract the resume PDF's text, then run it through clean_text; the cleaned
# string is what gets used as the retrieval query.
RESUME_PATH = '../data/MasonLee_Resume.pdf'
resume_raw = read_resume(RESUME_PATH)
resume_text = clean_text(resume_raw)

In [32]:
# Query the retriever with the cleaned resume text; the bare name on the last
# line makes the notebook display the matched chunks.
resume_matches = retriever.invoke(resume_text)
resume_matches

[Document(metadata={'url': 'https://www.indeed.com/rc/clk?jk=25fc39674dc43d5e&bb=DGUfQVEJ0zRJcR3fZxzGnlHCVniLNORnl3NL-XTA4-i-jGsfT4xyvTsupoBS6hjUvqnXfYCDkzgAq2L78uwsdN4quFrWnXx-zPq9R2H4odjrRWSW3-kPGe1FMlmq7PGH&xkcb=SoDA67M36bD1-7wv6Z0FbzkdCdPP&fccid=4f46a80a0b4401fe&vjs=3', 'title': 'Staff Data Engineer'}, page_content='and determine the desired insights to extract. The ideal candidate will excel in translating business requirements into a technical roadmap and developing remarkable solutions to satisfy those needs.    Educational BackgroundB.S. degree in Computer Science, Software Engineering, Electrical Engineering, Bioengineering, or related technical fields involving algorithms or coding (e.g., Physics or Mathematics). Professional Experience10+ years of data engineering / software development experience'),
 Document(metadata={'url': 'https://www.indeed.com/rc/clk?jk=21f38ec02c730eb6&bb=eMSLhjA9uv7WNxvHP6iyNhX9k1k5K5nU2ObTUPP9GTlopBrdguSKxTKxbehRmKYRCP03xIjftNxlAhVwSTda5JvvdDa8H1c7