In [11]:
import joblib
import json
from typing import List
import sqlite3
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
import ollama
import unicodedata
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

In [33]:
class DataBaseLoader():
    """Load rows of a SQLite table as LangChain ``Document`` objects.

    The connection is opened in ``__init__`` and stays open until ``close()``
    is called, so ``load()`` can be called more than once. The instance can
    also be used as a context manager, which closes the connection on exit.
    """

    def __init__(self,file_path,table_name):
        """Open the database.

        Args:
            file_path: Path of the SQLite database file.
            table_name: Table to read. It is interpolated verbatim into the
                SQL text in ``load()``, so it must come from trusted code.
        """
        self.file_path = file_path
        self.table_name = table_name

        self.conn = sqlite3.connect(self.file_path)
        # load() only needs self.conn; the cursor attribute is kept so that
        # existing code reading ``loader.cursor`` keeps working.
        self.cursor = self.conn.cursor()

    def __enter__(self):
        """Return the loader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection; exceptions from the block are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def load(self) -> "List[Document]":
        """Read the whole table and build one ``Document`` per row.

        Each document takes its ``page_content`` from the ``jobDetails``
        column and its metadata (``url``, ``title``) from the ``jobLink`` and
        ``jobTitle`` columns.

        Returns:
            A list of ``Document`` objects, in the row order returned by the
            query (empty when the table has no rows).
        """
        query = f"SELECT * FROM {self.table_name}"
        df = pd.read_sql_query(query, self.conn)

        # Plain dict records avoid building a Series per row as iterrows does.
        return [
            Document(
                page_content=record['jobDetails'],
                metadata={
                    "url":record['jobLink'],
                    "title":record['jobTitle']
                },
            )
            for record in df.to_dict(orient="records")
        ]

    

# Read every row of the job_data table into Document objects.
dbLoader = DataBaseLoader(
    file_path='../data/db.db',
    table_name='job_data',
)
documents = dbLoader.load()

In [34]:
class OllamaEmbeddings(Embeddings):
    """Embeddings implementation that delegates to ``ollama.embeddings``."""

    def __init__(self,model="mxbai-embed-large"):
        """Remember which Ollama model name to pass on every request."""
        self.model=model

    def embed_documents(self,texts):
        """Embed each text in turn and return the vectors in input order."""
        vectors = []
        for item in texts:
            vectors.append(self.embed_query(item))
        return vectors

    def embed_query(self,text):
        """Embed one text and return the ``"embedding"`` entry of the response."""
        # Non-breaking spaces are swapped for ordinary spaces before the call.
        prompt = text.replace(u'\u00a0',u' ')
        result = ollama.embeddings(model=self.model,prompt=prompt)
        return result["embedding"]

In [35]:
# Split the documents with CharacterTextSplitter (200-character chunks, 10 overlap).
# Note: `text_splitter` and `texts` are both reassigned by the
# RecursiveCharacterTextSplitter cell that follows.
text_splitter = CharacterTextSplitter(
    chunk_overlap=10,
    chunk_size=200,
)
texts = text_splitter.split_documents(documents=documents)

In [36]:
# Recursive splitter: tries the separators in order (paragraph break, line
# break, space, then any character) with chunk_size=500 and chunk_overlap=20,
# measuring length with len.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n"," ",""],
    length_function=len,
    chunk_overlap=20,
    chunk_size=500,
)
texts = text_splitter.split_documents(documents=documents)

#### To use Ollama embeddings
- Download and install Ollama locally: https://github.com/ollama/ollama?tab=readme-ov-file
- Pull the embedding model locally: `ollama pull mxbai-embed-large`

In [37]:
# Embed every chunk with the local Ollama model and index the vectors in FAISS.
embeddings = OllamaEmbeddings()
vector_store = FAISS.from_documents(
    documents=texts,
    embedding=embeddings,
)

In [38]:
# Retriever over the FAISS index using search_type='mmr' with k=10.
# (The earlier default `as_retriever()` assignment was overwritten
# immediately, so only this one is kept.)
retriever = vector_store.as_retriever(search_type='mmr',search_kwargs={'k':10})

In [25]:
import PyPDF2
import re

In [39]:
def read_resume(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The page texts joined with a newline, so that the last word of one
        page does not fuse with the first word of the next.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open; a page that yields no text
        # (None or empty) contributes an empty string.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return '\n'.join(page_texts)

def clean_text(text):
    """Normalise text to ASCII letters, digits and whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every non-breaking space turned into an ordinary space
        and every character other than ``a-z``, ``A-Z``, ``0-9`` and
        whitespace removed.
    """
    # `\s` matches U+00A0, so the character filter alone would keep it; turn it
    # into a plain space (rather than deleting it) so neighbouring words stay
    # separated.
    text = text.replace('\xa0', ' ')
    return re.sub(r'[^a-zA-Z0-9\s]','',text)

# Read the resume PDF and reduce it to plain alphanumeric text.
resume_text = clean_text(
    text=read_resume(file_path='../data/MasonLee_Resume.pdf')
)

In [40]:
# Query the retriever with the cleaned resume text; being the last expression
# of the cell, the returned list of Documents is shown as the cell output.
retriever.invoke(resume_text)

[Document(metadata={'url': 'https://www.indeed.com/rc/clk?jk=12cb745cf83b9368&bb=ya6PoQ6fnjkZ3am6mlWSEpBod3lA1vGvNqCGgtBUPojdygrrsSuACEpMz169Y-xLACSylxal7sLSIPjO_IE_DjRz-P56-TezCSvosIRPPJvxRtrY5l2XEQ%3D%3D&xkcb=SoDr67M36galKCxwTR0JbzkdCdPP&fccid=c1099851e9794854&vjs=3', 'title': 'Senior Data Scientist, Input Experience Analytics'}, page_content='of algorithms like Inline Prediction and Proofread Understanding feature usage, such as engagement with Image Playground Mining user feedback data to identify model soft spots  Master’s degree or PhD in Data Science, Statistics, Engineering or other technical field, plus 7+ years of proven experience building data-driven solutions to solve business problems, especially around launching of new products and servicesSolid experience with computer vision machine learningA deep passion for'),
 Document(metadata={'url': 'https://www.indeed.com/rc/clk?jk=da9280a719bdfee6&bb=Epx1km5SYOtU0_EeDU_HPe6LN3WNxWQ61htrtxdNEBNZWN31plc3I3wB4tAlnSOboHHMSbV37yitnF