In [None]:
# !pip install -qq boto3
# !pip install -qq openai
# !pip install -qq langchain
# !pip install -qq python-dotenv
# !pip install -qq -U langchain-openai
# !pip install -qq chromadb
# !pip install -qU langchain-text-splitters
# !pip install -qq lark

In [1]:
import os
import openai
import boto3
import logging

from dotenv import load_dotenv, find_dotenv

from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers.self_query.chroma import ChromaTranslator
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.schema.runnable import RunnableMap
from langchain.schema.output_parser import StrOutputParser

In [2]:
# Install logging's default handler on the root logger (basicConfig does nothing
# if the root logger already has handlers).
logging.basicConfig()
# getLogger() with no name returns the root logger. Setting it to INFO is why
# INFO records from libraries (e.g. chromadb, httpx) show up in later cell output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [3]:
# Locate a .env file with find_dotenv() and load it; the result is assigned to
# `_` so the cell does not display it.
_ = load_dotenv(find_dotenv()) 

# os.environ[...] raises KeyError here if OPENAI_API_KEY is not set.
# NOTE(review): presumably the langchain_openai classes used later also read this
# variable from the environment themselves -- confirm whether this assignment is still needed.
openai.api_key  = os.environ['OPENAI_API_KEY']

# Extract data

In [10]:
# configs
# S3 bucket the resume is downloaded from.
bucket_name = "plarosa-portfolio-bot"
# Key prefix inside the bucket; also reused as the relative local path under the repo directory.
s3_folder_resume = "raw/resume/"
# Directory name under root_dir that holds the downloaded data and the vector store.
repo_name = "curriebot"
# Sub-directory (under root_dir/repo_name) used as Chroma's persist_directory.
chroma_directory = "vector_db"
# Current user's home directory; all local paths in this notebook are built from it.
root_dir = os.path.expanduser("~")


In [11]:
def download_s3_folder(bucket_name, s3_folder, local_dir):
    """Download every object under an S3 key prefix into a local directory.

    The part of each object key that follows ``s3_folder`` is reproduced
    beneath ``local_dir``, creating intermediate directories as needed.
    Keys ending in ``/`` (folder placeholders) are skipped.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        s3_folder: Key prefix to list; passed to S3 as ``Prefix``.
        local_dir: Destination directory; created if it does not exist.

    Returns:
        None. Files are written to disk and one line is printed per download.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(local_dir, exist_ok=True)

    # Initialize the S3 client
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page instead of reading only the first response.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder):
        # A page for an empty listing has no 'Contents' entry.
        for obj in page.get('Contents', []):
            file_key = obj['Key']
            # Skip directories
            if file_key.endswith('/'):
                continue
            # Strip only the leading prefix: str.replace would also delete any
            # later occurrence of the prefix inside the key and corrupt the path.
            if file_key.startswith(s3_folder):
                relative_key = file_key[len(s3_folder):]
            else:
                relative_key = file_key
            local_file_path = os.path.join(local_dir, relative_key.lstrip('/'))
            # Create any necessary subdirectories
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
            # Download the file
            print(f'Downloading {file_key} to {local_file_path}')
            s3.download_file(bucket_name, file_key, local_file_path)


In [12]:

# Mirror the S3 prefix beneath <home>/<repo_name> so local paths line up with the object keys.
local_dir_resume = os.path.join(root_dir, repo_name, s3_folder_resume)

# Make sure the destination exists, then pull everything under the resume prefix into it.
os.makedirs(local_dir_resume, exist_ok=True)
download_s3_folder(bucket_name, s3_folder_resume, local_dir_resume)

Downloading raw/resume/La Rosa, Patrick - CV.md to /home/ubuntu/curriebot/raw/resume/La Rosa, Patrick - CV.md


# Transform data

In [13]:
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. .ipynb_checkpoints), so keep only visible regular
# files and sort them to make the choice of resume deterministic.
resume_fname = sorted(
    entry
    for entry in os.listdir(local_dir_resume)
    if not entry.startswith(".")
    and os.path.isfile(os.path.join(local_dir_resume, entry))
)
# Fail with a descriptive error instead of a bare IndexError when nothing was downloaded.
if not resume_fname:
    raise FileNotFoundError(f"No resume file found in {local_dir_resume}")
resume_path = os.path.join(local_dir_resume, resume_fname[0])

In [14]:
# Read the whole resume into memory as a single UTF-8 decoded string; the
# `with` block closes the file handle once the read completes.
with open(resume_path, "r", encoding="utf-8") as file:
    resume_md = file.read()

In [15]:

# Stage 1 -- header-level split.
# Each tuple pairs a markdown header marker with the label handed to the splitter
# (presumably used as the metadata key on the resulting chunks -- confirm against
# the langchain_text_splitters docs).
headers_to_split_on = [("#", "Category"), ("##", "Company Name")]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)
md_header_splits = markdown_splitter.split_text(resume_md)

# Stage 2 -- character-level split of the header sections, using the size and
# overlap configured here.
chunk_size = 800
chunk_overlap = 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Final chunks that get embedded and stored further down.
splits = text_splitter.split_documents(md_header_splits)

# Load data

In [16]:
# Embedding client built with the library's default arguments; it is passed to
# Chroma.from_documents below to vectorize the chunks.
embedding = OpenAIEmbeddings()

In [17]:
# On-disk location of the Chroma store: <home>/<repo_name>/<chroma_directory>.
vector_db_path = os.path.join(root_dir, repo_name, chroma_directory)

In [18]:
# Give every chunk a stable, position-based id. Without explicit ids each run
# stores the chunks under freshly generated ids, so re-running this cell
# against the same persist_directory piles duplicate copies of the resume into
# the collection. With fixed ids a re-run overwrites the existing entries.
# NOTE(review): if a later resume yields fewer chunks, higher-numbered entries
# from an earlier run remain in the store -- delete vector_db_path to rebuild cleanly.
split_ids = [f"resume-chunk-{idx}" for idx in range(len(splits))]

vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    ids=split_ids,
    persist_directory=vector_db_path
)
# NOTE(review): the captured output of this cell shows a deprecation warning;
# presumably persist() is redundant on Chroma >= 0.4, which writes to
# persist_directory automatically -- confirm the installed version before removing.
vectordb.persist()

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
  warn_deprecated(
