# Load HR policy documents, split them, create embeddings and store them in a database

## Loading documents

In [None]:
# Show the working directory: the relative paths used in this notebook
# ('documents', 'chromadb/') are resolved against it.
!pwd

In [None]:
import os
import re
from langchain_community.document_loaders import (UnstructuredWordDocumentLoader, UnstructuredPDFLoader)
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Chunking parameters handed to the splitter: maximum chunk size and the amount
# of text shared between consecutive chunks.
# NOTE(review): units are whatever the splitter's length function measures
# (characters by default, per the LangChain docs) — confirm.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 400

r_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Folder holding the source documents, relative to the working directory.
folder_path = 'documents'

# os.listdir returns names in arbitrary order; sorting keeps the chunk order
# (and therefore indices into `splits`) stable from one run to the next.
all_files = sorted(os.listdir(folder_path))

# Keep only names that end in a literal ".pdf" or ".docx" (case-insensitive).
# The dot must be escaped: an unescaped "." matches any character, so a name
# such as "notes_pdf" would be selected even though the loader cannot handle it.
filtered_files = [
    os.path.join(folder_path, f)
    for f in all_files
    if re.search(r"\.(pdf|docx)$", f, re.IGNORECASE)
]

def load_elements_from_file(file_path):
    """Load a single document and return it as a list of split chunks.

    The extension is compared case-insensitively. ``.pdf`` files are read with
    ``UnstructuredPDFLoader`` and ``.docx`` files with
    ``UnstructuredWordDocumentLoader``, both created with ``strategy="hi_res"``.
    The content is split with the module-level ``r_splitter``.

    Returns ``None`` (not an empty list) for any other extension, so callers
    must check the result before iterating over it.
    """
    lowered = file_path.lower()
    is_pdf = lowered.endswith('.pdf')
    if not (is_pdf or lowered.endswith('.docx')):
        return None  # neither PDF nor DOCX: nothing to load

    # Only the loader class that is actually needed is looked up.
    loader_cls = UnstructuredPDFLoader if is_pdf else UnstructuredWordDocumentLoader
    loader = loader_cls(file_path, strategy="hi_res")
    return loader.load_and_split(text_splitter=r_splitter)


In [None]:
# Gather the chunks of every selected document into one flat list.
splits = []
for f in filtered_files:
    elements = load_elements_from_file(f)
    # The loader returns None for files it does not support; skip those (and
    # empty results) instead of letting splits.extend(None) raise TypeError.
    if elements:
        splits.extend(elements)

In [None]:
# Total number of chunks collected from all loaded documents.
len(splits)

In [None]:
# Inspect one sample chunk; index 2 assumes at least three chunks were produced.
print(splits[2])

## Read OPENAI_API_KEY

In [None]:
import os
import openai
import langchain_openai
from dotenv import load_dotenv, find_dotenv
# Read the local .env file so its variables become part of the environment.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Indexing os.environ raises KeyError when OPENAI_API_KEY is not defined.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from langchain_openai import OpenAIEmbeddings
# Embedding client created with the library defaults (no model or key is passed here).
embedding = OpenAIEmbeddings()

In [None]:
from langchain_chroma import Chroma

In [None]:
# Directory passed to Chroma as persist_directory; relative to the working directory.
persist_directory = 'chromadb/'

In [None]:
# Remove old database files, if any, so the store is rebuilt from scratch.
# A bare name after `!` is not expanded by IPython, so `!rm -rf persist_directory`
# targets a path literally called "persist_directory" and leaves the real
# directory in place. shutil.rmtree uses the variable's value and also works on
# Windows; ignore_errors=True mirrors `rm -f` when the directory does not exist.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
# Build the Chroma vector store from the chunks in `splits`, using `embedding`
# to vectorise them and `persist_directory` as the storage location.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [None]:
# Alternative to rebuilding: open the store already saved in persist_directory.
# from langchain_chroma import Chroma
# vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [None]:
# Sanity check: print the number of entries in the underlying collection.
# NOTE(review): `_collection` is a private attribute of the Chroma wrapper and
# may change between library versions.
print(vectordb._collection.count())

In [None]:
# Test question, in French: "Are there set hours for using the fitness room?"
question = "Est-ce qu'il y a des heures pour utiliser la salle de fitness?"

In [None]:
# Similarity search for the test question, limited to 3 results (k=3).
docs = vectordb.similarity_search(question,k=3)

In [None]:
# Alternative retrieval: max-marginal-relevance search (k=2 results out of fetch_k=3 candidates).
# docs = vectordb.max_marginal_relevance_search(question,k=2, fetch_k=3)

In [None]:
# Display the documents returned by the search.
docs

In [None]:
from langchain.chains import RetrievalQA

In [None]:
from langchain_openai import ChatOpenAI
# Chat model used by the question-answering chain; temperature=0 asks for the
# least random sampling.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [None]:
from langchain.globals import set_debug

# Enable LangChain's global debug mode for the cells that follow.
set_debug(True)

In [None]:
# Question-answering chain built from the chat model and a retriever over vectordb.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [None]:
# Run the chain on the test question; the input is passed under the "query" key.
result = qa_chain.invoke({"query": question})

In [None]:
# Print the value stored under the "result" key of the chain output.
print(result["result"])

In [None]:
# A single prompt in "System: ... Human: ..." form, apparently captured from debug output.
# NOTE(review): its content is about Kubernetes sealed-secrets, not the HR
# documents loaded in this notebook — presumably pasted from another run;
# confirm it is still needed.
mydict = {
  "prompts": [
    "System: Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\nThe file mysealedsecret.[json|yaml] is a commitable file.\n\nIf you would rather not need access to the cluster to generate the sealed secret you can run:\n\n    kubeseal \\\n\n      --controller-name=sealed-secrets-controller \\\n\n      --controller-namespace=kube-system \\\n\n      --fetch-cert > mycert.pem\n\nto retrieve the public cert used for encryption and store it locally. You can then run 'kubeseal --cert mycert.pem' instead to use the local cert e.g.\n\n    kubectl create secret generic secret-name --dry-run=client --from-literal=foo=bar -o [json|yaml] | \\\n\n    kubeseal \\\n\n      --controller-name=sealed-secrets-controller \\\n\n      --controller-namespace=kube-system \\\n\n      --format [json|yaml] --cert mycert.pem > mysealedsecret.[json|yaml]\n\n3. Apply the sealed secret\n\n    kubectl create -f mysealedsecret.[json|yaml]\n\nRunning 'kubectl get secret secret-name -o [json|yaml]' will show the decrypted secret that was generated from the sealed secret.\n\nBoth the SealedSecret and generated Secret must have the same name and namespace.\n\nInstall client\n\n$ wget https://github.com/bitnami-labs/sealed-secrets/releases/download/v0.24.4/kubeseal-0.24.4-linux-amd64.tar.gz\n\nUtilisation\n\n$ kubeseal -f postgres-pass-secret.yaml -w postgres-pass-secret-sealed.yaml\n\nMettre le secret dans .gitignore pour qu'il ne soit pas versionné. Le sealed secret par contre est encrypté et peut être versionné.\n\nUtilisation\n\n$ kubeseal -f postgres-pass-secret.yaml -w postgres-pass-secret-sealed.yaml\n\nMettre le secret dans .gitignore pour qu'il ne soit pas versionné. Le sealed secret par contre est encrypté et peut être versionné.\n\nDans le fichier kustomize.yaml mettre le sealed secret. 
A l'instanciation dans le cluster il sera transformé localement en secret utilisable dans les déploiements.\n\nAttention un secret n'est valable que pour un namespace.\n\nInstallation Nginx-Ingress (TLS)\n\n# Création du namespace\n\n$ k create ns nginx-ingress\n\n# Création du certificat TLS (en dry run pour garder le fichier)\n\n$ kubectl create secret tls star-marvinpac-com --key=star_marvinpac_com.key --cert=star_marvinpac_com.crt --dry-run=client -o yaml > mvp-tls-secret.yaml\n\n# Importation du certificat\n\n$ k apply -n nginx-ingress -f mvp-tls-secret.yaml\n\n# Supprimer le helm chart s'il existe déjà\n\n$ helm uninstall ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx\n\n# Installer le helm chart\n\n$ helm install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx  --set controller.wildcardTLS.cert=ingress-nginx/star-marvinpac-com --set controller.service.loadBalancerIP=192.168.77.149 --set controller.config.force-ssl-redirect=\"true\"\n\n# Vérifier que la config force-ssl-redirect\n\n$ k get cm -n nginx-ingress ingress-nginx-controller -o yaml\n\nAn example Ingress that makes use of the controller:\n\n  apiVersion: networking.k8s.io/v1\n\n  kind: Ingress\n\n  metadata:\n\n    name: example\n\nTo get the password for \"repmgr\" run:\n\n    export REPMGR_PASSWORD=$(kubectl get secret --namespace analytics postgres-ha-postgresql-ha-postgresql -o jsonpath=\"{.data.repmgr-password}\" | base64 -d)\n\nTo connect to your database run the following command:\n\n    kubectl run postgres-ha-postgresql-ha-client --rm --tty -i --restart='Never' --namespace analytics --image docker.io/bitnami/postgresql-repmgr:16.3.0-debian-12-r11 --env=\"PGPASSWORD=$POSTGRES_PASSWORD\"  \\\n\n        --command -- psql -h postgres-ha-postgresql-ha-pgpool -p 5432 -U postgres -d postgres\n\nTo connect to your database from outside the cluster execute the following commands:\n\n  NOTE: It may take a few minutes for the LoadBalancer IP to be available.\n\n 
       Watch the status with: 'kubectl get svc --namespace analytics -w postgres-ha-postgresql-ha-pgpool\n\n    export SERVICE_IP=$(kubectl get svc --namespace analytics postgres-ha-postgresql-ha-pgpool --template \"{{ range (index .status.loadBalancer.ingress 0) }}{{ . }}{{ end }}\")\n\n    PGPASSWORD=\"$POSTGRES_PASSWORD\" psql -h $SERVICE_IP -p 5432  -U postgres -d postgres\n\nWARNING: There are \"resources\" sections in the chart not set. Using \"resourcesPreset\" is not recommended for production. For production installations, please set the following values according to your workload needs:\n\n- pgpool.resources\n\n- postgresql.resources\n\n- witness.resources\n\n+info https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/\n\nK8S Find unmounted PVCs\n\ncsi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph\n\n  csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node\n\n  csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph\n\nreclaimPolicy: Delete\n\nInstall sealed-secrets\n\nInstall server\n\n$ helm repo add sealed-secrets https://bitnami-labs.github.io/sealed-secrets\n\n$ help repo update\n\n$ helm install sealed-secrets -n kube-system --set-string fullnameOverride=sealed-secrets-controller sealed-secrets/sealed-secrets\n\nNAME: sealed-secrets\n\nLAST DEPLOYED: Thu Nov 16 12:49:03 2023\n\nNAMESPACE: kube-system\n\nSTATUS: deployed\n\nREVISION: 1\n\nTEST SUITE: None\n\nNOTES:\n\n** Please be patient while the chart is being deployed **\n\nYou should now be able to create sealed secrets.\n\n1. Install the client-side tool (kubeseal) as explained in the docs below:\n\n    https://github.com/bitnami-labs/sealed-secrets#installation-from-source\n\n2. 
Create a sealed secret file running the command below:\n\n    kubectl create secret generic secret-name --dry-run=client --from-literal=foo=bar -o [json|yaml] | \\\n\n    kubeseal \\\n\n      --controller-name=sealed-secrets-controller \\\n\n      --controller-namespace=kube-system \\\n\n      --format yaml > mysealedsecret.[json|yaml]\n\nThe file mysealedsecret.[json|yaml] is a commitable file.\n\nIf you would rather not need access to the cluster to generate the sealed secret you can run:\n\n    kubeseal \\\n\n      --controller-name=sealed-secrets-controller \\\n\n      --controller-namespace=kube-system \\\nHuman: How do I seal a password using kubeseal?"
  ]
}

In [None]:
# Print the first entry of the "prompts" list so its \n escapes show as line breaks.
print(mydict["prompts"][0])

In [None]:
# Ad-hoc check: load and split one PDF that lives outside the documents folder.
# The absolute path is specific to one machine, so it can be overridden with the
# KB_SAMPLE_PDF environment variable; the default is the original location.
sample_pdf_path = os.environ.get(
    "KB_SAMPLE_PDF",
    "C:/Users/olivier.boudry/Downloads/KB-214.pdf",
)
el = load_elements_from_file(sample_pdf_path)