## Quick OpenAI API Test

Make sure the `OPENAI_API_KEY` and `LAKERA_API_KEY` environment variables are set (for example in a local `.env` file) before running this notebook!

In [2]:
from dotenv import load_dotenv

# Read a local .env file (if one exists) so the API keys end up in the process environment.
load_dotenv()

True

In [3]:
from langchain.llms import OpenAI

# OpenAI LLM wrapper with default settings; presumably reads OPENAI_API_KEY from the environment.
llm = OpenAI()

In [None]:
# Mic check 1-2-1-2: send a trivial prompt to confirm the LLM client can reach the API.
llm.predict("hi!")

## Load Data

In [6]:
from langchain.document_loaders import JSONLoader
from pathlib import Path


import os

# Location of the LSM articles JSON dump. Set LSM_ARTICLES_PATH (e.g. in .env) to
# point at your own copy; the fallback is the path used when the notebook was written.
data_path = Path(
    os.environ.get(
        "LSM_ARTICLES_PATH",
        "/mnt/c/Users/davis/OneDrive/Documents/grabeklis/data/lsm_articles_all_20230924.json",
    )
)

# jq_schema ".[]" selects every element of the top-level JSON array.
# text_content=False: the selected elements are JSON objects rather than plain
# strings (documents[0].page_content shows the serialized object).
loader = JSONLoader(
    file_path=data_path,
    jq_schema=".[]",
    text_content=False,
)

documents = loader.load()

In [None]:
# Inspect the first loaded document to check what the loader produced.
documents[0]

## Embed Data

In [7]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client with default settings; its `model` attribute is reused below as the cache namespace.
embeddings = OpenAIEmbeddings()

In [8]:
from pathlib import Path
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings

# Keep computed embeddings on disk, in ../cache relative to the current working directory.
path_cache = Path("../cache/").resolve()
fs = LocalFileStore(path_cache)

# Wrap the embedder so results are written to / read from the file store.
# The model name is used as the key namespace (visible as the key prefix in the store).
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings, fs, namespace=embeddings.model
)

In [11]:
# Peek at the first few cache keys. The store is only empty on a first run;
# keys from an earlier embedding run persist on disk.
list(fs.yield_keys())[:5]

['text-embedding-ada-002db5db37f-6f41-5544-ba98-2b8d0871aaab',
 'text-embedding-ada-0022ccce7e7-dae9-55c8-a087-8b94ce5409a9',
 'text-embedding-ada-0029f93beaf-62ac-53ae-ae2a-522a55801eb9',
 'text-embedding-ada-002481cff6d-c5cf-5766-80ad-56138b4c8c4d',
 'text-embedding-ada-0022cd20271-690b-5123-99d4-2e802ddf4830']

In [12]:
from langchain.vectorstores import FAISS

# Build the FAISS index from the first 2000 documents only, embedding through the cache-backed embedder.
db = FAISS.from_documents(documents[:2000], cached_embedder)

In [13]:
# After building the index, count the cached entries (one per embedded document).
len(list(fs.yield_keys()))

2000

## Retriever

From: https://python.langchain.com/docs/modules/data_connection/retrievers/

In [14]:
# Expose the FAISS index through the retriever interface, using default search settings.
retriever = db.as_retriever()

In [15]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


# Re-create the LLM (same default configuration as earlier in the notebook) and
# build a RetrievalQA chain of type "stuff" on top of the FAISS retriever.
llm = OpenAI()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [62]:
# Latvian prompt: select articles that mention the President of Latvia and report
# title, publication date and link in a fixed format.
# The template label is spelled "Virsraksts" (title); it was previously misspelled.
query = """Atlasi rakstus, kur ir pieminēts Latvijas valsts prezidents!
Norādi raksta virsrakstu, publicēšanas datumu un linku uz rakstu pēc formāta:

Virsraksts:
Publicēšanas datums:
Links:
"""

result = qa.run(query)

In [63]:
# Show the raw answer string returned by the QA chain.
result

'\nVirsraksts: Vēsturnieks Krūmiņš: Tautas vēlēts prezidents – vai nācija gatava dot vairāk pilnvaru vienam cilvēkam?\nPublicēšanas datums: 4. augusts, 13:10\nLinks: https://www.lsm.lv/raksts/zinas/latvija/04.08.2023-vesturnieks-krumins-tautas-velets-prezidents-vai-nacija-gatava-dot-vairak-pilnvaru-vienam-cilvekam.a518958/'

In [17]:
# Latvian: "On what date was the article about Georgia and the South Ossetian separatists published?"
query = "Kurā datumā ir publicēts raksts par Gruzijas un Dienvidosetijas separātistiem?"
qa.run(query)

' 3. augusts, 2008'

In [16]:
# Latvian: "Is anything about Georgia mentioned in these documents?"
query = "Vai šajos dokumentos ir kaut kas minēts par Gruziju?"
qa.run(query)

' Jā, pirmajā dokumentā ir minēts, ka Krievija ir aicinājusi Gruzijas un Dienvidosetijas separātistus panākt miermīlīgu risinājumu militāram konfliktam.'

In [15]:
# Show the first source document for comparison with the answers above.
documents[0]

Document(page_content='{"url": "https://www.lsm.lv/raksts/zinas/arzemes/simtiem-cilveku-beg-no-apsaudem-dienvidosetija-pastav-bazas-par-kara-izcelsanas-iespeju.a50961/", "publish_date": "3. augusts, 2008, 11:50", "category": "Pasaul\\u0113", "title": "Simtiem cilv\\u0113ku b\\u0113g no ap\\u0161aud\\u0113m Dienvidosetij\\u0101, past\\u0101v ba\\u017eas par kara izcel\\u0161an\\u0101s iesp\\u0113ju", "lead": "Krievija aicin\\u0101jusi Gruzijas un Dienvidosetijas separ\\u0101tistus pan\\u0101kt mierm\\u012bl\\u012bgu risin\\u0101jumu milit\\u0101ram konfliktam, kura d\\u0113\\u013c nakt\\u012b uz sv\\u0113tdienu g\\u0101ja boj\\u0101 se\\u0161i cilv\\u0113ki.", "article": "Krievija aicin\\u0101jusi Gruzijas un Dienvidosetijas separ\\u0101tistus pan\\u0101kt mierm\\u012bl\\u012bgu risin\\u0101jumu milit\\u0101ram konfliktam, kura d\\u0113\\u013c nakt\\u012b uz sv\\u0113tdienu g\\u0101ja boj\\u0101 se\\u0161i cilv\\u0113ki. No Gruzijas ap\\u0161aud\\u012btaj\\u0101m Dienvidosetijas pils\\u

## Lakera Guard

In [16]:
import os
import requests

def lakera_guard(query: str, timeout: float = 10.0):
    """Send ``query`` to the Lakera Guard API and return its decoded JSON response.

    Args:
        query: Text to be screened (prompt injection, jailbreak, etc.).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed JSON body of the Lakera Guard response.

    Raises:
        RuntimeError: If the ``LAKERA_API_KEY`` environment variable is not set.
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        requests.RequestException: On timeouts or connection failures.
    """
    api_key = os.environ.get("LAKERA_API_KEY")
    if not api_key:
        # Fail early with a clear message instead of sending "Bearer None".
        raise RuntimeError("LAKERA_API_KEY environment variable is not set.")

    response = requests.post(
        "https://api.lakera.ai/v1/guard",
        json={"input": query},
        headers={"Authorization": f"Bearer {api_key}"},
        # Without a timeout a stalled connection would hang the notebook indefinitely.
        timeout=timeout,
    )
    # Surface HTTP errors here rather than as a confusing KeyError in the caller.
    response.raise_for_status()
    return response.json()

In [54]:
class LakeraGuardException(Exception):
    """Base class for all errors raised from a Lakera Guard check.

    Attributes:
        message: Human-readable description; also passed to ``Exception``.
    """

    def __init__(self, message):
        super().__init__(message)
        self.message = message

class LakeraGuardHackException(LakeraGuardException):
    """Signals a detected jailbreak or prompt-injection attempt."""

    def __init__(self, message="Tika identificēts uzlaušanas mēģinājums."):
        # The base class stores `message` on the instance and forwards it to Exception.
        super().__init__(message)
        
class LakeraGuardUrlException(LakeraGuardException):
    """Signals that the query contains a link, which is not allowed."""

    def __init__(self, message="Jautājums nedrīkst saturēt linkus."):
        # The base class stores `message` on the instance and forwards it to Exception.
        super().__init__(message)
        
class LakeraGuardPiiException(LakeraGuardException):
    """Signals that the query contains blocked personal data (PII)."""

    def __init__(self, message="Tika identificēts personu datu pārkāpums(-i)"):
        # The base class stores `message` on the instance and forwards it to Exception.
        super().__init__(message)

def lakera_guard_chain(input_dict):
    """Screen ``input_dict["query"]`` with Lakera Guard before passing it on.

    The raw Lakera Guard response is stored on ``input_dict`` under the
    ``"Lakera_Guard"`` key; the dict is updated in place.

    Args:
        input_dict: Chain input; must contain a ``"query"`` entry.

    Returns:
        The same ``input_dict`` when nothing was flagged, so the next step of
        the chain can consume it.

    Raises:
        LakeraGuardHackException: A jailbreak or prompt injection was flagged.
        LakeraGuardUrlException: Unknown links were flagged.
        LakeraGuardPiiException: The payload lists at least one blocked PII
            entity type (e-mail, credit card, IP address, address, phone).
    """
    # Only these PII entity types block a query; any other type is let through.
    blocked_pii_types = {
        'EMAIL_ADDRESS',
        'CREDIT_CARD',
        'IP_ADDRESS',
        'ADDRESS',
        'PHONE_NUMBER',
    }

    answer = lakera_guard(input_dict["query"])
    input_dict.update({"Lakera_Guard": answer})

    # Only the first entry of the response's result list is inspected.
    results = answer["results"][0]
    cats = results['categories']
    payload = results['payload']

    found_pii_types = set()
    if 'pii' in payload:
        found_pii_types = {entity['entity_type'] for entity in payload['pii']}
    trigger_pii = not blocked_pii_types.isdisjoint(found_pii_types)

    if cats['jailbreak'] or cats['prompt_injection']:
        # AI security issue detected
        raise LakeraGuardHackException(f"Tika identificēts uzlaušanas mēģinājums: {results['payload']}")
    if cats['unknown_links']:
        raise LakeraGuardUrlException(f"Tika identificēts nepazīstams links: {results['payload']['unknown_links']}")
    if trigger_pii:
        # Must be raised, not returned: a returned exception object would be
        # handed to the next chain step as if it were valid input.
        raise LakeraGuardPiiException(f"Tika identificēts personu datu pārkāpums(-i): {results['payload']['pii']}")

    # no AI security issue detected
    return input_dict  # pass input further

In [55]:
# Put the Lakera Guard check in front of the QA chain: the guard function receives
# the input dict first and its return value is fed to `qa`.
# NOTE(review): relies on LangChain accepting a plain function on the left of `|` — confirm for the installed version.
secured_QA_chain = lakera_guard_chain | qa
# Test query deliberately packed with PII-like sample data: a card number, an e-mail
# address, an IP address, a phone number, a person's name and a street address.
query = "Vai 378282246310005 kāda alex@bloomberg.uk rakstā 109.202.218.238 tiek pieminēts +41757587256 Aivars Lembergs 1501 Skyland Blvd E Tuscaloosa AL 35405?"

# Turn a guard rejection into a normal-looking result dict instead of a traceback.
try:
    res = secured_QA_chain.invoke({"query": query})
except LakeraGuardException as lge:
    res = {'query': query, 'result': lge.message}

In [56]:
# Show the outcome: either the QA chain's answer or the guard's rejection message.
res

{'query': 'Vai 378282246310005 kāda alex@bloomberg.uk rakstā 109.202.218.238 tiek pieminēts +41757587256 Aivars Lembergs 1501 Skyland Blvd E Tuscaloosa AL 35405?',
 'result': "Tika identificēts nepazīstams links: [{'link': 'bloomberg.uk', 'domain': 'bloomberg.uk', 'path': ''}]"}