In [39]:
import json
import os
import sys
import numpy as np
import time
import urllib.request
import traceback
from transformers import LlamaForCausalLM, LlamaTokenizer
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import BSHTMLLoader
from langchain.document_loaders import DirectoryLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from bs4 import BeautifulSoup
import requests
import lxml
import logging

import src.analyse as al

In [2]:
# Construct the project's Analyse object for the 'prod' configuration and the
# 'nymex crude oil' subject.
# NOTE(review): the meaning of the two trailing 2s is not visible from this
# notebook - confirm against src.analyse.Analyse.
a = al.Analyse(
    'prod',
    'nymex crude oil',
    2,
    2,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 
from_string grammar:



root ::= arr 
arr ::= [[] [<U+000A>] ws arr_12 []] 
value ::= object | array | string | number | value_7 ws 
object ::= [{] ws object_16 [}] ws 
array ::= [[] ws array_20 []] ws 
string ::= ["] string_23 ["] ws 
number ::= number_24 number_30 number_34 ws 
value_7 ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] | [n] [u] [l] [l] 
ws ::= ws_36 
arr_9 ::= value arr_11 
arr_10 ::= [,] [<U+000A>] ws value 
arr_11 ::= arr_10 arr_11 | 
arr_12 ::= arr_9 | 
object_13 ::= string [:] ws value object_15 
object_14 ::= [,] ws string [:] ws value 
object_15 ::= object_14 object_15 | 
object_16 ::= object_13 | 
array_17 ::= value array_19 
array_18 ::= [,] ws value 
array_19 ::= array_18 array_19 | 
array_20 ::= array_17 | 
string_21 ::= [^"\] | [\] string_22 
string_22 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
string_23 ::= string_21 string_23 | 
number_24 ::= number_25 number_26 
number_25 ::= [-] | 
number_26 ::= [0-9] | [1-9] number_27 
number_27 ::= [0-9] number_27 | 
numb

In [None]:
# NOTE(review): scratch cell that was never executed (execution count is None).
# It only binds x and displays it, and the same two lines recur in the last
# cell of this notebook - looks like a leftover; consider deleting.
x = 1
x

In [46]:
# Prompt templates for the retrieval QA chain. Each one exposes the two
# placeholders {context} and {question}.

# Lines shared by every template: the retrieved context wrapped in tags.
_CONTEXT_LINES = ["<context>", "{context}", "</context>"]


def _json_dict_template(first_key):
    """Return the Human/Assistant prompt asking for a json dictionary.

    The requested keys are *first_key*, "confidence" and "explanation".
    """
    # NOTE(review): there is no space between 'explanation".' and "Don't".
    # The text is kept exactly as it was because editing the prompt changes
    # what the model sees.
    instruction = "".join([
        "Human: Use the following pieces of context to provide a concise answer ",
        "to the question at the end. Answer in the form of a json dictionary ",
        'with the keys: "', first_key, '", "confidence" and "explanation".',
        "Don't include lists, apostrophes or quotes in any part of the answer.",
    ])
    return "\n".join(
        [""] + [instruction] + _CONTEXT_LINES + ["Question: {question}", "Assistant:"]
    )


# Free-form question answered as a list summary.
TEMPLATE_A = "\n".join(
    [
        "",
        "Answer the question at the end using the following context. "
        "Answer the question by showing a list summary.",
    ]
    + _CONTEXT_LINES
    + ["Q: {question}? A:", ""]
)

# NOTE(review): TEMPLATE_B and TEMPLATE_C ask for a json dictionary, while the
# grammar file loaded elsewhere in this notebook is named json_arr.gbnf and its
# printed root rule is an array - confirm the two are meant to be combined.
TEMPLATE_B = _json_dict_template("effect")
TEMPLATE_C = _json_dict_template("change")

In [42]:
def CreateVectorDB(embeddings, config_file, analysis_id):
    """Build a FAISS vector store from the HTML files of one analysis.

    The files are read from ``<cwd>/output/<config_file>/html/<analysis_id>/``
    with ``BSHTMLLoader``, split into chunks of 750 characters with an overlap
    of 50, and embedded with ``embeddings``.

    Args:
        embeddings: Embedding object handed to ``FAISS.from_documents``.
        config_file: Name of the sub-directory of ``output`` to read from
            (this notebook passes ``'prod'``).
        analysis_id: Name of the sub-directory of ``html`` holding the files.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If loading and splitting produced no document chunks.
        Exception: Any other error raised while loading, splitting or
            embedding is logged and re-raised with its original type.
    """
    try:
        # Joining with '' appends the platform's own separator; a hard-coded
        # backslash only forms a valid directory path on Windows.
        vector_db_path = os.path.join(
            os.getcwd(), 'output', config_file, 'html', analysis_id, ''
        )
        logging.info('loading vector db at directory: %s', vector_db_path)
        loader = DirectoryLoader(
            vector_db_path,
            loader_cls=BSHTMLLoader,
            loader_kwargs={'open_encoding': 'utf8'},
        )
        documents = loader.load()
        logging.info('splitting documents')
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=50)
        docs = text_splitter.split_documents(documents)
        if not docs:
            # Fail with a readable message instead of an opaque error from the
            # vector store when there is nothing to index.
            raise ValueError('no documents found in directory: ' + vector_db_path)
        logging.info('generating embeddings')
        return FAISS.from_documents(docs, embeddings)
    except Exception:
        # logging.exception already records the active traceback, so a second
        # call with traceback.format_exc() would only duplicate it.
        logging.exception("error generating embeddings")
        # Bare raise keeps the original exception type and traceback instead
        # of flattening everything into a generic Exception.
        raise

In [48]:
# The analysis directory name is the subject with every space replaced by an
# underscore.
subject = 'nymex crude oil'
analysis_id = '_'.join(subject.split(' '))

# Embedding object handed to CreateVectorDB.
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

# Index the HTML output of the 'prod' configuration for this subject.
vectorstore_faiss = CreateVectorDB(
    embeddings=embeddings,
    config_file='prod',
    analysis_id=analysis_id,
)

In [49]:

def LoadContextAndRunLLM(query, template, vectorstore_faiss, config_file, analysis_id,
                         model_file='llama-2-13b-chat.Q8_0.gguf',
                         grammar_file='json_arr.gbnf',
                         k=3):
    """Answer ``query`` with a llama.cpp model over context retrieved from FAISS.

    A ``LlamaCpp`` model is loaded from ``<cwd>/models`` on every call, wrapped
    in a 'stuff' ``RetrievalQA`` chain that retrieves the ``k`` most similar
    chunks from ``vectorstore_faiss``, and invoked once.

    Args:
        query: Question text passed to the chain under the ``"query"`` key.
        template: Prompt template text with ``{context}`` and ``{question}``
            placeholders.
        vectorstore_faiss: Vector store providing ``as_retriever``.
        config_file: Not used by this function; kept so existing calls keep
            working.
        analysis_id: Not used by this function; kept so existing calls keep
            working.
        model_file: File name of the model inside ``<cwd>/models``.
        grammar_file: File name of the grammar inside ``<cwd>/models``
            that constrains the output, or ``None`` to generate without a
            grammar. The default grammar is the one this notebook prints with
            an array as its root rule, so prompts that ask for a json
            dictionary probably need a different file.
        k: Number of chunks the retriever returns for the prompt context.

    Returns:
        Whatever the chain call returns; in this notebook a dict with the keys
        ``'query'`` and ``'result'``.
    """
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])

    models_dir = os.path.join(os.getcwd(), 'models')
    model_path = os.path.join(models_dir, model_file)
    # A falsy grammar_file switches grammar-constrained sampling off.
    grammar_path = os.path.join(models_dir, grammar_file) if grammar_file else None

    llm = LlamaCpp(
        model_path=model_path,
        temperature=0.0,
        top_p=1,
        n_ctx=4096,
        verbose=True,
        n_gpu_layers=40,
        n_batch=512,
        grammar_path=grammar_path,
    )

    retriever = vectorstore_faiss.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type='stuff',
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": prompt},
    )
    return qa({"query": query})

In [50]:
# Ask the opening question with the list-summary prompt (TEMPLATE_A) and show
# the returned value as the cell output.
init_answer = LoadContextAndRunLLM(
    query='what are the 5 biggest factors that effect the price of nymex crude oil',
    template=TEMPLATE_A,
    vectorstore_faiss=vectorstore_faiss,
    config_file='prod',
    analysis_id=analysis_id,
)
init_answer

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 
from_string grammar:



root ::= arr 
arr ::= [[] [<U+000A>] ws arr_12 []] 
value ::= object | array | string | number | value_7 ws 
object ::= [{] ws object_16 [}] ws 
array ::= [[] ws array_20 []] ws 
string ::= ["] string_23 ["] ws 
number ::= number_24 number_30 number_34 ws 
value_7 ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] | [n] [u] [l] [l] 
ws ::= ws_36 
arr_9 ::= value arr_11 
arr_10 ::= [,] [<U+000A>] ws value 
arr_11 ::= arr_10 arr_11 | 
arr_12 ::= arr_9 | 
object_13 ::= string [:] ws value object_15 
object_14 ::= [,] ws string [:] ws value 
object_15 ::= object_14 object_15 | 
object_16 ::= object_13 | 
array_17 ::= value array_19 
array_18 ::= [,] ws value 
array_19 ::= array_18 array_19 | 
array_20 ::= array_17 | 
string_21 ::= [^"\] | [\] string_22 
string_22 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
string_23 ::= string_21 string_23 | 
number_24 ::= number_25 number_26 
number_25 ::= [-] | 
number_26 ::= [0-9] | [1-9] number_27 
number_27 ::= [0-9] number_27 | 
numb

{'query': 'what are the 5 biggest factors that effect the price of nymex crude oil',
 'result': '[\n"Global supply and demand",\n"Economic growth",\n"Geopolitical events",\n"OPEC decisions",\n"Currency fluctuations"\n]'}

In [8]:
# Plain-text dump of the value returned by LoadContextAndRunLLM.
# NOTE(review): the saved output under this cell comes from an earlier
# execution (In [8]) and shows a different query than the In [50] cell that
# currently assigns init_answer - re-run to refresh it.
print(init_answer)

{'query': 'what factors effect the price of nymex crude oil', 'result': '[\n"Current 2 biggest factors",\n"effect the price"\n]'}


In [None]:
# NOTE(review): scratch cell that was never executed (execution count is None).
# It only binds x and displays it, and an identical cell appears earlier in
# this notebook - looks like a leftover; consider deleting.
x = 1
x