In [None]:
import os
import time

from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

# Load OPENAI_* settings from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    # Accept either spelling of the endpoint variable: this cell historically
    # read OPENAI_API_BASE while the later client cell reads OPENAI_BASE_URL.
    base_url=os.getenv("OPENAI_API_BASE") or os.getenv("OPENAI_BASE_URL"),
)

# Upload the PDF that the assistant will retrieve from. The context manager
# closes the local file handle as soon as the upload call returns.
with open("llama2.pdf", "rb") as pdf_file:
    file = client.files.create(
        file=pdf_file,
        purpose='assistants'
    )

# NOTE(review): the `retrieval` tool and `file_ids` argument look like the
# Assistants v1 request shape — confirm against the installed SDK version.
assistant = client.beta.assistants.create(
    instructions="你是个问答机器人，你根据给定的知识回答用户问题。",
    model="gpt-4-1106-preview",
    tools=[{"type": "retrieval"}],
    file_ids=[file.id]
)

# A thread holds the conversation; seed it with the user's question.
thread = client.beta.threads.create()
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="Llama 2"
)

run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id,
    instructions="请用中文回答用户的问题。",
)

# Poll until the run leaves its pending states. Sleeping between polls avoids
# hammering the API, and checking for *pending* states (rather than waiting for
# "completed") guarantees the loop ends when the run fails, expires, is
# cancelled or asks for an action this script cannot provide.
POLL_INTERVAL_SECONDS = 1
PENDING_RUN_STATUSES = ("queued", "in_progress", "cancelling")
while run.status in PENDING_RUN_STATUSES:
    time.sleep(POLL_INTERVAL_SECONDS)
    run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )

if run.status != "completed":
    raise RuntimeError(
        f"Assistant run ended with status {run.status!r}: "
        f"{getattr(run, 'last_error', None)}"
    )

messages = client.beta.threads.messages.list(
    thread_id=thread.id
)

# Print the conversation in reverse of the order the API returned it
# (presumably newest-first by default, so this reads oldest-first — confirm).
for turn in reversed(messages.data):
    print(f"{turn.role.upper()}: {turn.content[0].text.value}")

In [None]:
!pip install pdfminer.six

In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

In [None]:
def extract_text_from_pdf(filename, page_numbers=None, min_line_length=1):
    """Extract the text of a PDF and regroup its lines into paragraphs.

    Args:
        filename: Path of the PDF, passed straight to ``extract_pages``.
        page_numbers: Optional collection of zero-based page indices to keep;
            ``None`` keeps every page.
        min_line_length: Lines shorter than this are dropped and treated as
            paragraph separators.

    Returns:
        A list of paragraph strings. Consecutive kept lines are joined with a
        single space; a line ending in ``-`` is treated as a word broken
        across lines, so the hyphen is removed and the next line is appended
        without a space. Paragraphs carry no leading space.
    """
    # Collect the text of every text container on the selected pages.
    chunks = []
    for page_index, page_layout in enumerate(extract_pages(filename)):
        if page_numbers is not None and page_index not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                chunks.append(element.get_text())

    # Each container is terminated by an extra newline so that containers are
    # separated by (at least) one empty line.
    lines = ''.join(chunk + '\n' for chunk in chunks).split('\n')

    paragraphs = []
    buffer = ''
    glue_next = False  # True when the previous line ended with a hyphen
    for line in lines:
        if len(line) < min_line_length:
            # A too-short line closes the paragraph collected so far.
            if buffer:
                paragraphs.append(buffer)
            buffer = ''
            glue_next = False
            continue
        if buffer and not glue_next:
            buffer += ' '
        if line.endswith('-'):
            # Only trailing hyphens are removed; leading ones are content.
            buffer += line.rstrip('-')
            glue_next = True
        else:
            buffer += line
            glue_next = False
    if buffer:
        paragraphs.append(buffer)
    return paragraphs

In [None]:
# Split the paper into paragraphs, discarding lines shorter than 10 characters.
paragraphs = extract_text_from_pdf(
    filename="llama2.pdf",
    min_line_length=10,
)

In [None]:
!pip install elasticsearch7
!pip install nltk

In [None]:
from elasticsearch7 import Elasticsearch, helpers
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re

import warnings

# Suppress every warning for the rest of the session to keep cell output clean.
warnings.simplefilter("ignore")

# Tokenizer models and the stop-word list used by to_keywords().
# NOTE(review): recent NLTK releases load `punkt_tab` instead of `punkt` in
# word_tokenize; both are fetched so either version works — confirm the
# installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
def to_keywords(input_string):
    """Reduce English text to a space-separated string of stemmed keywords.

    Every character that is not an ASCII letter, digit or whitespace becomes a
    space; the result is tokenized, English stop words are removed
    (case-insensitively) and each remaining token is Porter-stemmed.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', input_string)
    tokens = word_tokenize(cleaned)
    english_stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stems = []
    for token in tokens:
        if token.lower() in english_stops:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [None]:
# Connection settings are read from the environment so that real hosts and
# credentials do not have to be typed into the notebook. The original
# placeholder literals remain as defaults.
es = Elasticsearch(
    hosts=[os.getenv("ELASTICSEARCH_HOST", 'ip address')],
    http_auth=(
        os.getenv("ELASTICSEARCH_USER", "elastic"),
        os.getenv("ELASTICSEARCH_PASSWORD", "password"),
    ),
)

# Recreate the index from scratch so re-running this cell never duplicates
# documents.
index_name = "string_index"
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

es.indices.create(index=index_name)

# One document per paragraph: `keywords` is the stemmed form that search()
# matches against, `text` is the original paragraph returned to the caller.
actions = [
    {
        "_index": index_name,
        "_source": {
            "keywords": to_keywords(para),
            "text": para
        }
    }
    for para in paragraphs
]

helpers.bulk(es, actions)

In [None]:
def search(query_string, top_n=3):
    """Return the `text` of the best keyword matches for a query.

    The query is normalised with ``to_keywords`` and matched against the
    ``keywords`` field of ``index_name``; at most ``top_n`` hits are requested.
    """
    keyword_query = to_keywords(query_string)
    response = es.search(
        index=index_name,
        query={"match": {"keywords": keyword_query}},
        size=top_n,
    )
    texts = []
    for hit in response["hits"]["hits"]:
        texts.append(hit["_source"]["text"])
    return texts

In [None]:
# Smoke-test the keyword search: show the two best paragraphs, each followed
# by a blank line.
results = search("how many parameters does llama 2 have?", 2)
for hit_text in results:
    print(f"{hit_text}\n")

In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
# Read OPENAI_API_KEY / OPENAI_BASE_URL from the nearest .env file.
_ = load_dotenv(find_dotenv())

# Chat-completions client used by get_completion().
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url=os.environ.get("OPENAI_BASE_URL"),
)

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    The request uses ``temperature=0``; only the content of the first
    returned choice is handed back.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def build_prompt(prompt_template, **kwargs):
    """Fill ``__NAME__`` placeholders in a prompt template.

    Each keyword argument ``name=value`` replaces every occurrence of
    ``__NAME__`` (the upper-cased keyword wrapped in double underscores).

    Values are rendered as follows:
        * ``str``: used as is;
        * ``list`` containing only strings: joined with newlines;
        * anything else: ``str(value)``.

    All placeholders are substituted in a single pass over the template, so
    placeholder-looking text inside a substituted value (for example a
    retrieved paragraph or user query that happens to contain ``__QUERY__``)
    is left untouched instead of being expanded by a later keyword.

    Returns:
        The template with all known placeholders replaced; unknown
        placeholders are left in place.
    """
    replacements = {}
    for k, v in kwargs.items():
        if isinstance(v, str):
            val = v
        elif isinstance(v, list) and all(isinstance(elem, str) for elem in v):
            val = '\n'.join(v)
        else:
            val = str(v)
        # Keywords differing only in case map to the same token; the first
        # one wins.
        replacements.setdefault(f"__{k.upper()}__", val)

    if not replacements:
        return prompt_template

    placeholder_pattern = re.compile(
        '|'.join(re.escape(token) for token in replacements)
    )
    # A function replacement keeps backslashes in values literal.
    return placeholder_pattern.sub(
        lambda match: replacements[match.group(0)], prompt_template
    )

In [None]:
# Fallback template, used only when no earlier cell has defined
# `prompt_template`; without it this cell raises NameError on a fresh kernel.
# The build_prompt() call below fills __INFO__ and __QUERY__.
if "prompt_template" not in globals():
    prompt_template = """
You are a question-answering assistant.
Answer the user's question using only the known information below.
If the known information is not sufficient to answer, reply "I cannot answer your question."

Known information:
__INFO__

User question:
__QUERY__
"""

user_query = "how many parameters does llama 2 have?"

# 1. Retrieve the two best-matching paragraphs.
search_results = search(user_query, 2)

# 2. Build the prompt from the retrieved text and the question.
prompt = build_prompt(prompt_template, info=search_results, query=user_query)
print(prompt)

# 3. Ask the model to answer from the assembled prompt.
response = get_completion(prompt)
print(response)