In [2]:
import os
import sys
import requests
from typing import Any, List, Mapping, Optional
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

In [3]:
loader = PyPDFLoader("./data-preprocessing-for-imbalanced-data.pdf")
pages = loader.load()

In [4]:
len(pages)

22

In [5]:
def document_processor(doc):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size = 1000,
        chunk_overlap = 200,
        length_function = len
    )
    chunks = text_splitter.split_text(doc)

    emb = HuggingFaceEmbeddings()
    knowledgeBase = FAISS.from_texts(chunks, emb)

    return knowledgeBase

In [6]:
pages[0]

Document(page_content=' \n  \nA Study of Data Pre -processing Techniques for Imbalanced Biomedical \nData Classification  \nShigang  Liu*; Jun  Zhang *; Yang Xiang; Wanlei Zhou; Dongxi Xiang \nAddress:  \nDepartment of Computer Science and Software Engineering , Swinburne University of \nTechnology, Hawthorn , VIC 3122, Australia , email:  shigangliu@swin.edu.au \nDepartment of Computer Science and Software Engineering , Swinburne University of \nTechnology, Hawthorn , VIC 3122, Australia , email: junzhang@swin.edu.au \nDepartment of Computer Science and Software Engineering , Swinburne University of \nTechnology, Hawthorn , VIC 3122, Australia , yxiang@swin.edu.au  \nSchool of Information Technology, Deakin University , Burwood, VIC 3125, Australia , \nemail: wanlei@deakin.edu.au  \nDepartment of Genetics, Harvard Medical School , Boston MA 02115, American , email:  \ndxiangmedsci@gmail.com \n \nBiographical notes: \nShigang Liu is a research fellow with School  of Software and Electr

In [8]:
text = ""

for i in range(len(pages)):
    text += pages[i].page_content

knowledge = document_processor(text)

query = "What is imbalanced data"

docs = knowledge.similarity_search(query)

In [21]:
text = ""
for doc in docs:
    text += str(doc)[13:]

In [23]:
print(str(text))

'been conducted before. Most i mportantly, data distribution has  never been considered in previous  imbalanced \nbiomedical data studies.   \nDifferent from other related work, the experimentation study in this paper mainly  focuses  on resampling and feature \nselection techniques  in class imbalance problem  with data distribution being considered as well . The main \ncontribution s of this paper are as  follows : (1) we have conducted  an extensive experiment  study and (2) the relationship  \nbetween  data distributions and different class imbalance  learning techniques have been discussed. Precisely, for the \nformer  contribution: f irstly, our study focuses  on recently developed and popular ly used sampling techniques . In the \nmeantime, considering that feature selection (FS) is also beneficial to imbalanced data learning, one of the recently \ndeveloped FS approach es is also employed in this study (Yu et al. 2014). Secondly, five classification algorithms have''A Study of 

In [26]:
import json
payload_dict = {"system_message" : 2, "user_message": 3, "max_tokens": 3, "context": 3}
        
json_payload = json.dumps(payload_dict)
json_payload

'{"system_message": 2, "user_message": 3, "max_tokens": 3, "context": 3}'

In [13]:
from pydantic import Extra
import requests
from typing import Any, List, Mapping, Optional

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

class LlamaLLM(LLM):
    llm_url = 'http://127.0.0.1:5000/invoke_llm'

    class Config:
        extra = Extra.forbid

    @property
    def _llm_type(self) -> str:
        return "Llama2 7B"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")

        payload = {
            "inputs": prompt,
            "parameters": {"max_new_tokens": 1000},
            "token": "abcdfejkwehr"
        }

        headers = {"Content-Type": "application/json"}

        response = requests.post(self.llm_url, json=payload, headers=headers, verify=False)
        response.raise_for_status()

        return response.json()['generated_text']  
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"llmUrl": self.llm_url}

In [14]:
llm = LlamaLLM()
chain = load_qa_chain(llm, chain_type='stuff')

In [9]:
from langchain.prompts import PromptTemplate
template = """[INST] <<SYS>>

Answer the question base on the context below.

<</SYS>>

Context: {context}
Question: {question}
Answer:
[/INST]"""

In [1]:
import json
payload = """{
  "system_message": "You are a helpful assistant",
  "user_message": "Generate the best five cities for a romantic trip",
  "max_tokens": 2000
}"""
json_payload = json.loads(str(payload))

print(json_payload)

{'system_message': 'You are a helpful assistant', 'user_message': 'Generate the best five cities for a romantic trip', 'max_tokens': 2000}


In [34]:
print(response.json()['choices'][0]['text'].split('[/INST]')[1])

  Certainly! Here are five romantic cities around the world that are perfect for a dreamy getaway with your significant other:

1. Paris, France - The City of Love: Paris is often referred to as the most romantic city in the world, and for good reason. From the Eiffel Tower to the Louvre, this city is brimming with iconic landmarks that will make your heart melt. Take a stroll along the Seine, visit the Notre-Dame Cathedral, and enjoy a picnic in the Luxembourg Gardens. Don't forget to indulge in some delicious French cuisine and wine to complete your experience.
2. Venice, Italy - A Gondola Ride to Remember: Venice is a city like no other, with its winding canals and charming bridges. Take a romantic gondola ride through the city's canals, holding hands with your partner and soaking in the stunning architecture and history. Don't miss the Rialto Bridge, St. Mark's Basilica, and the Grand Canal.
3. New York City, USA - The City That Never Sleeps: New York City is a bustling metropolis 

In [12]:
docs

[Document(page_content='been conducted before. Most i mportantly, data distribution has  never been considered in previous  imbalanced \nbiomedical data studies.   \nDifferent from other related work, the experimentation study in this paper mainly  focuses  on resampling and feature \nselection techniques  in class imbalance problem  with data distribution being considered as well . The main \ncontribution s of this paper are as  follows : (1) we have conducted  an extensive experiment  study and (2) the relationship  \nbetween  data distributions and different class imbalance  learning techniques have been discussed. Precisely, for the \nformer  contribution: f irstly, our study focuses  on recently developed and popular ly used sampling techniques . In the \nmeantime, considering that feature selection (FS) is also beneficial to imbalanced data learning, one of the recently \ndeveloped FS approach es is also employed in this study (Yu et al. 2014). Secondly, five classification algor

In [None]:
def main():
    st.title("Chat with your PDF 💬")
    
    pdf = st.file_uploader('Upload your PDF Document', type='pdf')
    
    if pdf is not None:
        pdf_reader = PdfReader(pdf)
        # Text variable will store the pdf text
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        
        # Create the knowledge base object
        knowledgeBase = process_text(text)
        
        query = st.text_input('Ask a question to the PDF')
        cancel_button = st.button('Cancel')
        
        if cancel_button:
            st.stop()
        
        if query:
            docs = knowledgeBase.similarity_search(query)

            llm = OpenAI()
            chain = load_qa_chain(llm, chain_type='stuff')
            
            with get_openai_callback() as cost:
                response = chain.run(input_documents=docs, question=query)
                print(cost)
                
            st.write(response)
            
            
if __name__ == "__main__":
    main()

In [27]:
docs

[Document(page_content='been conducted before. Most i mportantly, data distribution has  never been considered in previous  imbalanced \nbiomedical data studies.   \nDifferent from other related work, the experimentation study in this paper mainly  focuses  on resampling and feature \nselection techniques  in class imbalance problem  with data distribution being considered as well . The main \ncontribution s of this paper are as  follows : (1) we have conducted  an extensive experiment  study and (2) the relationship  \nbetween  data distributions and different class imbalance  learning techniques have been discussed. Precisely, for the \nformer  contribution: f irstly, our study focuses  on recently developed and popular ly used sampling techniques . In the \nmeantime, considering that feature selection (FS) is also beneficial to imbalanced data learning, one of the recently \ndeveloped FS approach es is also employed in this study (Yu et al. 2014). Secondly, five classification algor

In [None]:
query = "What is imbalanced data"
doc = docs.similarity_search(query)
chain = load_qa_chain()
