In [1]:
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import shutil
import gc
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext, load_index_from_storage

# Flask application object; the route decorators in this file attach to it.
app = Flask(__name__)

# Pick CUDA when torch reports it as available, otherwise fall back to CPU.
# NOTE: in the code visible here `device` is only printed; it is not passed
# to the model constructors below.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Global llama_index settings: chat LLM, embedding model and chunk size.
Settings.llm = HuggingFaceLLM(
    tokenizer_name="h2oai/h2o-danube2-1.8b-chat",
    model_name="h2oai/h2o-danube2-1.8b-chat",
)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="avsolatorio/NoInstruct-small-Embedding-v0",
)
Settings.chunk_size = 1024

class LlamaIndexHelper:
    """Build, persist and query a vector index over a directory of documents.

    On construction the helper either loads a previously persisted index from
    ``persist_dir`` or builds a new one from the files in ``directory_path``,
    and exposes the resulting streaming query engine as ``self.query_engine``.
    """

    def __init__(self, directory_path, embedding_model, delete_existing_index=False,
                 persist_dir='index_dir'):
        """Create the helper and immediately construct (or load) the index.

        Args:
            directory_path: Directory handed to ``SimpleDirectoryReader``.
            embedding_model: Object exposing ``get_text_embedding(text)``;
                used by :meth:`get_embedding`. The index itself is built
                without an explicit embedding-model argument.
            delete_existing_index: When true, remove any persisted index
                before building.
            persist_dir: Directory where the index is persisted / loaded from.
                Defaults to ``'index_dir'`` (the previously hard-coded value).
        """
        self.directory_path = directory_path
        self.embedding_model = embedding_model
        self.persist_dir = persist_dir
        self.query_engine = self.construct_index(delete_existing_index)

    def load_documents(self):
        """Return the documents read from ``self.directory_path``."""
        return SimpleDirectoryReader(self.directory_path).load_data()

    def construct_index(self, delete_existing_index=False):
        """Load the persisted index, or build and persist a new one.

        Args:
            delete_existing_index: When true, the persist directory is removed
                first so the index is always rebuilt from the documents.

        Returns:
            A query engine created with ``streaming=True`` and
            ``similarity_top_k=3``.
        """
        persist_dir = self.persist_dir

        if delete_existing_index and os.path.exists(persist_dir):
            shutil.rmtree(persist_dir)

        os.makedirs(persist_dir, exist_ok=True)

        # The presence of docstore.json is used as the marker that a persisted
        # index exists in persist_dir.
        if os.path.exists(os.path.join(persist_dir, 'docstore.json')):
            storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
            # No truthiness guard here: a skipped load would leave `index`
            # unbound and fail later with a confusing UnboundLocalError.
            index = load_index_from_storage(storage_context)
        else:
            documents = self.load_documents()
            index = VectorStoreIndex.from_documents(documents)
            index.storage_context.persist(persist_dir)

        return index.as_query_engine(streaming=True, similarity_top_k=3, verbose=True)

    def get_embedding(self, text):
        """Return the embedding of ``text``, or ``None`` when there is no text.

        Carriage returns are stripped before embedding. ``None`` and strings
        that are empty after stripping carriage returns yield ``None``.
        """
        if text is None:
            return None

        text = text.replace("\r", "")
        if not text:
            return None
        return self.embedding_model.get_text_embedding(text)

    def reconstruct_index(self):
        """Delete the persisted index, rebuild it and swap in the new query engine."""
        self.query_engine = self.construct_index(delete_existing_index=True)

@app.route('/query', methods=['POST'])
def query():
    """Answer a query against the index.

    Expects a JSON object body with a non-empty string under ``"query"``.

    Returns:
        200 with ``{'results': [...]}`` where the list holds the chunks
        produced by the streaming response, in order;
        400 with ``{'error': ...}`` when the body is not a JSON object or no
        usable query string is supplied;
        500 with ``{'error': ...}`` when querying the index fails.
    """
    try:
        print("Received query request")
        # silent=True yields None for a missing, mistyped or malformed JSON
        # body instead of raising, so client mistakes are reported as 400
        # below rather than being caught by the generic handler as 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            print("Request body is not a JSON object")
            return jsonify({'error': 'Request body must be a JSON object'}), 400
        print("Parsed request data")
        query_text = data.get('query')
        print(f"Query text: {query_text}")

        if not isinstance(query_text, str) or not query_text.strip():
            print("No query provided")
            return jsonify({'error': 'No query provided'}), 400

        # Use the query engine to get results
        print("Querying the index")
        results = llama_helper.query_engine.query(query_text)
        print(f"Type of query results: {type(results)}")

        # Drain the streaming generator so the full answer can be returned
        # in a single JSON response.
        response_data = list(results.response_gen)

        print(f"Response data: {response_data}")

        return jsonify({'results': response_data})

    except Exception as e:
        # Top-level boundary for the endpoint: log the traceback and report
        # the failure to the client instead of letting Flask render its own
        # error page.
        print(f"An error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500

@app.route('/reconstruct_index', methods=['POST'])
def reconstruct_index():
    """Rebuild the index via the module-level helper.

    Returns 200 with a confirmation message on success, or 500 with the
    exception text when the rebuild raises.
    """
    try:
        llama_helper.reconstruct_index()
        payload, status = {'message': 'Index successfully reconstructed.'}, 200
    except Exception as exc:
        payload, status = {'error': str(exc)}, 500
    return jsonify(payload), status

def unload_llm_model():
    """Drop the module-level ``llm_model`` reference and free cached memory.

    Safe to call when no ``llm_model`` global exists (nothing is removed in
    that case) and on machines without CUDA (the cache flush is skipped).
    """
    print("Unloading LLM model...")
    # pop() rather than `del`: a global named llm_model may never have been
    # assigned, and `del` on a missing global raises NameError.
    globals().pop("llm_model", None)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("LLM model unloaded.")

# Module-level helper used by the route handlers; constructing it loads or
# builds the index straight away.
llama_helper = LlamaIndexHelper(
    directory_path='Knowledge Base',
    embedding_model=Settings.embed_model,
)

# Start the Flask server only when this file is executed as the main module.
if __name__ == '__main__':
    app.run(port=5002)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
[33m * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.[0m


 * Serving Flask app '__main__'
 * Debug mode: off


Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


AttributeError: 'tuple' object has no attribute 'tb_frame'