In [66]:
!pip install llama-parse llama-index-vector-stores-qdrant qdrant_client python-dotenv python-dotenv llama-index-llms-groq



In [67]:
pip install llama-index-embeddings-nomic gradio



In [68]:
import os
import nest_asyncio
nest_asyncio.apply()

from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
import qdrant_client

from dotenv import load_dotenv

load_dotenv()

from llama_parse import LlamaParse

# Read the LlamaParse key from the environment (load_dotenv() above pulls in any .env file)
# instead of hard-coding it in the notebook. Falls back to an empty string when unset.
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY", "")

import pickle

In [70]:
import os
import pathlib
import pandas as pd
import pickle

# Lower-case file extensions that are handed to LlamaParse.
_LLAMAPARSE_EXTENSIONS = frozenset({
    '.pdf', '.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf', '.wps',
    '.wpd', '.sxw', '.stw', '.sxg', '.pages', '.mw', '.mcw', '.uot', '.uof',
    '.uos', '.uop', '.ppt', '.pptx', '.pot', '.pptm', '.potx', '.potm', '.key',
    '.odp', '.odg', '.otp', '.fopd', '.sxi', '.sti', '.epub', '.html', '.htm',
})


def load_or_parse_data():
    """Return the parsed documents for ./data, re-parsing only when needed.

    The parse result is cached in ``./data/parsed_data.pkl``. The cache is
    considered stale when at least one *supported* document in ``./data`` has a
    modification time newer than the cache file (or when no cache exists yet).
    Files with other extensions never invalidate the cache, so a stray file in
    ``./data`` cannot make this function discard a valid cache.

    Returns:
        The object returned by ``LlamaParse(...).load_data(...)`` (or the
        unpickled cache of it), or ``[]`` when there is neither a cache nor any
        supported document.
    """
    data_dir = "./data"
    data_file = "./data/parsed_data.pkl"

    # mtime of the cache; 0 when there is no cache so every document counts as newer.
    if os.path.exists(data_file):
        modification_time_of_parsed_data = os.stat(data_file).st_mtime
    else:
        modification_time_of_parsed_data = 0

    # Collect the documents LlamaParse can handle (extension match is case-insensitive).
    docs_to_be_parsed_by_llamaparser = []
    if os.path.isdir(data_dir):
        for file in sorted(os.listdir(data_dir)):
            if pathlib.Path(file).suffix.lower() in _LLAMAPARSE_EXTENSIONS:
                docs_to_be_parsed_by_llamaparser.append(f'./data/{file}')

    # Re-parse only if a supported document changed after the cache was written.
    parse_data = any(
        os.stat(doc).st_mtime > modification_time_of_parsed_data
        for doc in docs_to_be_parsed_by_llamaparser
    )

    if not parse_data:
        print("Loading parsed data")
        if os.path.exists(data_file):
            # NOTE: pickle.load can execute arbitrary code from the file; only load
            # a cache that this notebook itself wrote.
            with open(data_file, "rb") as f:
                parsed_data = pickle.load(f)
        else:
            parsed_data = []
    else:
        print("Parsing data")

        # parse_data is only True when there is at least one supported document.
        llama_parse_documents = LlamaParse(
            api_key=llamaparse_api_key, result_type="markdown"
        ).load_data(docs_to_be_parsed_by_llamaparser)
        print(f"Parsed {len(llama_parse_documents)} documents")

        with open(data_file, "wb") as f:
            pickle.dump(llama_parse_documents, f)

        parsed_data = llama_parse_documents

    if parsed_data:
        print(f"Loaded {len(parsed_data)} documents")
    else:
        print("No documents loaded or parsed.")

    return parsed_data


In [71]:
# Load the cached parse results, or re-parse the files in ./data when the cache is stale.
llama_parse_documents = load_or_parse_data()
print(len(llama_parse_documents))

Loading parsed data
Loaded 1 documents
1


In [72]:
from dotenv import load_dotenv



# Qdrant connection settings come from the environment / .env file instead of being
# hard-coded in the notebook. Both fall back to an empty string when unset.
qdrant_url = os.getenv("QDRANT_URL", "")
qdrant_api_key = os.getenv("QDRANT_API_KEY", "")


[output redacted: this cell printed the Qdrant cluster URL and API key — rotate the key if this notebook was shared]


In [73]:
from llama_index.embeddings.nomic import NomicEmbedding


# The Nomic API key is read from the environment (or a .env file loaded earlier) instead
# of being hard-coded. It falls back to an empty string when NOMIC_API_KEY is unset.
embed_model = NomicEmbedding(
    api_key=os.getenv("NOMIC_API_KEY", ""),
    # Embedding size; the Qdrant collection in this notebook is created with size 128 too.
    dimensionality=128,
    model_name="nomic-embed-text-v1.5",
)


None


In [74]:
from llama_index.core import Settings
# Store the Nomic embedding model on llama_index's global Settings object.
Settings.embed_model = embed_model


In [75]:
from llama_index.llms.groq import Groq
# Read the Groq key from the environment (or a .env file loaded earlier) instead of
# hard-coding it. Falls back to an empty string when GROQ_API_KEY is unset.
groq_api_key = os.getenv("GROQ_API_KEY", "")

llm = Groq(model="gemma2-9b-it", api_key=groq_api_key, stream=True)

In [76]:
# Store the Groq LLM on llama_index's global Settings object.
Settings.llm = llm



# Qdrant client built from the qdrant_url / qdrant_api_key values set in an earlier cell.
client = qdrant_client.QdrantClient(api_key=qdrant_api_key, url=qdrant_url)

[output redacted: Qdrant API key]
[output redacted: Qdrant cluster URL]
<qdrant_client.qdrant_client.QdrantClient object at 0x783a45651f10>
collections=[CollectionDescription(name='rag_documents_collection')]


In [77]:
import uuid
# Name of the Qdrant collection that backs the vector index.
collection_name = "rag_documents_collection"
try:
    collections = client.get_collections()
    collection_exists = any(col.name == collection_name for col in collections.collections)

    if not collection_exists:
        # Create the collection if it doesn't exist.
        # size=128 matches the `dimensionality=128` passed to the embedding model.
        client.create_collection(
            collection_name=collection_name,
            vectors_config={"size": 128, "distance": "Cosine"}
        )
        print(f"Created new collection: {collection_name}")
    else:
        print(f"Collection already exists: {collection_name}")

    vector_store = QdrantVectorStore(client=client, collection_name=collection_name)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # NOTE(review): from_documents is called even when the collection already exists, so
    # each run presumably embeds and inserts the documents again — confirm this is intended.
    index = VectorStoreIndex.from_documents(documents=llama_parse_documents, storage_context=storage_context)
except Exception as e:
    print(e)
    # Re-raise: later cells need `index`, and swallowing the error here would only
    # resurface as an unrelated NameError when `index` is first used.
    raise

Collection already exists: rag_documents_collection


In [78]:
# Build a query engine on top of the vector index created in the previous cell.
query_engine = index.as_query_engine()
print(query_engine)

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x783a45b8c610>


In [79]:
! pip install gradio



In [None]:
import os
import gradio as gr
import shutil
from dotenv import load_dotenv



def move_file(file_path, destination_folder):
    """Move a file into ``destination_folder`` and return its new path.

    Args:
        file_path: Path of the file to move.
        destination_folder: Directory to move the file into; created (including
            missing parents) when it does not exist yet.

    Returns:
        ``destination_folder`` joined with the base name of ``file_path``.
    """
    # exist_ok=True creates the folder only when needed, without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(destination_folder, exist_ok=True)

    print(f"File path: {file_path}")
    print(f"Destination: {destination_folder}")

    # Keep the original file name inside the destination folder.
    destination_path = os.path.join(destination_folder, os.path.basename(file_path))
    print(f"Destination path: {destination_path}")

    shutil.move(file_path, destination_path)
    print(f"Moved {file_path} to {destination_path}")
    return destination_path

def predict(message, history):
    """Gradio chat handler: store uploaded files, then answer the text query.

    Args:
        message: Dict-like chat message; ``message["text"]`` is the question and
            the optional ``message["files"]`` is a list of uploaded files.
        history: Chat history passed in by Gradio; not used here.

    Returns:
        The query engine's response converted to ``str``.
    """
    print("Message received:")
    print(message)

    # Move any uploaded files into ./data. This function only moves them; it does
    # not re-parse or re-index, so they do not affect the answer to this query.
    if message.get("files"):
        print(f"Files found: {len(message['files'])}")
        for i, file in enumerate(message["files"]):
            print(f"File {i}: {file}")
            print(f"File type: {type(file)}")

            try:
                # Uploaded files may arrive as a dict with a 'path' key, a plain
                # path string, or an object exposing a `.name` attribute.
                if isinstance(file, dict) and 'path' in file:
                    move_file(file['path'], './data/')
                elif isinstance(file, str):
                    move_file(file, './data/')
                else:
                    print(f"Unexpected file format: {type(file)}")
                    if hasattr(file, 'name'):
                        move_file(file.name, './data/')
            except Exception as e:
                # Best effort: a failed upload must not prevent answering the query.
                print(f"Error processing file: {e}")

    # Query the engine and return the response.
    response = query_engine.query(message['text'])
    print(response, type(response))
    try:
        formatted_sources = response.get_formatted_sources()
        print(formatted_sources)
        print(formatted_sources.split("\n"))
        print(response.get_text())
    except Exception:
        # Debug output only; fall back to printing the response itself.
        print(response)
    return str(response)

# Create the Gradio interface
# Chat UI that routes every submitted message to predict().
demo = gr.ChatInterface(
    fn=predict,
    title="AI Expert System",
    type="messages",
    # Messages are dicts with "text" and "files" keys, matching the examples below.
    multimodal=True,
    examples=[
        {"text": "Explain the abstract of the paper", "files": []},
        {"text": "What is the dataset", "files": []},
    ],
)

# Launch the interface only when this code is executed directly.
if __name__ == "__main__":
    demo.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c213a3a24d2287689b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Message received:
{'text': 'Explain the abstract of the paper', 'files': []}
Please provide the abstract of the paper you would like me to explain. 
 <class 'llama_index.core.base.response.schema.Response'>
> Source (Doc id: 52803bf3-b5ad-4a91-a526-f05e38d768f9): # Satwik Panda

[address redacted]

[phone redacted]   |   [email redacted]...
['> Source (Doc id: 52803bf3-b5ad-4a91-a526-f05e38d768f9): # Satwik Panda', '', '[address redacted]', '', '[phone redacted]   |   [email redacted]...']
Please provide the abstract of the paper you would like me to explain. 

Message received:
{'text': 'Where did satwik do his engineering', 'files': []}
Velagapudi Ramakrishna Siddhartha Engineering College  
 <class 'llama_index.core.base.response.schema.Response'>
> Source (Doc id: 52803bf3-b5ad-4a91-a526-f05e38d768f9): # Satwik Panda

[address redacted]

[phone redacted]   |   [email redacted]...
['> Source (Doc id: 52803bf3-b5ad-4a9