In [21]:
pip install openai==0.28

Collecting openai==0.28
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.28.0-py3-none-any.whl (76 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.47.0
    Uninstalling openai-1.47.0:
      Successfully uninstalled openai-1.47.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.1.25 requires openai<2.0.0,>=1.40.0, but you have openai 0.28.0 which is incompatible.
ragas 0.1.20 requires openai>1, but you have openai 0.28.0 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
from neo4j import GraphDatabase
import json
from openai import OpenAI
import numpy as np
from typing import List
import os
import openai

# Neo4j connection details. Values are taken from the environment when set;
# the fallbacks are the local development values this notebook was written
# with, so existing setups keep working.
URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
PASSWORD = os.getenv("NEO4J_PASSWORD", "12345678")

# Read the API key from OPENAI_API_KEY instead of embedding it in the notebook
# (an empty literal key cannot authenticate and a real one must not be
# committed).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def embed_text(text: str) -> List[float]:
    """Return the `text-embedding-3-small` embedding vector for `text`.

    A single-input request is sent through the module-level `client`; the
    embedding of the first item in the response is returned.
    """
    embedding_response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    first_item = embedding_response.data[0]
    return first_item.embedding


def add_data_to_neo4j(tx, pre_text, post_text, table, id, pre_text_embedding, post_text_embedding):
    """Create the graph nodes and relationships for one document.

    Runs a single Cypher statement on `tx` that creates a `Document` node plus
    `PreText`, `PostText`, `Table` and `ID` nodes, each linked from the
    document. `table` is stored as its JSON serialisation; both embeddings are
    stored as properties on the corresponding text nodes.
    """
    cypher = """
    CREATE (d:Document {id: $id})
    CREATE (p:PreText {content: $pre_text, embedding: $pre_text_embedding})
    CREATE (po:PostText {content: $post_text, embedding: $post_text_embedding})
    CREATE (t:Table {content: $table})
    CREATE (i:ID {value: $id})
    CREATE (d)-[:HAS_PRE_TEXT]->(p)
    CREATE (d)-[:HAS_POST_TEXT]->(po)
    CREATE (d)-[:HAS_TABLE]->(t)
    CREATE (d)-[:HAS_ID]->(i)
    """
    # Query parameters, passed as keyword arguments to tx.run.
    parameters = {
        "id": id,
        "pre_text": pre_text,
        "post_text": post_text,
        "table": json.dumps(table),
        "pre_text_embedding": pre_text_embedding,
        "post_text_embedding": post_text_embedding,
    }
    tx.run(cypher, **parameters)

def process_json_file(file_path):
    """Embed every record of a JSON file and store it in Neo4j.

    The file must hold a JSON array of objects with the keys `pre_text` and
    `post_text` (sequences of strings, joined with spaces), `table` and `id`.
    For each record the joined pre- and post-text are embedded with
    `embed_text` and written in one write transaction via
    `add_data_to_neo4j`; a progress line is printed per document.
    """
    # Parse the file up front so the handle is closed before the slow network
    # work starts, and read it as UTF-8 rather than the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data_list = json.load(file)

    with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD)) as driver:
        with driver.session() as session:
            for data in data_list:
                pre_text = " ".join(data['pre_text'])
                post_text = " ".join(data['post_text'])
                doc_id = data['id']

                # embed_text already returns a plain list of floats, which can
                # be stored as a Neo4j property without any conversion.
                pre_text_embedding = embed_text(pre_text)
                post_text_embedding = embed_text(post_text)

                session.execute_write(add_data_to_neo4j,
                                      pre_text,
                                      post_text,
                                      data['table'],
                                      doc_id,
                                      pre_text_embedding,
                                      post_text_embedding)
                print(f"Added document with ID: {doc_id}")

# Usage: ingest the records of train.json. Every record triggers two embedding
# requests, and add_data_to_neo4j uses CREATE, so re-running this cell inserts
# the same documents again.
process_json_file('convfinqa_rag/data/train.json')

Added document with ID: Single_JKHY/2009/page_28.pdf-3
Added document with ID: Single_RSG/2008/page_114.pdf-2
Added document with ID: Single_AAPL/2002/page_23.pdf-1
Added document with ID: Single_UPS/2009/page_33.pdf-2
Added document with ID: Double_UPS/2009/page_33.pdf
Added document with ID: Single_CE/2010/page_134.pdf-2
Added document with ID: Single_JPM/2013/page_104.pdf-2
Added document with ID: Double_MAS/2012/page_92.pdf
Added document with ID: Single_HIG/2004/page_122.pdf-2
Added document with ID: Single_SLG/2013/page_133.pdf-4
Added document with ID: Double_AES/2016/page_98.pdf
Added document with ID: Single_ETR/2008/page_336.pdf-3
Added document with ID: Single_BKR/2017/page_105.pdf-2
Added document with ID: Double_AES/2011/page_230.pdf
Added document with ID: Single_JPM/2016/page_73.pdf-4
Added document with ID: Double_ADBE/2018/page_66.pdf
Added document with ID: Double_PNC/2014/page_99.pdf
Added document with ID: Single_BLK/2017/page_35.pdf-4
Added document with ID: Single

In [4]:
import os
import openai
import numpy as np
import faiss
import hashlib
import pickle
import logging
from datetime import datetime
from neo4j import GraphDatabase
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_random_exponential
from concurrent.futures import ThreadPoolExecutor, as_completed

# Pull settings from a .env file into the process environment, then configure
# INFO-level logging with a timestamped format for this module's logger.
load_dotenv()
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class Neo4jHandler:
    """Small wrapper around a Neo4j driver for reading the stored documents.

    Can be used as a context manager so the driver is closed even when the
    body raises; calling `close()` explicitly still works.
    """

    def __init__(self, uri, user, password):
        """Open a driver for `uri` authenticated as `user` / `password`."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def get_all_documents(self):
        """Return every document as a list of tuples.

        Each tuple is `(id, pre_text, post_text, emb_pre, emb_post, tables)`.
        """
        with self.driver.session() as session:
            # execute_read replaces the deprecated read_transaction and matches
            # the execute_write call used when the data was ingested.
            return session.execute_read(self._get_all_documents)

    @staticmethod
    def _get_all_documents(tx):
        """Transaction function: fetch documents with their texts and tables.

        Documents need both a PreText and a PostText node to match; tables are
        optional and collected into a list per result row.
        """
        query = """
        MATCH (d:Document)
        MATCH (d)-[:HAS_PRE_TEXT]->(p:PreText)
        MATCH (d)-[:HAS_POST_TEXT]->(po:PostText)
        OPTIONAL MATCH (d)-[:HAS_TABLE]->(t:Table)
        RETURN d.id AS id, p.content AS pre_text, po.content AS post_text, 
               p.embedding AS emb_pre, po.embedding AS emb_post, 
               collect(t.content) AS tables
        """
        result = tx.run(query)
        # Materialise the records here, while the transaction is still open.
        return [(record["id"], record["pre_text"], record["post_text"],
                 record["emb_pre"], record["emb_post"], record["tables"])
                for record in result]

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5))
def get_embedding_with_retry(text, model="text-embedding-3-small"):
    """Return the embedding vector of `text`, retrying on failure.

    Any exception is logged and re-raised so that tenacity retries the call,
    for up to 5 attempts with randomised exponential waits capped at 60 s.

    Works with both SDK generations: openai>=1.0 exposes the module-level
    `openai.embeddings` client, while the legacy 0.x SDK only has
    `openai.Embedding`. Both paths use the key set in `openai.api_key`.
    """
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: openai.Embedding was removed; responses are objects.
            response = openai.embeddings.create(input=[text], model=model)
            return response.data[0].embedding
        # Legacy openai<1.0 API: responses are dict-like.
        response = openai.Embedding.create(input=[text], model=model)
        return response['data'][0]['embedding']
    except Exception as e:
        logger.error(f"Error getting embedding: {e}")
        raise

def get_embedding_with_cache(text, model="text-embedding-3-small"):
    """Return the embedding of `text`, using an on-disk cache.

    Cache entries live in `embedding_cache/<md5>.pkl`. The hash covers both
    the model name and the text, so switching models can never return a
    vector produced by a different model. A missing or unreadable (e.g.
    truncated) entry is treated as a cache miss: the embedding is fetched with
    `get_embedding_with_retry` and the entry is (re)written.
    """
    cache_key = hashlib.md5(f"{model}\n{text}".encode()).hexdigest()
    cache_file = f"embedding_cache/{cache_key}.pkl"

    try:
        with open(cache_file, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only use a cache
            # directory that this notebook itself has written.
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        pass  # cache miss or corrupt entry: fall through and recompute

    embedding = get_embedding_with_retry(text, model)

    os.makedirs("embedding_cache", exist_ok=True)
    with open(cache_file, "wb") as f:
        pickle.dump(embedding, f)

    return embedding

def normalize_embeddings(embeddings):
    """Scale every row of a 2-D array to unit L2 norm.

    Rows whose norm is zero are returned unchanged (all zeros) instead of
    being divided by zero, which would fill them with NaN.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Dividing a zero row by 1 keeps it at zero; in-place assignment keeps the
    # dtype of `norms`, so float32 input still yields float32 output.
    norms[norms == 0] = 1
    return embeddings / norms

def process_documents_in_batches(documents, batch_size=1000):
    """Yield consecutive slices of `documents`, each at most `batch_size` long."""
    batch_starts = range(0, len(documents), batch_size)
    yield from (documents[start:start + batch_size] for start in batch_starts)

def index_embeddings_batched(documents):
    """Build two FAISS inner-product indexes from the document embeddings.

    `documents` is a sequence of tuples whose element 3 is the pre-text
    embedding and element 4 the post-text embedding. Embeddings are converted
    to float32 and L2-normalised batch by batch, so the inner product computed
    by `IndexFlatIP` equals cosine similarity.

    Returns `(index_pre, index_post)`; row `i` of each index corresponds to
    `documents[i]`.

    Raises:
        ValueError: if `documents` is empty.
    """
    if len(documents) == 0:
        # Fail with a clear message instead of numpy's "need at least one
        # array to concatenate".
        raise ValueError("Cannot build FAISS indexes: no documents were provided")

    pre_chunks = []
    post_chunks = []
    for batch in process_documents_in_batches(documents):
        pre_batch = np.array([doc[3] for doc in batch]).astype('float32')
        post_batch = np.array([doc[4] for doc in batch]).astype('float32')
        pre_chunks.append(normalize_embeddings(pre_batch))
        post_chunks.append(normalize_embeddings(post_batch))

    emb_pre_all = np.concatenate(pre_chunks)
    emb_post_all = np.concatenate(post_chunks)

    # The vector dimensionality is taken from the data itself.
    index_pre = faiss.IndexFlatIP(emb_pre_all.shape[1])
    index_post = faiss.IndexFlatIP(emb_post_all.shape[1])
    index_pre.add(emb_pre_all)
    index_post.add(emb_post_all)
    return index_pre, index_post

def find_similar_documents(query, index_pre, index_post, documents, top_k=20):
    """Return the documents most similar to `query`, best first.

    The query is embedded and normalised, then searched against both indexes.
    A document's score is the sum of its pre-text and post-text similarities
    (a side that is not among the retrieved neighbours contributes nothing).

    Returns up to `top_k` tuples `(id, pre_text, post_text, tables, score)`.
    """
    # FAISS pads the result with label -1 when asked for more neighbours than
    # the index holds; documents[-1] would then silently return the last
    # document. Never ask for more than is stored.
    k = min(top_k, index_pre.ntotal, index_post.ntotal)
    if k <= 0:
        return []

    query_embedding = np.array(get_embedding_with_cache(query)).astype('float32').reshape(1, -1)
    query_embedding = normalize_embeddings(query_embedding)
    distances_pre, indices_pre = index_pre.search(query_embedding, k)
    distances_post, indices_post = index_post.search(query_embedding, k)

    combined_results = {}
    for rank in range(k):
        for indices, distances in ((indices_pre, distances_pre), (indices_post, distances_post)):
            doc_index = int(indices[0][rank])
            if doc_index < 0:
                continue  # padding entry, not a real document
            combined_results[doc_index] = combined_results.get(doc_index, 0) + distances[0][rank]

    sorted_results = sorted(combined_results.items(), key=lambda item: item[1], reverse=True)[:top_k]
    return [(documents[i][0], documents[i][1], documents[i][2], documents[i][5], score)
            for i, score in sorted_results]

def re_rank_documents_parallel(question, documents, max_workers=5, top_n=10):
    """Re-rank candidate documents by LLM-judged relevance to `question`.

    Each document `(id, pre_text, post_text, tables, score)` is scored from
    0 to 10 by gpt-4o-mini, using up to `max_workers` concurrent requests.

    Returns the `top_n` highest-scoring documents as tuples
    `(id, pre_text, post_text, tables, relevance_score)`. Documents with equal
    relevance keep their incoming order, so the result is deterministic.
    """
    import re  # stdlib; only needed for the score-parsing fallback

    llm = ChatOpenAI(api_key=os.getenv('OPENAI_API_KEY'), model="gpt-4o-mini", temperature=0)
    prompt_template = PromptTemplate(
        input_variables=["question", "document"],
        template="Question: {question}\n\nDocument: {document}\n\nOn a scale of 0 to 10, how relevant is this document to the question? Provide only a numerical score:"
    )
    rank_chain = LLMChain(llm=llm, prompt=prompt_template)

    def parse_score(raw_reply):
        """Convert the model reply to a float; 0.0 if it contains no number."""
        reply = raw_reply.strip()
        try:
            return float(reply)
        except ValueError:
            # Replies such as "8/10" or "Score: 8" still carry a usable number.
            match = re.search(r"\d+(?:\.\d+)?", reply)
            if match:
                return float(match.group())
            logger.warning(f"Could not parse relevance score from reply: {reply!r}")
            return 0.0

    def process_document(doc):
        doc_id, pre_text, post_text, tables, score = doc
        document_text = f"Pre Text: {pre_text}\nPost Text: {post_text}\nTables: {tables}"
        reply = rank_chain.run({"question": question, "document": document_text})
        return (doc_id, pre_text, post_text, tables, parse_score(reply))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() returns results in input order (unlike as_completed), so the
        # stable sort below breaks ties by the original similarity ranking.
        ranked_docs = list(executor.map(process_document, documents))

    ranked_docs.sort(key=lambda x: x[4], reverse=True)
    return ranked_docs[:top_n]

def prepare_context(top_docs, max_tokens=10000):
    """Concatenate documents into one context string within a size budget.

    Documents are appended in order. Size is measured as the number of
    whitespace-separated words; processing stops at the first document that
    would push the running total above `max_tokens`.
    """
    sections = []
    used_tokens = 0
    for doc_id, pre_text, post_text, tables, _score in top_docs:
        section = (
            f"Document ID: {doc_id}\n"
            f"Pre Text: {pre_text}\n"
            f"Post Text: {post_text}\n"
            f"Tables: {tables}\n\n"
        )
        section_tokens = len(section.split())
        if used_tokens + section_tokens > max_tokens:
            break
        sections.append(section)
        used_tokens += section_tokens
    return "".join(sections)



def answer_question_with_context(question, context):
    """Ask gpt-4o-mini to answer `question` from `context`.

    The prompt requests a float with two-decimal precision as the formatted
    answer; the raw model reply is returned as-is.
    """
    # Same prompt text as before, written line by line so that whitespace
    # (including the trailing space after "precision") is explicit.
    template_text = (
        "\n"
        "Context: {context}\n"
        "\n"
        "Given the context, {question} Report your answer using the following format:\n"
        "\n"
        "Formatted answer: Float number to two decimal point precision \n"
        "\n"
        "Answer:\n"
    )
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=template_text,
    )
    chat_model = ChatOpenAI(api_key=os.getenv('OPENAI_API_KEY'), model="gpt-4o-mini", temperature=0)
    qa_chain = LLMChain(llm=chat_model, prompt=qa_prompt)
    return qa_chain.run({"context": context, "question": question})

def qa_system(question):
    """Answer `question` with retrieval-augmented generation.

    Steps: load all documents from Neo4j, index their stored embeddings with
    FAISS, retrieve the most similar documents, re-rank them with the LLM,
    build a context string and ask the LLM for the final answer.

    Neo4j connection settings are read from NEO4J_URI, NEO4J_USERNAME and
    NEO4J_PASSWORD, falling back to the local development defaults.

    Returns:
        `(context, answer)`: the context passed to the model and its reply.
    """
    logger.info(f"Processing question: {question}")

    openai.api_key = os.getenv('OPENAI_API_KEY')

    neo4j_handler = Neo4jHandler(
        os.getenv("NEO4J_URI", "bolt://localhost:7687"),
        os.getenv("NEO4J_USERNAME", "neo4j"),
        os.getenv("NEO4J_PASSWORD", "12345678"),
    )
    try:
        documents = neo4j_handler.get_all_documents()
    finally:
        # The documents are fully loaded into memory, so the connection can be
        # released right away -- and is released even if the query fails.
        neo4j_handler.close()
    logger.info(f"Retrieved {len(documents)} documents from Neo4j")

    index_pre, index_post = index_embeddings_batched(documents)
    logger.info("Indexed embeddings using FAISS")

    similar_docs = find_similar_documents(question, index_pre, index_post, documents)
    logger.info(f"Found {len(similar_docs)} similar documents")

    top_docs = re_rank_documents_parallel(question, similar_docs)
    # Report the real count; the message used to say "top 5" regardless.
    logger.info(f"Re-ranked top {len(top_docs)} documents: {[doc[0] for doc in top_docs]}")

    context = prepare_context(top_docs)
    logger.info(f"Prepared context with {len(context.split())} words")

    answer = answer_question_with_context(question, context)
    logger.info("Generated answer using GPT-4o-mini")

    return context, answer

if __name__ == "__main__":
    # Demo: run one sample question through the pipeline, then print the
    # question followed by the model's answer.
    question = "What was the percentage change in the net cash from operating activities from 2008 to 2009?"
    context, answer = qa_system(question)
    print(f"Question: {question}\n", answer, sep="\n")

2024-09-23 12:01:59,115 - INFO - Processing question: What was the percentage change in the net cash from operating activities from 2008 to 2009?
  result = session.read_transaction(self._get_all_documents)
2024-09-23 12:02:03,857 - INFO - Retrieved 3037 documents from Neo4j
2024-09-23 12:02:04,161 - INFO - Indexed embeddings using FAISS
2024-09-23 12:02:04,165 - INFO - Found 20 similar documents
2024-09-23 12:02:06,748 - INFO - Re-ranked top 5 documents: ['Single_UNP/2009/page_38.pdf-3', 'Single_UNP/2009/page_38.pdf-4', 'Single_UNP/2009/page_38.pdf-2', 'Single_SWKS/2010/page_105.pdf-3', 'Single_SWKS/2010/page_105.pdf-1', 'Single_SWKS/2010/page_105.pdf-2', 'Double_UNP/2011/page_35.pdf', 'Single_GIS/2018/page_39.pdf-1', 'Double_GIS/2018/page_39.pdf', 'Single_GIS/2018/page_39.pdf-2']
2024-09-23 12:02:06,751 - INFO - Prepared context with 5211 words
2024-09-23 12:02:11,037 - INFO - Generated answer using GPT-4o-mini


Question: What was the percentage change in the net cash from operating activities from 2008 to 2009?

To calculate the percentage change in net cash from operating activities from 2008 to 2009, we can use the formula:

\[
\text{Percentage Change} = \left( \frac{\text{New Value} - \text{Old Value}}{\text{Old Value}} \right) \times 100
\]

From the provided data:
- Cash provided by operating activities in 2008: $4,070 million
- Cash provided by operating activities in 2009: $3,234 million

Now, substituting the values into the formula:

\[
\text{Percentage Change} = \left( \frac{3,234 - 4,070}{4,070} \right) \times 100
\]

Calculating the difference:

\[
3,234 - 4,070 = -836
\]

Now, substituting back into the formula:

\[
\text{Percentage Change} = \left( \frac{-836}{4,070} \right) \times 100 \approx -20.54
\]

Thus, the percentage change in net cash from operating activities from 2008 to 2009 is approximately -20.54%.

Formatted answer: -20.54
