In [None]:
!pip install PyPDF2 pandas langchain openai pinecone-client

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [

In [None]:
import PyPDF2
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF file.

    Pages are joined with a newline so the last line of one page is never
    fused with the first line of the next; callers that split the result on
    newlines would otherwise get a corrupted line at each page break.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Text of all pages in page order, separated by newlines. Empty
        string when the document has no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction has to happen while the file is still open.
        # `or ''` keeps the join safe if a page yields no text at all.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return "\n".join(page_texts)

def parse_pnl_table(text):
    """
    Parse P&L rows out of text extracted from a PDF.

    A line is kept when it contains "revenue", "expenses" or "profit"
    (case-insensitive). The line is split on whitespace and its first six
    tokens become the Metric, Q1, Q2, Q3, Q4 and Year cells, in that order.

    Args:
        text: Newline-separated text to scan.

    Returns:
        pandas.DataFrame: One row per matching line with the columns
        Metric, Q1, Q2, Q3, Q4, Year. Tokens are kept as strings; cells for
        which the line had no token are missing values. The frame is empty
        when no line matches.
    """
    columns = ["Metric", "Q1", "Q2", "Q3", "Q4", "Year"]
    keywords = ("revenue", "expenses", "profit")

    pnl_data = []
    for line in text.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in keywords):
            # Only extract the first 6 elements to match the expected columns
            cells = line.split()[:len(columns)]
            # Pad short lines. Without this pandas raises ValueError whenever
            # every matching line has fewer tokens than there are columns.
            cells += [None] * (len(columns) - len(cells))
            pnl_data.append(cells)

    # Convert to DataFrame
    return pd.DataFrame(pnl_data, columns=columns)

# Pick a PDF in the Colab upload dialog, then extract and parse it
from google.colab import files

uploaded = files.upload()  # Upload your PDF
# The first key of the returned mapping is used as the path of the PDF.
pdf_path = [*uploaded.keys()][0]

text = extract_text_from_pdf(pdf_path)
pnl_table = parse_pnl_table(text)
print(pnl_table)


In [None]:
pip install faiss-cpu

In [None]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and fast embedding model

# Stand-alone example data for the demo query below. It is kept under its own
# name so the `pnl_table` parsed from the uploaded PDF is not overwritten;
# otherwise the following cells would silently index this example instead of
# the real document when the notebook is run top to bottom.
data = {
    "Metric": ["Revenue", "Expenses", "Profit"],
    "Q1": [100, 50, 50],
    "Q2": [200, 100, 100],
    "Q3": [300, 150, 150],
    "Q4": [400, 200, 200],
    "Year": [1000, 500, 500],
}
example_pnl_table = pd.DataFrame(data)

# Prepare data for FAISS: one descriptive sentence per table row
texts = [
    f"{row['Metric']} Q1: {row['Q1']}, Q2: {row['Q2']}, Q3: {row['Q3']}, Q4: {row['Q4']}, Year: {row['Year']}"
    for _, row in example_pnl_table.iterrows()
]
vectors = np.array([model.encode(text) for text in texts]).astype("float32")

# Create FAISS index
dimension = vectors.shape[1]  # Dimensionality of embeddings
index = faiss.IndexFlatL2(dimension)  # L2 similarity (Euclidean distance)
index.add(vectors)

# Query FAISS
query = "What is the revenue for Q3?"
query_vector = np.array(model.encode(query)).astype("float32").reshape(1, -1)
distances, indices = index.search(query_vector, k=1)  # Retrieve top 1 match

# Display result
print("Closest match:", texts[indices[0][0]])
print("Distance:", distances[0][0])


In [None]:
# Prepare data for FAISS from parsed P&L table: one sentence per table row
texts = [
    f"{row['Metric']} Q1: {row['Q1']}, Q2: {row['Q2']}, Q3: {row['Q3']}, Q4: {row['Q4']}, Year: {row['Year']}"
    for _, row in pnl_table.iterrows()
]
if not texts:
    # An empty table would otherwise fail below with an opaque IndexError on
    # `vectors.shape[1]`; say what actually went wrong instead.
    raise ValueError("pnl_table has no rows, so there is nothing to index.")
vectors = np.array([model.encode(text) for text in texts]).astype("float32")

# Create FAISS index
dimension = vectors.shape[1]  # Dimensionality of embeddings
index = faiss.IndexFlatL2(dimension)  # L2 similarity (Euclidean distance)
index.add(vectors)

print("FAISS index created with embeddings from the uploaded PDF.")


In [None]:
# Interactive search: read queries until the user types 'exit'.
while True:
    # Strip surrounding whitespace so inputs such as "exit " still quit.
    query = input("Enter your financial query (or type 'exit' to quit): ").strip()
    if query.lower() == "exit":
        break
    if not query:
        # Nothing to search for; ask again instead of embedding an empty string.
        continue

    # Encode the query and search FAISS
    query_vector = np.array(model.encode(query)).astype("float32").reshape(1, -1)
    distances, indices = index.search(query_vector, k=1)

    # Display result
    print("\nClosest match:", texts[indices[0][0]])
    # The index is an IndexFlatL2, so this value is a distance (smaller means
    # closer), not a similarity score.
    print("Distance:", distances[0][0])
    print("-" * 50)


In [None]:
!pip install streamlit
import streamlit as st
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader

# Initialize FAISS and Sentence Transformer
model = SentenceTransformer('all-MiniLM-L6-v2')
# Ask the model for its embedding width instead of hard-coding it, so the
# index stays consistent if the model name above is ever changed.
dimension = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

# Functions
def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page of a PDF.

    Pages are joined with a newline so the last line of one page is never
    fused with the first line of the next; callers that split the result on
    newlines would otherwise get a corrupted line at each page break.

    Args:
        pdf_file: Object handed straight to PdfReader (the Streamlit script
            passes the uploaded file).

    Returns:
        str: Text of all pages in page order, separated by newlines. Empty
        string when the document has no pages.
    """
    reader = PdfReader(pdf_file)
    # `or ''` keeps the join safe if a page yields no text at all.
    return "\n".join(page.extract_text() or '' for page in reader.pages)

def parse_pnl_table(text):
    """
    Parse P&L rows out of text extracted from a PDF.

    A line is kept when it contains "revenue", "expenses" or "profit"
    (case-insensitive). The line is split on whitespace and its first six
    tokens become the Metric, Q1, Q2, Q3, Q4 and Year cells, in that order.

    Args:
        text: Newline-separated text to scan.

    Returns:
        pandas.DataFrame: One row per matching line with the columns
        Metric, Q1, Q2, Q3, Q4, Year. Tokens are kept as strings; cells for
        which the line had no token are missing values. The frame is empty
        when no line matches.
    """
    columns = ["Metric", "Q1", "Q2", "Q3", "Q4", "Year"]
    keywords = ("revenue", "expenses", "profit")

    pnl_data = []
    for line in text.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in keywords):
            cells = line.split()[:len(columns)]
            # Pad short lines. Without this pandas raises ValueError whenever
            # every matching line has fewer tokens than there are columns.
            cells += [None] * (len(columns) - len(cells))
            pnl_data.append(cells)
    return pd.DataFrame(pnl_data, columns=columns)

def store_embeddings_in_faiss(df):
    """
    Turn each row of a P&L table into a sentence, embed it and add it to the index.

    Args:
        df: Table providing the Metric, Q1, Q2, Q3, Q4 and Year columns.

    Returns:
        list[str]: The sentences that were embedded, in row order.
    """
    sentences = []
    for _, record in df.iterrows():
        sentences.append(
            f"{record['Metric']} Q1: {record['Q1']}, Q2: {record['Q2']}, Q3: {record['Q3']}, Q4: {record['Q4']}, Year: {record['Year']}"
        )

    # One embedding per sentence, stacked and converted to float32 before
    # being handed to the module-level index.
    embeddings = np.array([model.encode(sentence) for sentence in sentences])
    index.add(embeddings.astype("float32"))
    return sentences

def query_faiss(query, texts):
    """
    Look up the single nearest stored sentence for a query.

    Args:
        query: Text to embed and search for.
        texts: Sentences aligned with the positions stored in the index.

    Returns:
        tuple: (closest sentence from `texts`, its distance as reported by the index).
    """
    embedded = model.encode(query)
    # The index is searched with a single float32 row vector.
    probe = np.array(embedded).astype("float32").reshape(1, -1)
    distances, indices = index.search(probe, k=1)
    best = indices[0][0]
    return texts[best], distances[0][0]

# Streamlit App
st.title("Financial QA Bot")

# File Upload
uploaded_file = st.file_uploader("Upload P&L PDF", type="pdf")

if uploaded_file:
    text = extract_text_from_pdf(uploaded_file)
    pnl_table = parse_pnl_table(text)
    st.write("Parsed P&L Data:")
    st.dataframe(pnl_table)

    if pnl_table.empty:
        # Embedding an empty table fails inside the index; tell the user why
        # nothing can be searched instead of surfacing that error.
        st.warning("No revenue, expenses or profit rows were found in this PDF.")
    else:
        # Store embeddings
        texts = store_embeddings_in_faiss(pnl_table)
        st.success("Embeddings stored successfully!")

        # Query Input
        query = st.text_input("Ask a financial question:")
        if query:
            result, distance = query_faiss(query, texts)
            st.write("Closest Match:", result)
            # The index is an IndexFlatL2, so this is a distance (smaller
            # means closer) rather than a similarity score.
            st.write("Distance (lower is closer):", distance)


In [None]:
from transformers import pipeline
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load models
retrieval_model = SentenceTransformer('all-MiniLM-L6-v2')  # For embeddings
generation_model = pipeline("text-generation", model="distilgpt2")  # For response generation

# FAISS index setup
# Ask the embedding model for its output width instead of hard-coding it, so
# the index stays consistent if the retrieval model is ever swapped.
dimension = retrieval_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

def store_embeddings_in_faiss(df):
    """
    Store embeddings for P&L data in FAISS.

    Each row becomes one sentence, which is embedded with `retrieval_model`.
    The module-level index is cleared before the new vectors are added, so
    after the call index position i always corresponds to the i-th returned
    sentence, even when this function is called again for another table.

    Args:
        df: Table providing the Metric, Q1, Q2, Q3, Q4 and Year columns.

    Returns:
        list[str]: The sentences that were embedded, in row order.

    Raises:
        ValueError: If `df` has no rows.
    """
    texts = [
        f"{row['Metric']} Q1: {row['Q1']}, Q2: {row['Q2']}, Q3: {row['Q3']}, Q4: {row['Q4']}, Year: {row['Year']}"
        for _, row in df.iterrows()
    ]
    if not texts:
        # An empty array cannot be added to the index; fail with a clear message.
        raise ValueError("No P&L rows to embed: the table is empty.")

    vectors = np.array([retrieval_model.encode(text) for text in texts]).astype("float32")
    # Drop vectors left over from an earlier call. Without this, a second call
    # appends to the old vectors and search results point past (or into the
    # wrong entries of) the freshly returned `texts`.
    index.reset()
    index.add(vectors)
    return texts

def query_rag(query, texts):
    """
    Perform RAG: retrieve the closest stored sentence, then generate a response.

    Args:
        query: Question to answer.
        texts: Sentences aligned with the positions stored in the index.

    Returns:
        tuple: (generated_text, retrieved_text), where generated_text is the
        "generated_text" field of the first sequence returned by the
        generation pipeline and retrieved_text is the nearest entry of `texts`.
    """
    # Step 1: Retrieve relevant data
    query_vector = np.array(retrieval_model.encode(query)).astype("float32").reshape(1, -1)
    _, indices = index.search(query_vector, k=1)
    retrieved_text = texts[indices[0][0]]

    # Step 2: Generate response
    input_prompt = f"Query: {query}\nRetrieved Information: {retrieved_text}\nResponse:"
    # Budget only the newly generated tokens. `max_length=50` also counted the
    # prompt, which already holds the query plus a full P&L row, leaving
    # little or no room for the answer itself.
    response = generation_model(input_prompt, max_new_tokens=50, num_return_sequences=1)
    return response[0]["generated_text"], retrieved_text


In [None]:
import streamlit as st
import pandas as pd

# Streamlit front end for the RAG pipeline: upload a P&L PDF, show the parsed
# table, then answer free-text questions against it.
# Relies on extract_text_from_pdf and parse_pnl_table from an earlier cell and
# on store_embeddings_in_faiss / query_rag from the RAG cell.
st.title("Financial QA Bot with RAG")

# File Upload
uploaded_file = st.file_uploader("Upload P&L PDF", type="pdf")

# Everything below only runs once a file has been provided.
if uploaded_file:
    # Parse and store embeddings
    text = extract_text_from_pdf(uploaded_file)
    pnl_table = parse_pnl_table(text)
    st.write("Parsed P&L Data:")
    st.dataframe(pnl_table)

    # `texts` is the list of row sentences; query_rag indexes into it with the
    # position returned by the FAISS search.
    texts = store_embeddings_in_faiss(pnl_table)
    st.success("Embeddings stored successfully!")

    # Query Input
    query = st.text_input("Ask a financial question:")
    if query:
        # query_rag returns (generated text, retrieved sentence), in that order.
        response, retrieved_text = query_rag(query, texts)
        st.write("Retrieved Information:", retrieved_text)
        st.write("Generated Response:", response)


In [None]:
!pip install streamlit pyngrok

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize models
model = SentenceTransformer('all-MiniLM-L6-v2')
# Ask the model for its embedding width instead of hard-coding it, so the
# index stays consistent if the model name above is ever changed.
dimension = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

def store_embeddings_in_faiss(df):
    """
    Turn each row of a P&L table into a sentence, embed it and add it to the index.

    Args:
        df: Table providing the Metric, Q1, Q2, Q3, Q4 and Year columns.

    Returns:
        list[str]: The sentences that were embedded, in row order.
    """
    sentences = []
    for _, record in df.iterrows():
        sentences.append(
            f"{record['Metric']} Q1: {record['Q1']}, Q2: {record['Q2']}, Q3: {record['Q3']}, Q4: {record['Q4']}, Year: {record['Year']}"
        )

    # One embedding per sentence, stacked and converted to float32 before
    # being handed to the module-level index.
    embeddings = np.array([model.encode(sentence) for sentence in sentences])
    index.add(embeddings.astype("float32"))
    return sentences

def query_faiss(query, texts):
    """
    Look up the single nearest stored sentence for a query.

    Args:
        query: Text to embed and search for.
        texts: Sentences aligned with the positions stored in the index.

    Returns:
        tuple: (closest sentence from `texts`, its distance as reported by the index).
    """
    embedded = model.encode(query)
    # The index is searched with a single float32 row vector.
    probe = np.array(embedded).astype("float32").reshape(1, -1)
    distances, indices = index.search(probe, k=1)
    best = indices[0][0]
    return texts[best], distances[0][0]

st.title("Financial QA Bot via Colab")

# File Upload
uploaded_file = st.file_uploader("Upload P&L PDF", type="pdf")
if uploaded_file:
    from PyPDF2 import PdfReader
    reader = PdfReader(uploaded_file)
    # Join pages with a newline so the last line of one page is not fused with
    # the first line of the next before the line-based parsing below.
    # `or ''` keeps the join safe if a page yields no text at all.
    text = "\n".join(page.extract_text() or '' for page in reader.pages)

    # Parse the text into a DataFrame: keep lines mentioning a P&L keyword and
    # use their first six whitespace-separated tokens as the row.
    columns = ["Metric", "Q1", "Q2", "Q3", "Q4", "Year"]
    pnl_data = []
    for line in text.split("\n"):
        if any(keyword in line.lower() for keyword in ["revenue", "expenses", "profit"]):
            cells = line.split()[:len(columns)]
            # Pad short lines. Without this pandas raises ValueError whenever
            # every matching line has fewer tokens than there are columns.
            cells += [None] * (len(columns) - len(cells))
            pnl_data.append(cells)
    pnl_table = pd.DataFrame(pnl_data, columns=columns)
    st.write("Parsed P&L Data:")
    st.dataframe(pnl_table)

    if pnl_table.empty:
        # Embedding an empty table fails inside the index; tell the user why
        # nothing can be searched instead of surfacing that error.
        st.warning("No revenue, expenses or profit rows were found in this PDF.")
    else:
        # Store embeddings
        texts = store_embeddings_in_faiss(pnl_table)
        st.success("Embeddings stored successfully!")

        # Query Input
        query = st.text_input("Ask a financial question:")
        if query:
            result, distance = query_faiss(query, texts)
            st.write("Closest Match:", result)
            # The index is an IndexFlatL2, so this is a distance (smaller
            # means closer) rather than a similarity score.
            st.write("Distance (lower is closer):", distance)


In [None]:
# Start Streamlit app
# Launched with a trailing `&` and its output redirected to /dev/null, so any
# startup error from app.py will not be shown in this cell.
!streamlit run app.py &>/dev/null&

# Expose Streamlit app to the web
# Opens an SSH remote forward from port 80 on serveo.net to localhost:8501
# (assumes Streamlit is listening on 8501 — TODO confirm).
# NOTE(review): StrictHostKeyChecking=no disables SSH host-key verification.
!ssh -o StrictHostKeyChecking=no -R 80:localhost:8501 serveo.net
