# In [None]:  (Jupyter notebook cell marker left over from export)
import os
import re
import tempfile
from typing import List, Dict, Any

import dateutil.parser
import numpy as np
import pandas as pd
import pinecone
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from sklearn.preprocessing import MinMaxScaler

class FinancialReportAnalyzer:
    """Pull headline financial metrics out of report text and chart them.

    The analyzer works on plain text: it searches for a small set of labelled
    figures (revenue, net income, EPS, operating expenses) with regular
    expressions, groups the results by the quarter label found in the same
    text, and builds Plotly figures from the resulting table.
    """

    def __init__(self):
        # Metric name -> list of regex patterns, tried in order. Each pattern
        # captures the numeric part in group 1. A trailing "million"/"billion"
        # is matched but NOT captured, so the unit is not applied to the value.
        self.financial_patterns = {
            'revenue': [
                r'(?:Total\s*)?[Rr]evenue[:]*\s*\$?([\d,.]+)(?:\s*million|\s*billion)?',
                r'[Rr]evenue\s*(?:for\s*the\s*period)?\s*[:]*\s*\$?([\d,.]+)(?:\s*million|\s*billion)?'
            ],
            'net_income': [
                r'(?:Net\s*)?[Ii]ncome[:]*\s*\$?([\d,.]+)(?:\s*million|\s*billion)?',
                r'(?:Net\s*)?[Pp]rofit[:]*\s*\$?([\d,.]+)(?:\s*million|\s*billion)?'
            ],
            'eps': [
                r'[Ee]arnings\s*[Pp]er\s*[Ss]hare[:]*\s*\$?([\d,.]+)',
                r'EPS[:]*\s*\$?([\d,.]+)'
            ],
            'operating_expenses': [
                r'[Oo]perating\s*[Ee]xpenses[:]*\s*\$?([\d,.]+)(?:\s*million|\s*billion)?'
            ]
        }

    def extract_financial_metrics(self, text: str) -> Dict[str, float]:
        """
        Extract the first parseable value for each known metric from ``text``.

        For every metric the patterns are tried in order (case-insensitively)
        and the first one whose captured number converts to ``float`` wins.
        Thousands separators are removed before conversion. The number is
        returned exactly as written: a following "million"/"billion" does not
        scale it.

        Args:
            text: Raw report text to search.

        Returns:
            Mapping of metric name to value. Metrics that were not found, or
            whose captured text is not a valid number, are omitted.
        """
        metrics = {}

        for metric, patterns in self.financial_patterns.items():
            for pattern in patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if not match:
                    continue

                # The capture group is a greedy run of digits, commas and dots,
                # so it also swallows sentence punctuation ("... $5.2."). Drop
                # thousands separators and trailing dots before converting;
                # otherwise float() rejects the value and the metric is lost.
                value = match.group(1).replace(',', '').rstrip('.')
                try:
                    metrics[metric] = float(value)
                except ValueError:
                    # Not a number after cleaning (e.g. "1.2.3"): try the next pattern.
                    continue
                break  # Stop after first successful extraction

        return metrics

    def parse_quarterly_data(self, documents: List[Any]) -> pd.DataFrame:
        """
        Build a per-quarter table of metrics from a list of documents.

        Each document must expose its text as ``page_content``. A document
        contributes one row only if its text contains a quarter label of the
        form "<word> Quarter|Q Ended|Report <4-digit year>"; documents without
        such a label are skipped even if they contain metrics.

        Args:
            documents: Objects with a ``page_content`` string attribute.

        Returns:
            DataFrame with a ``Quarter`` column ("<word> <year>") plus one
            column per metric found in any row. Empty (no columns) when no
            document carries a quarter label.
        """
        quarterly_data = []

        for doc in documents:
            text = doc.page_content
            metrics = self.extract_financial_metrics(text)

            # The quarter label is matched case-sensitively, unlike the metrics.
            quarter_match = re.search(r'(\w+)\s*(?:Quarter|Q)\s*(?:Ended|Report)\s*(\d{4})', text)
            if quarter_match:
                quarter = f"{quarter_match.group(1)} {quarter_match.group(2)}"
                quarterly_data.append({
                    'Quarter': quarter,
                    **metrics
                })

        return pd.DataFrame(quarterly_data)

    def visualize_financial_trends(self, df: pd.DataFrame):
        """
        Create a normalized trend line chart and a grouped bar chart.

        Only the ``revenue``, ``net_income`` and ``eps`` columns are plotted,
        and only those present in ``df``. For the line chart each metric is
        rescaled on its own with ``MinMaxScaler`` so metrics of different
        magnitude share one axis; the bar chart shows the raw values.

        Args:
            df: Table as produced by ``parse_quarterly_data``; must contain a
                ``Quarter`` column when it has two or more rows.

        Returns:
            ``[line_figure, bar_figure]``, or ``None`` (after showing a
            Streamlit warning) when ``df`` has fewer than two rows.
        """
        # Prevent errors if not enough data
        if len(df) < 2:
            st.warning("Not enough data for trend analysis")
            return None

        scaler = MinMaxScaler()
        normalized_columns = ['revenue', 'net_income', 'eps']

        # Line chart for key metrics (each metric normalized independently)
        fig = go.Figure()
        for metric in normalized_columns:
            if metric in df.columns:
                # fit_transform refits the scaler, so each metric gets its own range.
                normalized_data = scaler.fit_transform(df[[metric]])
                fig.add_trace(go.Scatter(
                    x=df['Quarter'],
                    y=normalized_data.flatten(),
                    mode='lines+markers',
                    name=metric.replace('_', ' ').title()
                ))

        fig.update_layout(
            title='Normalized Financial Metrics Trends',
            xaxis_title='Quarter',
            yaxis_title='Normalized Value',
            height=600
        )

        # Comparative bar chart on the raw (un-normalized) values
        fig2 = go.Figure()
        for metric in normalized_columns:
            if metric in df.columns:
                fig2.add_trace(go.Bar(
                    x=df['Quarter'],
                    y=df[metric],
                    name=metric.replace('_', ' ').title()
                ))

        fig2.update_layout(
            title='Quarterly Financial Metrics Comparison',
            xaxis_title='Quarter',
            yaxis_title='Value',
            barmode='group',
            height=600
        )

        return [fig, fig2]

def main():
    """Run the Streamlit UI: upload PDF reports, extract metrics, show trends.

    For each uploaded PDF the file is written to a unique temporary path
    (PyPDFLoader is given a filesystem path), loaded, split into overlapping
    chunks, and the chunks are passed to ``FinancialReportAnalyzer``. The
    extracted quarterly table, trend charts and quarter-over-quarter growth
    rates are then rendered. The free-text query box is a placeholder; no LLM
    is called yet.
    """
    st.title("Advanced Financial Report Analysis System")

    # Initialize analyzer
    analyzer = FinancialReportAnalyzer()

    # File upload with multiple file support
    uploaded_files = st.file_uploader(
        "Upload Financial Reports (PDFs)",
        type="pdf",
        accept_multiple_files=True
    )

    if not uploaded_files:
        return

    # The splitter configuration does not depend on the file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )

    all_documents = []

    # Process each uploaded file
    for uploaded_file in uploaded_files:
        with st.spinner(f"Processing {uploaded_file.name}..."):
            # Use a unique temp file per upload: a fixed name in the working
            # directory can be overwritten by another session mid-processing.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(uploaded_file.getvalue())
                tmp_path = tmp.name

            try:
                documents = PyPDFLoader(tmp_path).load()
                split_docs = text_splitter.split_documents(documents)
            finally:
                # Always clean up, even if the PDF cannot be parsed.
                os.remove(tmp_path)

            all_documents.extend(split_docs)

    # Analyze quarterly trends
    quarterly_df = analyzer.parse_quarterly_data(all_documents)

    # Display DataFrame
    st.subheader("Extracted Quarterly Financial Data")
    st.dataframe(quarterly_df)

    # Visualize trends
    trend_charts = analyzer.visualize_financial_trends(quarterly_df)

    if trend_charts:
        st.subheader("Financial Trends Visualization")
        for chart in trend_charts:
            st.plotly_chart(chart)

    # Comparative analysis
    st.subheader("Comparative Financial Analysis")
    # A metric column exists only if it was extracted from at least one
    # document, so compute growth only for the columns that are present
    # instead of indexing unconditionally (which raises KeyError).
    growth_sources = {
        'Revenue Growth': 'revenue',
        'Net Income Growth': 'net_income'
    }
    metric_comparisons = {
        label: quarterly_df[column].pct_change() * 100
        for label, column in growth_sources.items()
        if column in quarterly_df.columns
    }

    if metric_comparisons:
        growth_df = pd.DataFrame(metric_comparisons)
        st.dataframe(growth_df)
    else:
        st.info("No revenue or net income figures were extracted, so growth rates cannot be computed.")

    # Detailed query interface
    st.subheader("Detailed Financial Insights")
    query = st.text_input("Ask a detailed question about the financial reports:")

    if query:
        with st.spinner("Generating insights..."):
            # Here you would integrate an LLM for detailed querying
            # Placeholder for LLM-based insights generation
            st.write("Insights generation placeholder")

# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()