In [1]:
from swarm import Swarm, Agent
import base64
import os
import csv

# Shared Swarm client: process_fund, main and ask_advisor all send their
# agent requests through client.run(...).
client = Swarm()

# Handoff tool for the question generator agent, which lists this function in
# its functions=[...] argument. Returning an Agent is how the handoff target
# is named.
# NOTE(review): Swarm presumably sends this docstring to the model as the tool
# description, so its wording should be treated as runtime text - confirm
# before rewording it.
# data_extractor is assigned further down the file; the name is only looked up
# when the function is called.
def transfer_to_extractor():
    """Transfer control to the data extractor agent"""
    return data_extractor

# Data extractor agent: receives the numbered questions plus factsheet images
# (see process_fund) and answers each question in a fixed line format.
# The "N. Answer: ..." format requested in the instructions is exactly what
# process_fund parses, so the prompt and that parser must be changed together.
# It is also the agent returned by transfer_to_extractor().
data_extractor = Agent(
    name="Data Extractor",
    model="gpt-4o-mini",
    instructions="""You are an expert at extracting EXACT data from fund factsheets.
    
    For each numbered question:
    1. Find the exact data in the factsheet
    2. Format as: X. Answer: <label,value> or <label1,value1>,<label2,value2>,...
    3. Keep the same question numbers
    
    Example:
    1. Answer: <1 Month,1.23%>
    2. Answer: <Microsoft,3.21%>
    3. Answer: <UK Equities,45.2%>,<US Equities,30.1%>
    
    Critical rules:
    - Keep question numbers exactly the same
    - Extract EXACT numbers from factsheet
    - Include % symbol
    - If data isn't shown, write 'NA'
    - Never make up data
    """
)

# Question generator agent: looks at the template factsheet images and emits a
# numbered list of questions ("N. question"), which main() parses line by line
# into a {number: question} map.
# NOTE(review): the prompt asks for "exactly 15-20" questions, which is a range
# rather than an exact count - confirm the intended wording.
# NOTE(review): the agent is told to call transfer_to_extractor() afterwards;
# if that handoff happens, the last message of the run may come from the data
# extractor rather than contain the questions main() reads - verify.
question_generator = Agent(
    name="Question Generator",
    model="gpt-4o-mini",
    instructions="""You are an expert at analyzing fund factsheets. Your task is to OUTPUT A NUMBERED LIST of questions. Do not say you've prepared them - actually list them.
    
    Generate exactly 15-20 specific questions covering:
    1. Performance figures (1m, 3m, 6m, 1y, 3y, 5y)
    2. Top 10 holdings with weights
    3. Asset allocation breakdown
    4. Geographic allocation
    5. Sector breakdown
    6. Credit quality (if applicable)
    7. Duration (if applicable)
    
    Output format MUST be:
    1. What is the 1-month performance figure?
    2. What is the 3-month performance figure?
    3. What is the 6-month performance figure?
    etc.
    
    IMPORTANT: You must output the actual numbered questions, not just say you've prepared them.
    After generating questions, call transfer_to_extractor().
    """,
    # Handoff tool made available to this agent.
    functions=[transfer_to_extractor]
)

def process_fund(fund_dir, questions, question_map):
    """Extract answers for one fund from the factsheet images in its directory.

    Every ``.png``/``.jpg``/``.jpeg`` file in ``fund_dir`` (sorted by name) is
    base64-encoded and sent, together with ``questions``, to the data
    extractor agent in a single user message. Lines of the reply shaped like
    ``N. Answer: ...`` are matched to ``question_map`` by their number.

    Args:
        fund_dir: Directory holding the fund's factsheet page images.
        questions: The numbered question list, as text, to put to the agent.
        question_map: ``{question number (int): question text}``; the question
            text becomes the key of the matching answer.

    Returns:
        ``{'Fund Name': <directory name>, <question text>: <answer>, ...}``,
        or ``None`` when no image could be read from ``fund_dir``.
    """
    print(f"\nProcessing fund: {fund_dir}")
    print("\nDebug - Using Questions:")
    print(questions)
    print("\nDebug - Using Question Map:")
    for num, q in question_map.items():
        print(f"{num}: {q}")

    # Get images
    image_files = sorted(
        f for f in os.listdir(fund_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    print(f"Found {len(image_files)} images in {fund_dir}:")

    # Create list of encoded images; an unreadable file is reported and skipped
    content = []
    for image_file in image_files:
        print(f"  - {image_file}")
        try:
            content.append(_encode_image_part(os.path.join(fund_dir, image_file)))
        except OSError as e:
            print(f"Error encoding image {image_file}: {e}")

    if not content:
        print("No images found or encoded!")
        return None

    # One user message: the question text first, then every image
    message_content = [
        {
            "type": "text",
            "text": f"Please extract answers to these questions:\n\n{questions}"
        }
    ]
    message_content.extend(content)

    response = client.run(
        agent=data_extractor,
        messages=[{
            "role": "user",
            "content": message_content
        }]
    )

    # The last message can carry no text (content of None); treat that as an
    # empty reply instead of failing on .split().
    last_message = response.messages[-1] if response.messages else {}
    raw_reply = last_message.get("content") or ""

    print("\nDebug - Raw Extractor Response:")
    print(raw_reply)

    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty fund name.
    answers = {'Fund Name': os.path.basename(os.path.normpath(fund_dir))}

    print("\nDebug - Parsing Answers:")
    answers.update(_parse_numbered_answers(raw_reply, question_map))

    print("\nDebug - Final Answers Dictionary:")
    for key, value in answers.items():
        print(f"{key}: {value}")

    return answers


def _encode_image_part(image_path):
    """Return an ``image_url`` message part holding the file as a base64 data URL."""
    # Declare the media type that matches the file extension, so a PNG is not
    # labelled as JPEG; unknown extensions fall back to JPEG.
    mime_by_ext = {'.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg'}
    extension = os.path.splitext(image_path)[1].lower()
    mime = mime_by_ext.get(extension, 'image/jpeg')
    with open(image_path, 'rb') as img_file:
        encoded = base64.b64encode(img_file.read()).decode('utf-8')
    return {
        "type": "image_url",
        "image_url": {
            "url": f"data:{mime};base64,{encoded}"
        }
    }


def _parse_numbered_answers(text, question_map):
    """Map the ``N. Answer: ...`` lines of ``text`` to ``{question text: answer}``."""
    parsed = {}
    for line in text.split('\n'):
        line = line.strip()
        # Skip blanks and bold markdown lines such as "**Note**"
        if not line or line.startswith('**'):
            continue
        if not (line[0].isdigit() and '. Answer:' in line):
            continue
        try:
            num = int(line.split('.')[0])
        except ValueError as e:
            print(f"Error parsing line: {line}")
            print(f"Error details: {str(e)}")
            continue
        # Split once so an answer that itself contains "Answer:" stays whole
        answer = line.split('Answer:', 1)[1].strip()
        if num in question_map:
            parsed[question_map[num]] = answer
            print(f"Successfully parsed - Q{num}: {answer}")
        else:
            print(f"Warning: Question number {num} not found in question map")
    return parsed

def main():
    """Run the full extraction: template questions, per-fund answers, CSV output.

    Steps:
        1. Send every image in ``factsheet_definition`` to the question
           generator agent and parse its numbered reply into
           ``{number: question}`` (commas in a question become underscores).
        2. Run ``process_fund`` on each sub-directory of ``factsheets`` in
           sorted order and write one row per fund to
           ``extracted_fund_data.csv``.

    The CSV header is ``Fund Name`` followed by every parsed question, so a
    fund that answers a question the first fund skipped still fits the header;
    questions a fund did not answer are left empty.

    Any exception is caught and printed with its traceback; nothing is
    re-raised to the caller.
    """
    try:
        # Step 1: Generate questions from template factsheets
        print("Generating questions from template factsheets...")
        template_dir = "factsheet_definition"
        # Media type to declare per extension, so a PNG is not labelled as JPEG
        mime_by_ext = {'.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg'}

        # Get template images
        template_files = sorted(
            f for f in os.listdir(template_dir)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )

        print(f"\nFound {len(template_files)} template images:")

        # One user message: the instruction text first, then every image
        template_content = [
            {
                "type": "text",
                "text": "Please analyze these template factsheet images and generate numbered questions to extract key data."
            }
        ]

        for image_file in template_files:
            print(f"  - {image_file}")
            image_path = os.path.join(template_dir, image_file)
            extension = os.path.splitext(image_file)[1].lower()
            mime = mime_by_ext.get(extension, 'image/jpeg')
            try:
                with open(image_path, 'rb') as img_file:
                    encoded = base64.b64encode(img_file.read()).decode('utf-8')
            except OSError as e:
                print(f"Error encoding template image {image_file}: {e}")
                continue
            template_content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime};base64,{encoded}"
                }
            })

        if len(template_content) <= 1:  # Only has text, no images
            raise Exception("No template images found!")

        print("\nRequesting questions generation...")
        questions_response = client.run(
            agent=question_generator,
            messages=[{
                "role": "user",
                "content": template_content
            }]
        )
        # The last message can carry no text (content of None)
        questions = questions_response.messages[-1]["content"] or ""
        print("\nDebug - Generated Questions Raw Response:")
        print(questions)
        print("-" * 80)

        # Parse the questions once
        master_question_map = {}
        print("\nDebug - Parsing Questions:")
        for line in questions.split('\n'):
            line = line.strip()
            if not (line and line[0].isdigit() and '.' in line):
                continue
            number_text, _, question = line.partition('.')
            try:
                num = int(number_text)
            except ValueError as e:
                print(f"Error parsing question line: {line}")
                print(f"Error details: {str(e)}")
                continue
            master_question_map[num] = question.strip().replace(',', '_')
            print(f"Successfully parsed Q{num}: {master_question_map[num]}")

        print("\nDebug - Final Question Map:")
        for num, question in master_question_map.items():
            print(f"{num}: {question}")

        if not master_question_map:
            # Without questions every fund would yield an empty row
            raise Exception("No numbered questions could be parsed from the generator response!")

        # Step 2: Process each fund using generated questions
        factsheets_dir = "factsheets"
        # Sorted so the row order does not depend on os.listdir() ordering
        fund_dirs = sorted(
            os.path.join(factsheets_dir, d) for d in os.listdir(factsheets_dir)
            if os.path.isdir(os.path.join(factsheets_dir, d))
        )

        print(f"\nFound {len(fund_dirs)} funds to process")
        if not fund_dirs:
            raise Exception(f"No fund directories found in {factsheets_dir}!")

        # Process the first fund before touching the CSV, so a failure here
        # does not truncate an existing output file.
        first_answers = process_fund(fund_dirs[0], questions, master_question_map)
        if not first_answers:
            raise Exception("Could not process first fund!")

        print("\nDebug - First Fund Answers:")
        print(first_answers)

        # Header = 'Fund Name' + every question (de-duplicated, order kept).
        # Taking the header from the first fund's answers alone would make
        # DictWriter reject any later fund that answered a different question.
        columns = list(dict.fromkeys(['Fund Name', *master_question_map.values()]))
        print("\nDebug - CSV Columns:")
        print(columns)

        # Write the main data file
        print("\nDebug - Writing to CSV:")
        with open('extracted_fund_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=columns)
            print("Writing headers...")
            csv_writer.writeheader()

            print("Writing first fund...")
            csv_writer.writerow(first_answers)

            # Process remaining funds; a fund with no usable images is skipped
            for fund_dir in fund_dirs[1:]:
                print(f"\nProcessing next fund: {fund_dir}")
                answers = process_fund(fund_dir, questions, master_question_map)
                if answers:
                    print(f"Writing data for: {fund_dir}")
                    print("Data:", answers)
                    csv_writer.writerow(answers)
                    print(f"Saved data for: {os.path.basename(fund_dir)}")

        print("\nAll data has been written to extracted_fund_data.csv")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        import traceback
        print(traceback.format_exc())
# Handoff function that names the fund advisor as the next agent.
# No agent in the part of the file visible here lists it in functions=[...],
# and ask_advisor() calls the fund advisor directly instead.
# NOTE(review): if it is ever registered as a tool, Swarm presumably sends this
# docstring to the model as the tool description - confirm before rewording.
# fund_advisor is assigned just below; the name is only looked up at call time.
def transfer_to_advisor():
    """Transfer control to the fund advisor agent"""
    return fund_advisor

# Fund advisor agent: answers free-text questions about the extracted funds.
# It has no tools of its own; ask_advisor() places the whole CSV, rendered as
# text, into the user message it sends to this agent.
fund_advisor = Agent(
    name="Fund Advisor",
    model="gpt-4o-mini",
    instructions="""You are a helpful fund advisor who can answer questions about multiple funds.
    You have access to detailed fund data and can:
    1. Compare funds
    2. Analyze performance
    3. Explain asset allocations
    4. Recommend funds based on criteria
    5. Explain fund characteristics
    
    Always be precise with numbers and include % symbols where appropriate.
    If data isn't available, clearly state that.
    """
)

def ask_advisor(query):
    """Answer a free-text question about the extracted funds.

    The whole of ``extracted_fund_data.csv`` is rendered as text and sent,
    together with ``query``, to the fund advisor agent as one user message.

    Args:
        query: The question to put to the advisor.

    Returns:
        The content of the last message in the agent's response, or a string
        starting with ``"Error processing query: "`` if anything raised.
    """
    try:
        import pandas as pd

        # Load the extracted data written by main()
        frame = pd.read_csv('extracted_fund_data.csv')

        # Question first, then the full table, then the closing instruction
        prompt_lines = [
            f"Query: {query}",
            "",
            "Available Data:",
            frame.to_string(),
            "",
            "Please provide a clear, accurate response based on the data above.",
        ]
        user_message = {"role": "user", "content": "\n".join(prompt_lines)}

        reply = client.run(agent=fund_advisor, messages=[user_message])
        return reply.messages[-1]["content"]
    except Exception as err:
        return f"Error processing query: {str(err)}"

# Script entry point: run the extraction exactly once, then list the funds
# that ask_advisor() can be queried about.
if __name__ == "__main__":
    # Run the extraction process (a single call; repeating it would redo
    # every agent request)
    main()

    import pandas as pd

    # main() prints its errors instead of raising, so the CSV may not exist
    csv_path = 'extracted_fund_data.csv'
    if os.path.exists(csv_path):
        # Then enable the advisor interface
        print("\nData extraction complete! You can now ask questions about the funds.")
        print("\nAvailable funds:")
        df = pd.read_csv(csv_path)
        for fund in df['Fund Name']:
            print(f"- {fund}")
        print("\nYou can use ask_advisor('your question') to query the data.")
    else:
        print(f"\nNo {csv_path} was produced; see the error output above.")

Generating questions from template factsheets...

Found 4 template images:
  - gn20431c-images-0.jpg
  - gn20431c-images-1.jpg
  - gn20431c-images-2.jpg
  - gn20431c-images-3.jpg

Requesting questions generation...

Debug - Generated Questions Raw Response:
1. What is the 1-month performance figure?
2. What is the 3-month performance figure?
3. What is the 6-month performance figure?
4. What is the 1-year performance figure?
5. What is the 3-year performance figure?
6. What is the 5-year performance figure?
7. What are the top 10 holdings in the fund and their respective weights?
8. What is the asset allocation breakdown by percentage for equities, bonds, property, and cash/money market?
9. What is the geographic allocation of the fund's investments (e.g., countries or regions)?
10. What is the sector breakdown of the fund (e.g., technology, healthcare, finance)?
11. What is the credit quality of the bonds held within the fund?
12. What is the duration of the bonds in the fund?
13. Wha

In [13]:
ask_advisor("How many funds are there?")

'There are **102** funds available in the provided data.'

In [14]:
ask_advisor("What is the most common holding across all of the funds? Just return the name of the one holding.")

'The most common holding across all of the funds is **Microsoft Corporation**.'

In [15]:
ask_advisor("What is be best  performing fund across 3 years with higest percentage return?")

'Based on the available data, the best-performing fund over the last 3 years (with the highest percentage return) is:\n\n**1. Aviva_Pension_MyM_HSBC_Islamic_Global_Equity_Index_[Unnamed_Life-Pension_fund_unit]**\n   - **3-Year Performance**: **38.74%**\n\n**2. Aviva_Pension_MyM_Vanguard_US_Equity_Index_[Unnamed_Life-Pension_fund_unit]**\n   - **3-Year Performance**: **38.31%**\n\n**3. Aviva_Pension_MyM_Schroder_Sustainable_Future_Multi_Asset_[Unnamed_Life-Pension_fund_unit]**\n   - **3-Year Performance**: **14.54%**\n\nThese funds reflect the highest three-year returns among the funds listed.'