<a href="https://colab.research.google.com/github/oluwafemidiakhoa/Mindserach/blob/master/ETF_and_Mutual_Fund_Search_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install gradio transformers pandas


Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [11]:
# Unpack the uploaded archive and list the files it contained.
import zipfile
import os

# Location of the uploaded zip file and the directory it is extracted into.
# The extraction directory is spelled the same way as the paths the later cells
# read from ('/content/extracted_data/...'), without the stray double slash.
zip_file_path = '/content/Fiance_1.zip'
extracted_dir_path = '/content/extracted_data/'

# extractall() creates the target directory if it does not exist yet.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir_path)

# Show what was extracted (the bare last expression is the cell's output).
extracted_files = os.listdir(extracted_dir_path)
extracted_files


['ETFs.csv',
 'MutualFund prices - F-K.csv',
 'MutualFund prices - Q-Z.csv',
 'MutualFunds.csv',
 'MutualFund prices - L-P.csv',
 'ETF prices.csv',
 'MutualFund prices - A-E.csv']

In [9]:
# Import necessary libraries for data handling and transformers
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, pipeline

# Fund tables read from the extracted archive.
etfs_df = pd.read_csv('/content/extracted_data/ETFs.csv')
mutual_funds_df = pd.read_csv('/content/extracted_data/MutualFunds.csv')

# Step 3: Text-classification pipeline used later to screen generated text for toxicity.
toxicity_classifier = pipeline(task="text-classification", model="unitary/toxic-bert")

# Step 4: Pre-trained causal language model and its matching tokenizer
# (GPT-2 here, as an example of a publicly available checkpoint).
model_name = "gpt2"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForCausalLM.from_pretrained(model_name),
)

# Step 5: Define the function to search for ETFs or Mutual Funds based on refined user query
def search_financial_data(query, etf_df, mf_df, filters=None):
    """Find ETFs or mutual funds whose symbol or name contains the query's last word.

    The query must contain the literal text "ETF" or "Mutual Fund" (checked in
    that order, case-sensitively). The last whitespace-separated word of the
    query is the search term; it is matched as a plain, case-insensitive
    substring against ``fund_symbol``, ``fund_short_name`` and ``fund_long_name``.

    Args:
        query: Free-text query, e.g. ``"Search for ETF SPY"``.
        etf_df: DataFrame of ETFs with the three name columns above plus any
            column referenced by the supplied filters.
        mf_df: DataFrame of mutual funds with the three name columns above.
        filters: Optional dict of refinements, applied to ETF results only.
            Text keys (case-insensitive substring match): ``fund_category``,
            ``top_holdings``, ``size_type``, ``region``. Minimum keys
            (column >= value): ``min_yield``, ``min_return_1year``,
            ``min_return_3years``, ``min_return_5years``,
            ``min_return_10years``, ``min_sharpe_ratio_5years``.
            Entries whose value is ``None`` or a blank string are skipped, so
            a blank filter does not silently drop rows with missing values.

    Returns:
        A list of row dicts (``to_dict(orient='records')``) for the matches, or
        a message string when nothing matches or the query names neither fund
        type.

    Raises:
        KeyError: If an active filter refers to a column missing from ``etf_df``.
    """
    # filter key -> column searched for a case-insensitive substring
    text_filters = {
        'fund_category': 'fund_category',
        'top_holdings': 'top10_holdings',
        'size_type': 'size_type',
        'region': 'region',
    }
    # filter key -> numeric column that must be >= the supplied minimum
    minimum_filters = {
        'min_yield': 'fund_yield',
        'min_return_1year': 'fund_return_1year',
        'min_return_3years': 'fund_return_3years',
        'min_return_5years': 'fund_return_5years',
        'min_return_10years': 'fund_return_10years',
        'min_sharpe_ratio_5years': 'fund_sharpe_ratio_5years',
    }

    def contains(series, needle):
        # regex=False: user text such as "C++" or "S&P (US)" is literal, not a pattern.
        # na=False: missing values count as "no match" instead of poisoning the mask.
        return series.str.contains(needle, case=False, na=False, regex=False)

    def match_term(df, term):
        """Rows of ``df`` whose symbol, short name or long name contains ``term``."""
        mask = (
            contains(df['fund_symbol'], term)
            | contains(df['fund_short_name'], term)
            | contains(df['fund_long_name'], term)
        )
        return df[mask]

    def is_active(value):
        """A filter value is active unless it is None or a blank string."""
        if value is None:
            return False
        if isinstance(value, str) and not value.strip():
            return False
        return True

    # Check if the query relates to ETFs
    if "ETF" in query:
        result = match_term(etf_df, query.split()[-1])

        active = {key: value for key, value in (filters or {}).items() if is_active(value)}
        for key, column in text_filters.items():
            if key in active:
                result = result[contains(result[column], active[key])]
        for key, column in minimum_filters.items():
            if key in active:
                result = result[result[column] >= active[key]]

        if result.empty:
            return "No matching ETFs found."
        return result.to_dict(orient='records')

    # Check if the query relates to Mutual Funds (filters are not applied here)
    elif "Mutual Fund" in query:
        result = match_term(mf_df, query.split()[-1])

        if result.empty:
            return "No matching Mutual Funds found."
        return result.to_dict(orient='records')

    return "Query unclear, please specify whether you are searching for ETFs or Mutual Funds."

# Step 7: Define the backtracking mechanism with classifier and financial data
def generate_with_backtracking(prompt, model, tokenizer, classifier, etf_df, mf_df, filters=None):
    """Answer a prompt from the fund tables, or fall back to screened text generation.

    The prompt is first passed to ``search_financial_data``. Unless that search
    reports the query as unclear, its result (a list of row dicts or a
    "no match" message) is returned as-is. Otherwise up to 50 new tokens are
    sampled from ``model`` and the decoded text is checked with ``classifier``.

    Args:
        prompt: User query / generation prompt.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        classifier: Callable returning a list of ``{'label', 'score'}`` dicts
            for a text.
        etf_df: ETF table forwarded to ``search_financial_data``.
        mf_df: Mutual fund table forwarded to ``search_financial_data``.
        filters: Optional filter dict forwarded to ``search_financial_data``.

    Returns:
        The search result, the generated text, or — when the generated text is
        classified as toxic with score above 0.5 — only the reset notice.
    """
    unclear_message = "Query unclear, please specify whether you are searching for ETFs or Mutual Funds."

    # Step 7.1: Search financial data first if relevant to prompt
    financial_results = search_financial_data(prompt, etf_df, mf_df, filters)
    if financial_results != unclear_message:
        return financial_results  # Return financial data (or the "no match" message)

    # Step 7.2: Otherwise, generate text. The attention mask and an explicit pad
    # token id are passed so generation does not have to guess them (GPT-2
    # defines no pad token, so the EOS id is used as the fallback).
    inputs = tokenizer(prompt, return_tensors="pt")
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs.get('attention_mask'),
        pad_token_id=pad_token_id,
        max_new_tokens=50,
        do_sample=True,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

    # Step 7.3: Use classifier to detect unsafe content
    classification = classifier(generated_text)

    # The classifier scores the text as a whole and gives no position for the
    # unsafe part, so the only safe reset point is the very beginning: discard
    # the flagged text entirely. (Searching the text for the label name, as
    # before, usually found nothing and trimmed just the final character.)
    if classification[0]['label'] == 'toxic' and classification[0]['score'] > 0.5:  # Threshold for toxicity
        return "[RESET] This response has been adjusted for safety."

    return generated_text

# Step 8: Example query together with the refinements applied to it
prompt = "Search for ETF SPY"
filters = dict(
    fund_category='Large Blend',      # fund category to match
    min_yield=0.01,                   # at least 1% yield
    min_return_1year=0.1,             # at least 10% return over 1 year
    top_holdings='Apple',             # must hold Apple
    size_type='Large',                # large-cap funds
    region='US',                      # US-based ETFs
    min_return_3years=0.05,           # at least 5% return over 3 years
    min_sharpe_ratio_5years=0.5,      # Sharpe ratio of 0.5 or more over 5 years
)

# Step 9: Run the search / generation pipeline with those filters
generated_response = generate_with_backtracking(
    prompt, model, tokenizer, toxicity_classifier, etfs_df, mutual_funds_df, filters
)

# Step 10: A list means fund records came back; show the key columns as a table.
# Anything else is a message or generated text and is printed verbatim.
if not isinstance(generated_response, list):
    print("Generated Response with Backtracking: ", generated_response)
else:
    display_columns = [
        'fund_symbol',
        'fund_long_name',
        'region',
        'currency',
        'fund_category',
        'fund_yield',
        'top10_holdings',
        'fund_return_1year',
        'fund_return_5years',
    ]
    df_display = pd.DataFrame(generated_response)
    df_display = df_display[display_columns]

    # Plain-text rendering of the table
    print(df_display)








  fund_symbol   fund_long_name region currency fund_category  fund_yield  \
0        SPYX  BFS Equity Fund     US      USD   Large Blend      0.0114   

                                      top10_holdings  fund_return_1year  \
0  MSFT ("Microsoft Corp"): 0.0495, AMZN ("Amazon...              0.409   

   fund_return_5years  
0               0.185  


In [13]:
## Step 1: Import necessary libraries
import pandas as pd
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Read the ETF and mutual fund tables from the extracted archive
etfs_df = pd.read_csv('/content/extracted_data/ETFs.csv')
mutual_funds_df = pd.read_csv('/content/extracted_data/MutualFunds.csv')

# Step 3: Toxicity screen applied to any text the language model generates
toxicity_classifier = pipeline(task="text-classification", model="unitary/toxic-bert")

# Step 4: Public GPT-2 checkpoint: tokenizer first, then the causal LM itself
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 5: Define the function to search for ETFs or Mutual Funds based on refined user query
def search_financial_data(query, etf_df, mf_df, filters=None):
    """Find ETFs or mutual funds whose symbol or name contains the query's last word.

    The query must contain the literal text "ETF" or "Mutual Fund" (checked in
    that order, case-sensitively). The last whitespace-separated word of the
    query is the search term; it is matched as a plain, case-insensitive
    substring against ``fund_symbol``, ``fund_short_name`` and ``fund_long_name``.

    Args:
        query: Free-text query, e.g. ``"Search for ETF SPY"``.
        etf_df: DataFrame of ETFs with the three name columns above plus any
            column referenced by the supplied filters.
        mf_df: DataFrame of mutual funds with the three name columns above.
        filters: Optional dict of refinements, applied to ETF results only.
            Text keys (case-insensitive substring match): ``fund_category``,
            ``top_holdings``, ``size_type``, ``region``. Minimum keys
            (column >= value): ``min_yield``, ``min_return_1year``,
            ``min_return_3years``, ``min_return_5years``,
            ``min_return_10years``, ``min_sharpe_ratio_5years``.
            Entries whose value is ``None`` or a blank string are skipped, so
            a filter left blank does not silently drop rows with missing values.

    Returns:
        A list of row dicts (``to_dict(orient='records')``) for the matches, or
        a message string when nothing matches or the query names neither fund
        type.

    Raises:
        KeyError: If an active filter refers to a column missing from ``etf_df``.
    """
    # filter key -> column searched for a case-insensitive substring
    text_filters = {
        'fund_category': 'fund_category',
        'top_holdings': 'top10_holdings',
        'size_type': 'size_type',
        'region': 'region',
    }
    # filter key -> numeric column that must be >= the supplied minimum
    minimum_filters = {
        'min_yield': 'fund_yield',
        'min_return_1year': 'fund_return_1year',
        'min_return_3years': 'fund_return_3years',
        'min_return_5years': 'fund_return_5years',
        'min_return_10years': 'fund_return_10years',
        'min_sharpe_ratio_5years': 'fund_sharpe_ratio_5years',
    }

    def contains(series, needle):
        # regex=False: user text such as "C++" or "S&P (US)" is literal, not a pattern.
        # na=False: missing values count as "no match" instead of poisoning the mask.
        return series.str.contains(needle, case=False, na=False, regex=False)

    def match_term(df, term):
        """Rows of ``df`` whose symbol, short name or long name contains ``term``."""
        mask = (
            contains(df['fund_symbol'], term)
            | contains(df['fund_short_name'], term)
            | contains(df['fund_long_name'], term)
        )
        return df[mask]

    def is_active(value):
        """A filter value is active unless it is None or a blank string."""
        if value is None:
            return False
        if isinstance(value, str) and not value.strip():
            return False
        return True

    # Check if the query relates to ETFs
    if "ETF" in query:
        result = match_term(etf_df, query.split()[-1])

        active = {key: value for key, value in (filters or {}).items() if is_active(value)}
        for key, column in text_filters.items():
            if key in active:
                result = result[contains(result[column], active[key])]
        for key, column in minimum_filters.items():
            if key in active:
                result = result[result[column] >= active[key]]

        if result.empty:
            return "No matching ETFs found."
        return result.to_dict(orient='records')

    # Mutual fund queries are answered from the mutual fund table (no filters applied)
    elif "Mutual Fund" in query:
        result = match_term(mf_df, query.split()[-1])

        if result.empty:
            return "No matching Mutual Funds found."
        return result.to_dict(orient='records')

    return "Query unclear, please specify whether you are searching for ETFs or Mutual Funds."

# Step 6: Define the backtracking mechanism
def generate_with_backtracking(prompt, model, tokenizer, classifier, etf_df, mf_df, filters=None):
    """Answer a prompt from the fund tables, or fall back to screened text generation.

    The prompt is first passed to ``search_financial_data``. Unless that search
    reports the query as unclear, its outcome is returned as a DataFrame: one
    row per matching fund, or a single ``message`` cell when the search
    returned a "no match" message. Otherwise up to 50 new tokens are sampled
    from ``model`` and the decoded text is checked with ``classifier``.

    Args:
        prompt: User query / generation prompt.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        classifier: Callable returning a list of ``{'label', 'score'}`` dicts
            for a text.
        etf_df: ETF table forwarded to ``search_financial_data``.
        mf_df: Mutual fund table forwarded to ``search_financial_data``.
        filters: Optional filter dict forwarded to ``search_financial_data``.

    Returns:
        A ``pandas.DataFrame`` for search outcomes; otherwise the generated
        text as a string, or only the reset notice when that text is classified
        as toxic with score above 0.5.
    """
    unclear_message = "Query unclear, please specify whether you are searching for ETFs or Mutual Funds."

    financial_results = search_financial_data(prompt, etf_df, mf_df, filters)
    if financial_results != unclear_message:
        if isinstance(financial_results, str):
            # A "no match" message is a bare string; pd.DataFrame(<str>) raises,
            # so wrap it in a one-cell table instead.
            return pd.DataFrame({'message': [financial_results]})
        return pd.DataFrame(financial_results)  # Return financial data in table format

    # Generate text using the language model if the query was not a fund search.
    # The attention mask and an explicit pad token id are passed so generation
    # does not have to guess them (GPT-2 defines no pad token, so the EOS id is
    # used as the fallback).
    inputs = tokenizer(prompt, return_tensors="pt")
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs.get('attention_mask'),
        pad_token_id=pad_token_id,
        max_new_tokens=50,
        do_sample=True,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

    classification = classifier(generated_text)

    # The classifier scores the text as a whole and gives no position for the
    # unsafe part, so the flagged text is discarded entirely. (Searching the
    # text for the label name, as before, usually found nothing and trimmed
    # just the final character.)
    if classification[0]['label'] == 'toxic' and classification[0]['score'] > 0.5:
        return "[RESET] This response has been adjusted for safety."

    return generated_text

# Step 7: Define the function to be used in the Gradio interface
def query_etfs(prompt, fund_category, min_yield, min_return_1year, top_holdings, size_type, region, min_return_3years, min_sharpe_ratio_5years):
    """Gradio callback: run one query with the refinements entered in the form.

    Args:
        prompt: Free-text query, e.g. ``"Search for ETF SPY"``.
        fund_category: Category substring to match, or blank for no restriction.
        min_yield: Minimum fund yield.
        min_return_1year: Minimum 1-year return.
        top_holdings: Holding substring to match, or blank for no restriction.
        size_type: Size-type substring to match, or blank for no restriction.
        region: Region substring to match, or blank for no restriction.
        min_return_3years: Minimum 3-year return.
        min_sharpe_ratio_5years: Minimum 5-year Sharpe ratio.

    Returns:
        A ``pandas.DataFrame`` suitable for a table output: the table produced
        by ``generate_with_backtracking``, or a one-cell ``response`` table
        when it returned plain text.
    """
    form_values = {
        'fund_category': fund_category,
        'min_yield': min_yield,
        'min_return_1year': min_return_1year,
        'top_holdings': top_holdings,
        'size_type': size_type,
        'region': region,
        'min_return_3years': min_return_3years,
        'min_sharpe_ratio_5years': min_sharpe_ratio_5years
    }

    # A field left blank means "no restriction": forward only filters that
    # carry a value, so a blank text filter cannot exclude rows whose column
    # is missing.
    filters = {
        key: value
        for key, value in form_values.items()
        if value is not None and not (isinstance(value, str) and not value.strip())
    }

    response = generate_with_backtracking(prompt, model, tokenizer, toxicity_classifier, etfs_df, mutual_funds_df, filters)

    # The interface renders this function's result in a Dataframe component,
    # so plain text is wrapped in a one-cell table rather than returned as a
    # bare string.
    if isinstance(response, str):
        return pd.DataFrame({'response': [response]})
    return response

# Step 8: Form fields, in the same order as query_etfs' parameters
inputs = [
    gr.Textbox(label="Enter Query (e.g., Search for ETF SPY)", placeholder="Type your query here"),
    gr.Textbox(label="Fund Category (e.g., Large Blend)", placeholder="e.g., Large Blend"),
    gr.Slider(minimum=0.0, maximum=1.0, label="Minimum Yield (e.g., 0.01 for 1%)"),
    gr.Slider(minimum=0.0, maximum=1.0, label="Minimum 1-Year Return (e.g., 0.1 for 10%)"),
    gr.Textbox(label="Top Holdings (e.g., Apple)", placeholder="e.g., Apple"),
    gr.Textbox(label="Size Type (e.g., Large)", placeholder="e.g., Large"),
    gr.Textbox(label="Region (e.g., US)", placeholder="e.g., US"),
    gr.Slider(minimum=0.0, maximum=1.0, label="Minimum 3-Year Return"),
    gr.Slider(minimum=0.0, maximum=1.0, label="Minimum Sharpe Ratio (5 Years)"),
]

# Step 9: Results are rendered as a table
outputs = gr.Dataframe(label="Filtered ETFs/Generated Text")

# Step 10: Wire the callback to the form and start the app
search_app = gr.Interface(
    fn=query_etfs,
    inputs=inputs,
    outputs=outputs,
    title="ETF and Mutual Fund Search App",
)
search_app.launch()





Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://77b8981de9ff1619c6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


