In [None]:
import re
from datetime import datetime

import spacy

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# extract_investment_info() calls this shared object to parse each prompt.
nlp = spacy.load("en_core_web_sm")


# Substrings whose presence anywhere in the prompt means the user wants to
# exclude sectors (plain substring test, same as the original behaviour).
_AVOID_KEYWORDS = ("avoids", "avoid", "not invest in")

# Sectors that may be reported under "avoid_sectors".
_AVOIDABLE_SECTORS = (
    "real estate",
    "construction",
    "trade",
    "service",
    "finance",
    "crypto assets",
    "industrial applications",
    "manufacturing",
    "energy",
    "transportation",
)

# Sectors that may be reported under "invest_sectors".
_INVESTABLE_SECTORS = ("technology", "healthcare", "energy")  # Add more sectors as needed

# One alternation over every known sector. Matching phrases in the raw text
# (instead of comparing single tokens) is what lets multi-word sectors such as
# "real estate" be found at all. Longer phrases are listed first, and inner
# whitespace is matched loosely so line breaks or double spaces do not matter.
_SECTOR_RE = re.compile(
    r"\b(?:"
    + "|".join(
        r"\s+".join(re.escape(word) for word in phrase.split())
        for phrase in sorted(
            set(_AVOIDABLE_SECTORS) | set(_INVESTABLE_SECTORS),
            key=lambda phrase: (-len(phrase), phrase),
        )
    )
    + r")\b"
)

# English ordinal suffix directly after a digit ("1st", "2nd", "3rd", "4th").
_ORDINAL_SUFFIX_RE = re.compile(r"(?<=\d)(?:st|nd|rd|th)\b", re.IGNORECASE)

# Date layouts accepted once the ordinal suffix has been stripped.
_DATE_FORMATS = (
    "%B %d, %Y",
    "%B %d %Y",
    "%b %d, %Y",
    "%b %d %Y",
    "%d %B %Y",
    "%d %b %Y",
    "%Y-%m-%d",
)

_ISO_DATE = "%Y-%m-%d"


def _convert_date_format(date_str):
    """Return *date_str* as ``YYYY-MM-DD``, or ``None`` if it cannot be parsed.

    Ordinal suffixes are removed first, so "March 1st, 2021" and
    "March 5th, 2021" are both accepted, as are the suffix-less layouts
    listed in ``_DATE_FORMATS``.
    """
    cleaned = _ORDINAL_SUFFIX_RE.sub("", date_str.strip())
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(cleaned, fmt).strftime(_ISO_DATE)
        except ValueError:
            continue
    return None


def _extract_dates_and_budget(doc):
    """Pull the raw budget, start-date and end-date strings out of *doc*.

    Returns a ``(budget, start_date, end_date)`` tuple of entity texts; each
    element is ``None`` when nothing suitable was found.
    """
    money_entities = [ent for ent in doc.ents if ent.label_ == "MONEY"]
    date_texts = [ent.text for ent in doc.ents if ent.label_ == "DATE"]

    start_date = None
    end_date = None
    # With two or more DATE entities the first is the start and the second the
    # end; with exactly three, the third one is taken as the end instead.
    if len(date_texts) >= 2:
        start_date, end_date = date_texts[0], date_texts[1]
    if len(date_texts) == 3:
        end_date = date_texts[2]

    # The budget is the MONEY entity whose start is closest to the word
    # "budget"; without that word no budget is reported.
    budget = None
    budget_pos = doc.text.lower().find("budget")
    if budget_pos != -1 and money_entities:
        closest = min(
            money_entities, key=lambda ent: abs(budget_pos - ent.start_char)
        )
        budget = closest.text

    return budget, start_date, end_date


def _extract_sectors(text):
    """Return ``(avoid_sectors, invest_sectors)`` mentioned in *text*.

    Sectors are reported lower-cased, once per mention, in order of
    appearance. When the text contains an avoid keyword, every mentioned
    sector from ``_AVOIDABLE_SECTORS`` goes to the avoid list; any other
    mentioned sector from ``_INVESTABLE_SECTORS`` goes to the invest list.
    """
    lowered = text.lower()
    # Loop-invariant: computed once rather than once per match.
    avoid_requested = any(keyword in lowered for keyword in _AVOID_KEYWORDS)

    avoid_sectors = []
    invest_sectors = []
    for match in _SECTOR_RE.finditer(lowered):
        sector = " ".join(match.group().split())  # collapse inner whitespace
        if avoid_requested and sector in _AVOIDABLE_SECTORS:
            avoid_sectors.append(sector)
        elif sector in _INVESTABLE_SECTORS:
            invest_sectors.append(sector)

    return avoid_sectors, invest_sectors


def extract_investment_info(prompt):
    """Extract budget, date range and sector preferences from a text prompt.

    The prompt is parsed with the module-level spaCy pipeline ``nlp``; MONEY
    and DATE entities supply the budget and dates, and sector names are
    matched as phrases in the prompt text.

    Args:
        prompt: Free-text description of the investment request.

    Returns:
        A dict with the keys:
            "budget": text of the MONEY entity nearest the word "budget",
                or ``None``.
            "start_date": start date as ``YYYY-MM-DD``, or ``None``.
            "end_date": end date as ``YYYY-MM-DD``; today's date when the
                prompt supplies no end date, ``None`` when one was found but
                could not be parsed.
            "avoid_sectors": list of sectors to exclude.
            "invest_sectors": list of sectors to invest in.
    """
    doc = nlp(prompt)
    budget, start_text, end_text = _extract_dates_and_budget(doc)

    # The first DATE entity may not be a calendar date at all (for example an
    # age such as "30 years old"). If it cannot be converted, drop it from the
    # prompt and parse once more so the remaining dates shift into place.
    if start_text and _convert_date_format(start_text) is None:
        prompt = prompt.replace(start_text, "")
        doc = nlp(prompt)
        budget, start_text, end_text = _extract_dates_and_budget(doc)

    start_date = _convert_date_format(start_text) if start_text else None
    if end_text:
        end_date = _convert_date_format(end_text)
    else:
        # No end date in the prompt: default to the current date. It is built
        # directly in the output format so it is not lost in conversion.
        end_date = datetime.now().strftime(_ISO_DATE)

    avoid_sectors, invest_sectors = _extract_sectors(prompt)

    return {
        "budget": budget,
        "start_date": start_date,
        "end_date": end_date,
        "avoid_sectors": avoid_sectors,
        "invest_sectors": invest_sectors,
    }