<a href="https://colab.research.google.com/github/maciejjkowara/semantic_search/blob/main/semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages (run once per runtime).
# The leading `!` is IPython/Colab shell syntax, so this cell only works inside a notebook.
# Pinned alternatives, currently disabled.
# NOTE(review): presumably a workaround for a faiss-cpu / NumPy version
# incompatibility — confirm before re-enabling.
# !pip install sentence-transformers
# !pip install faiss-cpu==1.7.4 --force-reinstall
# !pip install numpy==1.24.3 --force-reinstall
!pip install sentence-transformers
!pip install faiss-cpu

In [None]:
# ============================================
# SEMANTIC SEARCH FOR FUND ANALYSES
# ============================================

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import json
import re

# ============================================
# CONFIGURATION
# ============================================

# Which asset class to search. The value is interpolated into the data file
# names (embeddings, FAISS index, metadata JSON and analyses text file).
# NOTE: the search cell further down re-assigns ASSET_CLASS before loading data.
ASSET_CLASS = 'equity'  # Options: 'equity', 'fixed_income', 'allocation'

# Number of results to return (passed as top_k to search_similar_analyses)
TOP_K = 5

# ============================================
# LOAD MODEL AND DATA
# ============================================

print("="*80)
print("LOADING MODEL AND EMBEDDINGS")
print("="*80)

# Load the same model used for embedding
print("\nLoading sentence transformer model...")
embedder = SentenceTransformer(
    'Alibaba-NLP/gte-base-en-v1.5',
    # SECURITY: trust_remote_code=True lets the model repository's custom
    # Python code run on load; only use with a repository you trust.
    trust_remote_code=True,
    # Inference is pinned to the CPU.
    device='cpu'
)
# Maximum input length accepted by the embedder; sentence-transformers
# truncates longer inputs.
embedder.max_seq_length = 4096
print("✓ Model loaded!")

# NOTE: the loading code below is disabled. An active copy of the same steps
# lives in the search cell at the end of the notebook.

# # Load embeddings and FAISS index
# print(f"\nLoading {ASSET_CLASS} embeddings and index...")
# embeddings = np.load(f'morningstar_embeddings_{ASSET_CLASS}_embeddings.npy')
# index = faiss.read_index(f'morningstar_embeddings_{ASSET_CLASS}_index.faiss')
# print(f"✓ Loaded {len(embeddings)} embeddings")

# # Load metadata
# print(f"\nLoading {ASSET_CLASS} metadata...")
# with open(f'{ASSET_CLASS}_analyses_metadata.json', 'r') as f:
#     metadata = json.load(f)
# print(f"✓ Loaded metadata for {len(metadata)} analyses")

# # Load original text for display
# print(f"\nLoading {ASSET_CLASS} analysis texts...")
# with open(f'{ASSET_CLASS}_analyses.txt', 'r', encoding='utf-8') as f:
#     content = f.read()
# analyses_texts = content.split('</analysis>')
# analyses_texts = [a.strip() + '</analysis>' for a in analyses_texts
#                   if a.strip() and '<analysis>' in a]
# print(f"✓ Loaded {len(analyses_texts)} analysis texts")

# ============================================
# HELPER FUNCTIONS
# ============================================

def clean_text_for_embedding(text):
    """Strip the metadata tags from *text* before it is embedded.

    Every ``<author>...</author>`` block is removed first, then every
    ``<date>...</date>`` block (tags and their contents, across newlines).

    Args:
        text: Raw analysis or query text.

    Returns:
        The text without those blocks and with leading/trailing
        whitespace trimmed.
    """
    # Non-greedy so that each tag pair is removed on its own.
    for tag_pattern in (r'<author>.*?</author>', r'<date>.*?</date>'):
        text = re.sub(tag_pattern, '', text, flags=re.DOTALL)
    return text.strip()

def extract_section(text, section_name):
    """Return the contents of the first ``<section_name>`` element in *text*.

    Args:
        text: Analysis text containing XML-like section tags.
        section_name: Tag name to look for, e.g. ``'summary'``. It is matched
            literally; regex metacharacters in the name have no special
            meaning.

    Returns:
        The whitespace-trimmed text between the first opening tag and the
        nearest closing tag, or the string ``"Not available"`` when the
        section is not present.
    """
    # Escape the name so that e.g. 'a.b' cannot match a tag called 'axb'.
    tag = re.escape(section_name)
    match = re.search(f'<{tag}>(.*?)</{tag}>', text, flags=re.DOTALL)
    if match is None:
        return "Not available"
    return match.group(1).strip()

def search_similar_analyses(query_text, top_k=5):
    """Search for the analyses most similar to a free-text query.

    The query is cleaned with ``clean_text_for_embedding``, embedded with the
    module-level ``embedder`` and looked up in the module-level FAISS
    ``index``. Each hit is joined with the module-level ``metadata`` and
    ``analyses_texts`` by position.

    Args:
        query_text: Free-text description to search for.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of dicts, best match first, with the keys ``rank``, ``index``,
        ``distance``, ``similarity_score``, ``fund_name``, ``category``,
        ``asset_class``, ``date``, ``author`` and ``full_text``. The list is
        shorter than ``top_k`` when the index holds fewer vectors.
    """

    # Clean and embed the query (same as we did for the analyses)
    cleaned_query = clean_text_for_embedding(query_text)

    print("\nEmbedding query...")
    query_embedding = embedder.encode(
        [cleaned_query],
        normalize_embeddings=True,
        convert_to_numpy=True
    )

    # Search FAISS index (FAISS expects float32 input)
    print(f"Searching for top {top_k} similar analyses...")
    distances, indices = index.search(query_embedding.astype('float32'), top_k)

    # Compile results
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS fills missing neighbours with label -1 when fewer than
            # top_k vectors are available. Indexing the lists with -1 would
            # silently return the LAST record as a bogus hit, so skip it.
            continue

        distance = float(distance)
        meta = metadata[idx]
        results.append({
            'rank': len(results) + 1,
            'index': idx,
            'distance': distance,
            # Convert distance to similarity.
            # NOTE(review): assumes the index returns L2-style distances
            # (smaller = closer); for an inner-product index the returned
            # values are already similarities — confirm the index type.
            'similarity_score': float(1 / (1 + distance)),
            'fund_name': meta['fund_name'],
            'category': meta['category'],
            'asset_class': meta['asset_class'],
            'date': meta['date'],
            'author': meta['author'],
            'full_text': analyses_texts[idx]
        })

    return results

def print_results(results, query_text):
    """Print a formatted report of search results to stdout.

    Args:
        results: List of result dicts as produced by
            ``search_similar_analyses``; each needs the keys ``rank``,
            ``similarity_score``, ``distance``, ``fund_name``, ``category``,
            ``asset_class``, ``date``, ``author`` and ``full_text``.
        query_text: The query that produced the results; only its first 200
            characters are echoed, always followed by ``...``.

    Returns:
        None.
    """
    heavy_rule = "="*80
    light_rule = "-"*80

    # (heading line, tag name) for every section shown, in display order.
    sections = (
        ("\n[SUMMARY]", 'summary'),
        ("\n[PEOPLE]", 'people'),
        ("\n[PROCESS]", 'process'),
        ("\n[PORTFOLIO]", 'portfolio'),
        ("\n[PERFORMANCE]", 'performance'),
    )

    # Report header
    print("\n" + heavy_rule)
    print("SEARCH RESULTS")
    print(heavy_rule)

    print(f"\nQuery: {query_text[:200]}...")
    print(f"\nFound {len(results)} similar analyses:\n")

    for hit in results:
        # Per-result header with scores and metadata
        print("\n" + heavy_rule)
        print(f"RESULT #{hit['rank']}")
        print(heavy_rule)
        print(f"Similarity Score: {hit['similarity_score']:.3f} | Distance: {hit['distance']:.4f}")
        print(f"Fund: {hit['fund_name']}")
        print(f"Category: {hit['category']} | Asset Class: {hit['asset_class']}")
        print(f"Date: {hit['date']} | Author: {hit['author']}")
        print("\n" + light_rule)
        print("FULL ANALYSIS:")
        print(light_rule)

        # Body: one heading plus extracted text per section
        analysis = hit['full_text']
        for heading, tag in sections:
            print(heading)
            print(extract_section(analysis, tag))

        print("\n" + heavy_rule + "\n")

In [None]:
# Free-text description of the fund to look for. It is embedded and matched
# against the stored analyses; only the first 200 characters are echoed back
# when results are printed.
query = """
Value, but not deep value.
Large caps, dividend paying stocks.
Definition of value somewhat subjective--not just valuation metrics, but value relative to growth potential.
Historically has placed close to the border of large value and large blend boxes, typically megacaps.
Most recent positioning: overweight financials, utilities; underweight materials.
Process has been consistent for the past many years.
Performance has been good relative to peers across most trailing period returns. But it failed to outperform the Russell 1000 Value index for the past 10 and 15 years.
Management has been stable for years and the supporting team is well staffed and experienced.
"""

In [None]:

ASSET_CLASS = 'equity'

# Load the embeddings, FAISS index, metadata and raw analysis texts for the
# selected ASSET_CLASS. search_similar_analyses reads `index`, `metadata` and
# `analyses_texts` as module-level names.
embeddings = np.load(f'morningstar_embeddings_{ASSET_CLASS}_embeddings.npy')
index = faiss.read_index(f'morningstar_embeddings_{ASSET_CLASS}_index.faiss')
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(f'{ASSET_CLASS}_analyses_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)
with open(f'{ASSET_CLASS}_analyses.txt', 'r', encoding='utf-8') as f:
    content = f.read()
# Split on the closing tag, then restore it on every chunk that really
# contains an analysis (this drops the trailing remainder after the last tag).
analyses_texts = content.split('</analysis>')
analyses_texts = [a.strip() + '</analysis>' for a in analyses_texts
                  if a.strip() and '<analysis>' in a]

# Search hits are joined to metadata and texts purely by position, so all four
# sources must describe the same number of analyses. Fail fast instead of
# printing a result whose metadata or text belongs to a different fund.
if not (len(embeddings) == index.ntotal == len(metadata) == len(analyses_texts)):
    raise ValueError(
        f"Misaligned {ASSET_CLASS} data: {len(embeddings)} embeddings, "
        f"{index.ntotal} indexed vectors, {len(metadata)} metadata records, "
        f"{len(analyses_texts)} analysis texts"
    )

results = search_similar_analyses(query, top_k=TOP_K)
print_results(results, query)