In [None]:
"""
Semantic Similarity Search - CSV Data Version
Load your CSV file and search for burnout-related content
"""

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')


class CSVSemanticSearcher:
    """
    Semantic search tool that works directly with CSV files.

    Typical workflow: ``load_csv()`` -> ``encode_data()`` -> ``search()`` or
    ``multi_search()`` -> ``show_summary()``.

    Attributes:
        model: SentenceTransformer used to embed both documents and queries.
        df: DataFrame set by load_csv(), or None before a successful load.
        embeddings: Document embeddings set by encode_data(), or None when
            the currently loaded data has not been encoded yet.
        text_column: Name of the column being searched, or None.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the semantic searcher.

        Args:
            model_name:
                - 'all-MiniLM-L6-v2' (RECOMMENDED - fast, balanced)
                - 'all-mpnet-base-v2' (more accurate, slower)
                - 'multi-qa-MiniLM-L6-cos-v1' (optimized for search)
        """
        print(f"Loading model: {model_name}...")
        self.model = SentenceTransformer(model_name)
        print("✓ Model loaded successfully!\n")

        self.df = None
        self.embeddings = None
        self.text_column = None

    def load_csv(self,
                 csv_path: str,
                 text_column: str,
                 encoding: str = 'utf-8',
                 **kwargs):
        """
        Load CSV file and prepare for search.

        The searcher's state is only replaced once the file has been read and
        the column validated, so a failed call leaves any previously loaded
        data untouched. A successful call discards existing embeddings,
        because they describe the previous data; call encode_data() again.

        Args:
            csv_path: Path to your CSV file
            text_column: Name of the column containing text to search
            encoding: File encoding (default 'utf-8'); on a UnicodeDecodeError
                the file is automatically re-read as 'latin-1'
            **kwargs: Additional arguments for pd.read_csv()

        Raises:
            ValueError: If text_column is not a column of the CSV.

        Example:
            searcher.load_csv('data.csv', text_column='comments')
            searcher.load_csv('data.csv', text_column='feedback', encoding='latin-1')
        """
        print(f"Loading CSV: {csv_path}")

        try:
            df = pd.read_csv(csv_path, encoding=encoding, **kwargs)
        except UnicodeDecodeError:
            print(f" Encoding error with '{encoding}', trying 'latin-1'...")
            df = pd.read_csv(csv_path, encoding='latin-1', **kwargs)
        print(f" Loaded {len(df):,} rows")

        # Validate column exists
        if text_column not in df.columns:
            # Column labels are not always strings (e.g. header=None gives
            # integers), so stringify them before joining.
            available_cols = ', '.join(str(col) for col in df.columns)
            raise ValueError(
                f"Column '{text_column}' not found!\n"
                f"Available columns: {available_cols}"
            )

        # Handle missing values
        missing_count = df[text_column].isna().sum()
        if missing_count > 0:
            print(f" Found {missing_count} missing values - filling with empty strings")
            df[text_column] = df[text_column].fillna('')

        # Convert to string
        df[text_column] = df[text_column].astype(str)

        # Commit the new state only now that everything succeeded.
        self.df = df
        self.text_column = text_column
        self.embeddings = None  # embeddings of the previous data are stale

        print(f" Using column: '{text_column}'")
        print(f"\nSample data:")
        for row_number, text in enumerate(df[text_column].head(3), start=1):
            preview = text[:100] + "..." if len(text) > 100 else text
            print(f"  Row {row_number}: {preview}")
        print()

    def encode_data(self, batch_size: int = 32, show_progress: bool = True):
        """
        Encode all texts for semantic search.
        This step takes time but only needs to be done ONCE per loaded CSV!

        Args:
            batch_size: Number of texts to encode at once (increase if you have GPU)
            show_progress: Show progress bar

        Raises:
            ValueError: If no CSV has been loaded yet.
        """
        if self.df is None:
            raise ValueError("No data loaded! Call load_csv() first.")

        texts = self.df[self.text_column].tolist()

        print(f"Encoding {len(texts):,} documents...")
        print("(This may take a few minutes - grab a coffee! ☕)")

        self.embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            convert_to_tensor=True,
            show_progress_bar=show_progress
        )

        print(f"✓ Encoding complete!\n")

    def _run_query(self,
                   query: str,
                   top_k: int,
                   min_score: float,
                   return_full_data: bool):
        """
        Run one query and return ``(results_df, positions)``.

        ``positions`` holds the 0-based row positions in ``self.df`` of the
        rows in ``results_df``, in the same order. multi_search() uses them to
        de-duplicate hits, because the index of ``results_df`` does not
        identify a row reliably (it is reset when return_full_data is False
        and may contain duplicate labels otherwise).
        """
        if self.embeddings is None:
            raise ValueError("Data not encoded! Call encode_data() first.")

        print(f" Searching for: '{query}'")

        # Encode query
        query_embedding = self.model.encode(query, convert_to_tensor=True)

        # Calculate similarities
        similarities = util.cos_sim(query_embedding, self.embeddings)[0]

        # Get top results; clamp k so it is never negative nor larger than
        # the number of encoded documents.
        k = max(0, min(top_k, len(similarities)))
        top_results = similarities.topk(k=k)

        positions = []
        scores = []
        for score, position in zip(top_results.values.tolist(),
                                   top_results.indices.tolist()):
            if score >= min_score:
                positions.append(position)
                scores.append(score)

        if not positions:
            print(f"No results found with score >= {min_score}\n")
            return pd.DataFrame(), []

        # Create results dataframe
        if return_full_data:
            # Return all original columns plus score
            results_df = self.df.iloc[positions].copy()
        else:
            # Return only text column
            results_df = pd.DataFrame({
                self.text_column: self.df.iloc[positions][self.text_column].values
            })

        results_df['similarity_score'] = scores
        results_df['search_rank'] = range(1, len(results_df) + 1)

        # Reorder columns to put score and rank first
        leading = ['search_rank', 'similarity_score']
        results_df = results_df[
            leading + [c for c in results_df.columns if c not in leading]
        ]

        print(f" Found {len(results_df)} results")
        print(f"  Score range: {results_df['similarity_score'].min():.3f} - {results_df['similarity_score'].max():.3f}\n")

        return results_df, positions

    def search(self,
               query: str,
               top_k: int = 100,
               min_score: float = 0.30,
               return_full_data: bool = True) -> pd.DataFrame:
        """
        Search for semantically similar content.

        Args:
            query: Search term (e.g., "burnout")
            top_k: Maximum number of results
            min_score: Minimum cosine similarity a row needs to be returned
            return_full_data: If True, returns all columns from original CSV;
                otherwise only the text column

        Returns:
            DataFrame with 'search_rank' and 'similarity_score' columns first,
            best match first. An empty, column-less DataFrame is returned when
            nothing reaches min_score.

        Raises:
            ValueError: If encode_data() has not been called for the loaded data.
        """
        results_df, _ = self._run_query(query, top_k, min_score, return_full_data)
        return results_df

    def multi_search(self,
                    queries: List[str],
                    top_k: int = 100,
                    min_score: float = 0.25,
                    combine_method: str = 'max',
                    return_full_data: bool = True) -> pd.DataFrame:
        """
        Search using multiple related queries (RECOMMENDED for better results).

        Args:
            queries: List of related search terms
            top_k: Max results per query
            min_score: Minimum similarity score
            combine_method: 'max' takes the highest score a row got across the
                queries; any other value (e.g. 'average') takes the mean
            return_full_data: Return all original columns

        Returns:
            Combined results with one row per matching CSV row, sorted by the
            combined score, plus 'num_queries_matched' and 'matched_queries'
            columns. An empty, column-less DataFrame when nothing matched.

        Raises:
            ValueError: If encode_data() has not been called for the loaded data.
        """
        print(f" Multi-query search ({len(queries)} queries):")
        for q in queries:
            print(f"   • {q}")
        print()

        # Keyed by row position in self.df so that each CSV row is counted
        # once, whatever index the per-query result frames carry.
        all_results = {}

        for query in queries:
            df, positions = self._run_query(query, top_k, min_score,
                                            return_full_data)

            for position, record in zip(positions, df.to_dict('records')):
                hit = all_results.get(position)
                if hit is None:
                    all_results[position] = {
                        'data': record,
                        'scores': [record['similarity_score']],
                        'queries': [query]
                    }
                else:
                    hit['scores'].append(record['similarity_score'])
                    hit['queries'].append(query)

        if not all_results:
            print("✗ No results found across all queries\n")
            return pd.DataFrame()

        # Combine results
        combined = []
        for hit in all_results.values():
            row = dict(hit['data'])

            # Calculate combined score
            if combine_method == 'max':
                row['similarity_score'] = max(hit['scores'])
            else:
                row['similarity_score'] = np.mean(hit['scores'])

            row['num_queries_matched'] = len(hit['queries'])
            row['matched_queries'] = ', '.join(hit['queries'])

            combined.append(row)

        results_df = pd.DataFrame(combined)
        # Stable sort keeps tied rows in first-seen order between runs.
        results_df = results_df.sort_values(
            'similarity_score', ascending=False, kind='stable'
        ).reset_index(drop=True)
        results_df['search_rank'] = range(1, len(results_df) + 1)

        # Reorder columns
        priority_cols = ['search_rank', 'similarity_score', 'num_queries_matched', 'matched_queries']
        other_cols = [c for c in results_df.columns if c not in priority_cols]
        results_df = results_df[priority_cols + other_cols]

        print(f"✓ Combined {len(results_df)} unique results")
        print(f"  Score range: {results_df['similarity_score'].min():.3f} - {results_df['similarity_score'].max():.3f}\n")

        return results_df

    def show_summary(self, results_df: pd.DataFrame, n: int = 10):
        """
        Display a nice summary of search results.

        For each of the first n rows this prints the rank, the score, the
        number of matched queries (multi_search() results only), a preview of
        the text of at most 200 characters, and up to three further columns.

        Args:
            results_df: Results from search() or multi_search()
            n: Number of results to show
        """
        if len(results_df) == 0:
            print("No results to display.")
            return

        print("=" * 100)
        print(f"TOP {min(n, len(results_df))} RESULTS")
        print("=" * 100)

        # Columns that are already shown elsewhere in the summary.
        reserved = [self.text_column, 'similarity_score',
                    'search_rank', 'num_queries_matched', 'matched_queries']

        for _, row in results_df.head(n).iterrows():
            print(f"\n[Rank {row['search_rank']}] Score: {row['similarity_score']:.3f}", end='')

            # `in` on a Series tests its index, i.e. the column names here.
            if 'num_queries_matched' in row:
                print(f" | Matched {row['num_queries_matched']} queries")
            else:
                print()

            # Show text preview
            text = str(row[self.text_column])
            preview = text[:200] + "..." if len(text) > 200 else text
            print(f"   {preview}")

            # Show other relevant columns (first 3 non-score columns)
            display_cols = [c for c in row.index if c not in reserved][:3]
            if display_cols:
                print(f"   Additional data: ", end='')
                info = [f"{col}={row[col]}" for col in display_cols]
                print(", ".join(info))

        print("\n" + "=" * 100)


# ============================================================================
# EXAMPLE USAGE WITH YOUR CSV
# ============================================================================

if __name__ == "__main__":
    # End-to-end demo: load a CSV, embed its text column, run a multi-query
    # burnout search, then save and summarize whatever matched.

    print("=" * 100)
    print("SEMANTIC SIMILARITY SEARCH - CSV VERSION")
    print("=" * 100)
    print()

    # ========================================================================
    # STEP 1: Initialize searcher
    # ========================================================================
    searcher = CSVSemanticSearcher(model_name='all-MiniLM-L6-v2')

    # ========================================================================
    # STEP 2: Load your CSV file
    # ========================================================================
    # CHANGE THESE VALUES TO MATCH YOUR DATA:
    searcher.load_csv(
        csv_path='posts_processed2.csv',      # ← Your CSV file path
        text_column='text'  # ← Your text column name
    )


    # Uncomment and modify as needed:
    # searcher.load_csv(csv_file, text_column=text_col)

    # If your file has encoding issues, try:
    # searcher.load_csv(csv_file, text_column=text_col, encoding='latin-1')

    # If you need to specify delimiter or other options:
    # searcher.load_csv(csv_file, text_column=text_col, sep=';', encoding='utf-8')

    # ========================================================================
    # STEP 3: Encode the data (do this ONCE!)
    # ========================================================================
    searcher.encode_data()

    # ========================================================================
    # STEP 4: Search for burnout-related content
    # ========================================================================

    # Single query search
    # results = searcher.search(
    #     query="burnout",
    #     top_k=100,
    #     min_score=0.30
    # )
    # searcher.show_summary(results, n=10)

    # ========================================================================
    # STEP 5: Multi-query search (RECOMMENDED - better results!)
    # ========================================================================

    burnout_queries = [
        "burnout",
        "exhaustion",
        "work stress",
        "overwhelmed",
        "fatigue",
        "drained",
        "mental health work"
    ]

    results = searcher.multi_search(
        queries=burnout_queries,
        top_k=100,
        min_score=0.25
    )

    searcher.show_summary(results, n=20)

    # When nothing matched, `results` is an empty DataFrame without a
    # 'similarity_score' column, so saving and the score analysis are skipped
    # instead of failing with a KeyError.
    if results.empty:
        print("\nNo matching rows found - nothing to save or analyze.")
    else:
        # ====================================================================
        # STEP 6: Save results
        # ====================================================================

        results.to_csv('burnout_search_results.csv', index=False)
        print("\n Results saved to: burnout_search_results.csv")

        # ====================================================================
        # STEP 7: Analyze results
        # ====================================================================

        scores = results['similarity_score']
        high_count = int((scores > 0.60).sum())
        medium_count = int(((scores >= 0.40) & (scores <= 0.60)).sum())
        low_count = int((scores < 0.40).sum())

        print(f"\n{'='*100}")
        print("ANALYSIS")
        print(f"{'='*100}")
        print(f"Total results found: {len(results)}")
        print(f"Score distribution:")
        print(f"  > 0.60 (high confidence): {high_count}")
        print(f"  0.40-0.60 (medium):       {medium_count}")
        print(f"  < 0.40 (low):             {low_count}")

    print("\n" + "="*100)
    print("INSTRUCTIONS:")
    print("="*100)


SEMANTIC SIMILARITY SEARCH - CSV VERSION

Loading model: all-MiniLM-L6-v2...
✓ Model loaded successfully!



ValueError: No data loaded! Call load_csv() first.