<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/Data_Analyst.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install anthropic
!pip install sentence-transformers
!pip install pandas
!pip install yake

In [None]:
import pandas as pd
import numpy as np
import anthropic
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import logging
from typing import List, Dict, Any, Optional, Tuple
import logging
import traceback

In [None]:
# Read the Anthropic API key from the Colab user secret named 'Agent_Anth'.
# The resulting module-level `api_key` is used further down as the default
# value of the `api_key` parameter of CSVAgent.__init__.
from google.colab import userdata
api_key = userdata.get('Agent_Anth')

In [None]:
# Set up logging: INFO level, each record formatted as
# "timestamp - logger name - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger shared by every class defined in this notebook.
logger = logging.getLogger(__name__)

In [None]:
#Service Classes
class CSVReader:
    """Loads CSV files from disk as pandas dataframes."""

    def read_csv(self, file_path: str) -> Tuple[pd.DataFrame, str]:
        """
        Load a CSV file and describe what was loaded.

        Any failure is logged and reported through the returned message
        instead of being raised; in that case the dataframe is empty.

        Args:
            file_path: Path to the CSV file

        Returns:
            Tuple of (dataframe, message)
        """
        try:
            frame = pd.read_csv(file_path)

            # One "name (dtype)" entry per column, in file order.
            described = []
            for name in frame.columns:
                described.append(f"{name} ({frame[name].dtype})")

            n_rows, n_cols = len(frame), len(frame.columns)
            summary = (
                f"Successfully loaded CSV with {n_rows} rows and "
                f"{n_cols} columns: {', '.join(described)}"
            )
            logger.info(summary)
        except Exception as exc:
            failure = f"Error reading CSV file: {exc}"
            logger.error(failure)
            return pd.DataFrame(), failure

        return frame, summary

class QueryAnalyzer:
    """Tool to analyze queries and determine information needs.

    Sends the user query together with the available column names to an
    Anthropic model and parses the JSON analysis found in the reply.
    """

    def __init__(self, api_key: Optional[str] = None,
                 model: str = "claude-3-haiku-20240307", max_tokens: int = 1000):
        """
        Initialize with Anthropic API key.

        Args:
            api_key: Anthropic API key (will use environment variable if None)
            model: Anthropic model identifier used for the analysis request
            max_tokens: Maximum number of tokens requested for the reply
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            logger.warning("No Anthropic API key provided. Set ANTHROPIC_API_KEY environment variable.")

        self.model = model
        self.max_tokens = max_tokens
        self.client = anthropic.Anthropic(api_key=self.api_key)

    @staticmethod
    def _fallback_analysis(query: str, columns: List[str]) -> Dict[str, Any]:
        """Return the generic analysis used when the model reply is unusable."""
        return {
            # Copy so later edits to the analysis never touch the caller's list.
            "needed_columns": list(columns),
            "analysis_type": "general_info",
            "requires_text_search": False,
            "query": query,
        }

    @staticmethod
    def _extract_json(text: str) -> Optional[Dict[str, Any]]:
        """Decode the JSON object starting at the first '{' in text, or return None."""
        import json

        start = text.find("{")
        if start == -1:
            return None
        try:
            # raw_decode stops at the end of the object, so trailing prose
            # (even prose containing braces) does not break parsing.
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
        except json.JSONDecodeError:
            return None
        return parsed

    def analyze_query(self, query: str, columns: List[str]) -> Dict[str, Any]:
        """
        Analyze the query to determine information needs.

        Never raises: when the request fails or the reply contains no
        decodable JSON object, a generic "general_info" analysis covering
        all columns is returned instead.

        Args:
            query: The user's query
            columns: List of available columns in the dataframe

        Returns:
            Dict with analysis results; the "query" key always holds a
            non-empty value (the original query when the model omitted it)
        """
        try:
            prompt = f"""
            <columns>
            {', '.join(columns)}
            </columns>

            You are an AI assistant that analyzes queries about a dataset. Based on the user query, determine:
            1. Which columns from the dataset are needed to answer the query
            2. What type of analysis is required (filtering, aggregation, etc.)
            3. Whether any specific values or conditions are mentioned

            User query: {query}

            Respond in JSON format like this:
            {{
                "needed_columns": ["column1", "column2"],
                "analysis_type": "one of: filtering, aggregation, sorting, comparison, general_info, semantic_search",
                "filter_conditions": {{"column_name": "filter_value"}},
                "aggregation_function": "one of: count, sum, average, min, max, none",
                "sort_by": "column_name or null",
                "sort_order": "ascending or descending or null",
                "requires_text_search": true/false,
                "search_term": "term to search for in text or null",
                "query": "original query"
            }}

            Make sure all column names exactly match the provided list. If a column is not mentioned or needed, don't include it.
            """

            response = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_tokens,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            analysis_text = response.content[0].text
            print(f'query_analyzer printout: response:{analysis_text}')

            analysis = self._extract_json(analysis_text)
            if analysis is None:
                logger.error("Failed to extract JSON from Claude's response")
                return self._fallback_analysis(query, columns)

            # Downstream code falls back to analysis["query"] as a search term.
            if not analysis.get("query"):
                analysis["query"] = query

            logger.info(f"Query analysis completed: {str(analysis)}")
            return analysis

        except Exception as e:
            logger.error(f"Error analyzing query: {str(e)}")
            return self._fallback_analysis(query, columns)

class ColumnSelector:
    """Tool to determine which columns are needed for a query."""

    def select_columns(self, df: pd.DataFrame, analysis: Dict[str, Any]) -> List[str]:
        """
        Select columns needed to answer the query.

        Candidates are gathered in this order: the analysis' "needed_columns",
        the "text" column when a text search is required, the keys of
        "filter_conditions", and the "sort_by" column. Names that do not exist
        in the dataframe are dropped and duplicates are removed. The analysis
        dict is not modified.

        Args:
            df: The dataframe
            analysis: Query analysis results

        Returns:
            List of column names to use (all columns when nothing usable
            was requested)
        """
        all_columns = df.columns.tolist()

        # JSON null arrives as None, so fall back with `or` rather than a
        # .get() default; copy so the caller's list is never mutated.
        requested = analysis.get("needed_columns") or []
        if isinstance(requested, str):
            requested = [requested]
        candidates = list(requested)

        if analysis.get("requires_text_search", False):
            candidates.append("text")

        filter_conditions = analysis.get("filter_conditions") or {}
        if isinstance(filter_conditions, dict):
            candidates.extend(filter_conditions)

        sort_by = analysis.get("sort_by")
        if sort_by:
            candidates.append(sort_by)

        needed_columns = []
        unknown = []
        for col in candidates:
            # List membership (not a set) so unhashable junk cannot raise.
            if col not in all_columns:
                unknown.append(col)
            elif col not in needed_columns:
                needed_columns.append(col)

        if unknown:
            logger.warning(f"Ignoring columns not present in the data: {unknown}")

        # If no columns were determined, return all columns
        if not needed_columns:
            logger.warning("No specific columns determined, using all columns")
            needed_columns = all_columns

        joined = ", ".join(map(str, needed_columns))
        logger.info(f"Selected columns: {joined}")
        print(f"Column Selector Printout: Selected columns: {joined}")
        return needed_columns

class DataExtractor:
    """Tool to extract relevant data from the dataframe."""

    # Aggregation names used in the query analysis -> pandas aggregation names.
    _AGGREGATIONS = {
        "count": "count",
        "sum": "sum",
        "average": "mean",
        "min": "min",
        "max": "max",
    }

    def extract_data(self, df: pd.DataFrame, analysis: Dict[str, Any], selected_columns: List[str]) -> pd.DataFrame:
        """
        Extract relevant data based on query analysis.

        Steps, in order: filter rows (only for the "filtering" and
        "comparison" analysis types), keep the selected columns, sort, and
        aggregate (only for the "aggregation" analysis type). Grouping uses
        the first selected column.

        Args:
            df: The dataframe
            analysis: Query analysis results
            selected_columns: Columns to include

        Returns:
            Filtered dataframe. If any step raises, the error is logged and
            the unfiltered selected columns are returned instead.
        """
        # Selected names missing from df would raise KeyError both in the main
        # path and in the error fallback, so drop them up front.
        columns = [col for col in selected_columns if col in df.columns]
        if not columns:
            columns = df.columns.tolist()

        try:
            analysis_type = analysis.get("analysis_type")

            # Filter on the full frame so a filter column does not have to be
            # part of the selection.
            filtered = df
            if analysis_type in ("filtering", "comparison"):
                filtered = self._apply_filters(df, analysis.get("filter_conditions"))

            result_df = filtered[columns].copy()
            result_df = self._apply_sort(
                result_df, analysis.get("sort_by"), analysis.get("sort_order")
            )

            if analysis_type == "aggregation":
                result_df = self._apply_aggregation(
                    result_df, analysis.get("aggregation_function"), columns[0]
                )

            logger.info(f"Extracted {len(result_df)} rows of data")
            print(f"Data Extractor Printout: {result_df}")
            return result_df

        except Exception as e:
            logger.error(f"Error extracting data: {str(e)}")
            # Return original data with selected columns
            return df[columns].copy()

    @staticmethod
    def _apply_filters(frame: pd.DataFrame, conditions: Any) -> pd.DataFrame:
        """Keep the rows of frame that satisfy every condition on a known column."""
        if not isinstance(conditions, dict):
            return frame

        for col, value in conditions.items():
            if col not in frame.columns:
                continue
            series = frame[col]
            if isinstance(value, dict):
                # Range or comparison filter
                if "min" in value and "max" in value:
                    mask = (series >= value["min"]) & (series <= value["max"])
                elif "min" in value:
                    mask = series >= value["min"]
                elif "max" in value:
                    mask = series <= value["max"]
                elif "not_equal" in value:
                    mask = series != value["not_equal"]
                else:
                    continue
            elif isinstance(value, list):
                # List of values
                mask = series.isin(value)
            else:
                # Simple equality
                mask = series == value
            frame = frame[mask]
        return frame

    @staticmethod
    def _apply_sort(frame: pd.DataFrame, sort_by: Any, sort_order: Any) -> pd.DataFrame:
        """Sort frame by sort_by; ascending unless sort_order is "descending"."""
        if not sort_by or sort_by not in frame.columns:
            return frame
        # sort_order is often JSON null (None); treat anything that is not
        # the string "descending" as ascending.
        descending = isinstance(sort_order, str) and sort_order.strip().lower() == "descending"
        return frame.sort_values(by=sort_by, ascending=not descending)

    @classmethod
    def _apply_aggregation(cls, frame: pd.DataFrame, agg_function: Any, group_col: Any) -> pd.DataFrame:
        """Aggregate the first usable numeric column, grouped by group_col."""
        method = cls._AGGREGATIONS.get(agg_function) if isinstance(agg_function, str) else None
        if method is None:
            return frame

        numeric_cols = [
            col for col in frame.columns
            if col != "text" and pd.api.types.is_numeric_dtype(frame[col])
        ]
        if not numeric_cols:
            return frame

        # Grouping a column by itself raises in pandas, so the value column
        # must differ from the group key.
        value_cols = [col for col in numeric_cols if col != group_col]
        if value_cols:
            return frame.groupby(group_col)[value_cols[0]].agg(method).reset_index()

        # The only numeric column is the group key itself: aggregate the whole
        # column into a single-row frame.
        agg_col = numeric_cols[0]
        return pd.DataFrame({agg_col: [frame[agg_col].agg(method)]})

class TextEmbedder:
    """Tool to generate and search text embeddings."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize with embedding model.

        Args:
            model_name: Name of the embedding model
        """
        self.model = SentenceTransformer(model_name)
        logger.info(f"Initialized embedding model: {model_name}")

    def search_similar_texts(self, df: pd.DataFrame, query: str, text_column: str = "text",
                            top_k: int = 5) -> pd.DataFrame:
        """
        Find texts most similar to the query.

        Rows are ranked by cosine similarity between the embedding of the
        query and the embedding of each row's text.

        Args:
            df: Dataframe with text column
            query: Search query
            text_column: Column containing text
            top_k: Number of results to return

        Returns:
            Copy of the top_k most similar rows with an added
            "similarity_score" column. The input dataframe is returned
            unchanged when the text column is missing, the dataframe is
            empty, the query is empty or not a string, or the search fails.
        """
        if text_column not in df.columns:
            logger.error(f"Text column '{text_column}' not found in dataframe")
            return df

        # Nothing to rank, or nothing to rank against: skip the model entirely.
        if df.empty or not isinstance(query, str) or not query.strip():
            logger.warning("Skipping similarity search: empty dataframe or empty query")
            return df

        try:
            # Generate embeddings; missing values become empty strings and
            # non-string cells are stringified so the encoder gets text only.
            texts = df[text_column].fillna("").astype(str).tolist()
            logger.debug(f"Encoding {len(texts)} texts for similarity search")
            text_embeddings = self.model.encode(texts)
            query_embedding = self.model.encode(query)

            # Calculate similarities (one row of scores for the single query)
            similarities = cosine_similarity(
                query_embedding.reshape(1, -1),
                text_embeddings
            )[0]

            # Add similarity scores to a copy so the caller's frame is untouched
            result_df = df.copy()
            result_df["similarity_score"] = similarities

            # Sort by similarity and take top_k
            result_df = result_df.sort_values("similarity_score", ascending=False).head(top_k)

            logger.info(f"Found {len(result_df)} similar texts")
            print(f"search_similar_texts printout: {result_df}")
            return result_df

        except Exception as e:
            # logger.exception records the full traceback alongside the message.
            logger.exception(f"Error searching similar texts: {str(e)}")
            return df
class AnswerGenerator:
    """Tool to generate answers with an Anthropic Claude model."""

    def __init__(self, api_key: Optional[str] = None,
                 model: str = "claude-3-haiku-20240307", max_tokens: int = 2000):
        """
        Initialize with Anthropic API key.

        Args:
            api_key: Anthropic API key (will use environment variable if None)
            model: Anthropic model identifier used to generate answers
            max_tokens: Maximum number of tokens requested for the answer
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            logger.warning("No Anthropic API key provided. Set ANTHROPIC_API_KEY environment variable.")

        self.model = model
        self.max_tokens = max_tokens
        self.client = anthropic.Anthropic(api_key=self.api_key)

    def generate_answer(self, query: str, data_df: pd.DataFrame, analysis: Dict[str, Any]) -> str:
        """
        Generate an answer with the configured Claude model.

        The whole dataframe is rendered as text and embedded in the prompt
        together with the query analysis. Never raises: on failure the error
        is logged and an explanatory message is returned as the answer.

        Args:
            query: User query
            data_df: Dataframe with relevant data
            analysis: Query analysis results

        Returns:
            Generated answer
        """
        try:
            # Convert dataframe to string representation
            data_str = data_df.to_string(index=False) if not data_df.empty else "No data found"

            # Create prompt for Claude
            prompt = f"""
            <data>
            {data_str}
            </data>

            <query_analysis>
            {str(analysis)}
            </query_analysis>

            User query: {query}

            Based on the provided data and analysis of the query, please provide a comprehensive answer to the user's question.
            Include specific details from the data where appropriate. If the data doesn't contain information needed to answer the query,
            state that clearly.

            Answer the query directly and concisely. If appropriate, include any relevant statistics from the data.
            """

            response = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_tokens,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            answer = response.content[0].text
            logger.info(f"Generated answer of length {len(answer)}")
            return answer

        except Exception as e:
            logger.error(f"Error generating answer: {str(e)}")
            return f"I encountered an error while generating the answer: {str(e)}"

In [None]:
#logger = logging.getLogger(__name__)

class CSVAgent:
    """Agent that analyzes CSV data to answer queries."""

    def __init__(self, csv_path, api_key=api_key):
        """
        Initialize the agent.

        Builds every tool and loads the CSV immediately. A file that cannot
        be read leaves the agent with an empty dataframe; process_query then
        reports the problem instead of raising.

        Args:
            csv_path: Path to the CSV file
            api_key: Anthropic API key (defaults to the module-level api_key)
        """
        self.csv_path = csv_path
        self.api_key = api_key

        # Initialize tools
        self.csv_reader = CSVReader()
        self.query_analyzer = QueryAnalyzer(api_key=self.api_key)
        self.column_selector = ColumnSelector()
        self.data_extractor = DataExtractor()
        self.text_embedder = TextEmbedder()
        self.answer_generator = AnswerGenerator(api_key=self.api_key)

        # Load CSV data. read_csv already logs its own outcome, so the message
        # is kept for inspection rather than logged a second time.
        self.df, self.load_message = self.csv_reader.read_csv(csv_path)

        # Store column information
        self.columns = list(self.df.columns) if not self.df.empty else []

    def process_query(self, query):
        """
        Process a user query and generate an answer.

        Pipeline: analyze the query, select columns, extract data, optionally
        rank rows by text similarity, then generate the answer.

        Args:
            query: User query string

        Returns:
            dict: Always contains "answer" and "success". On success it also
            contains "columns_analyzed", "rows_analyzed" and "analysis_type".
        """
        logger.info(f"Processing query: {query}")

        if self.df.empty:
            return {
                "answer": "Unable to analyze the CSV file. Please check the file path and format.",
                "success": False
            }

        try:
            # Step 1: Analyze the query (pass a copy so the stored column list
            # can never be modified through the analysis)
            analysis = self.query_analyzer.analyze_query(query, list(self.columns))

            # Step 2: Select relevant columns
            selected_columns = self.column_selector.select_columns(self.df, analysis)

            # Step 3: Extract relevant data
            extracted_data = self.data_extractor.extract_data(self.df, analysis, selected_columns)

            # Step 4: Apply text search if needed
            if analysis.get("requires_text_search", False) and "text" in self.columns:
                # Prefer the extracted search term, then the query echoed by
                # the analysis, then the raw query, so it is never None.
                search_term = analysis.get("search_term") or analysis.get("query") or query
                extracted_data = self.text_embedder.search_similar_texts(
                    extracted_data,
                    search_term,
                    text_column="text",
                    top_k=10
                )

            # Step 5: Generate answer
            answer = self.answer_generator.generate_answer(query, extracted_data, analysis)

            return {
                "answer": answer,
                "columns_analyzed": selected_columns,
                "rows_analyzed": len(extracted_data),
                "analysis_type": analysis.get("analysis_type", "unknown"),
                "success": True
            }
        except Exception as e:
            logger.error(f"Error processing query: {str(e)}")
            return {
                "answer": f"An error occurred while processing your query: {str(e)}",
                "success": False
            }

In [None]:
def setup_logging(log_file: str = "csv_agent.log"):
    """
    Set up logging configuration.

    Configures the root logger at INFO level with one handler writing to
    log_file and one writing to the console stream.

    Args:
        log_file: Path of the log file to write to
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ],
        # basicConfig does nothing when the root logger already has handlers
        # (it is configured earlier in this notebook); force=True replaces
        # them so the file handler is really attached.
        force=True,
    )

# Apply the logging configuration for the remaining cells.
setup_logging()

In [None]:
%%capture
# Initialize the agent: the constructor loads 'alpha_test.csv' and builds all
# tools; api_key is not passed, so CSVAgent's default value is used.
agent = CSVAgent('alpha_test.csv')

Example queries:

"Please summarise the texts about the Caldera Ginger Beer with number_appearance 4"

In [None]:
# Natural-language question passed to the agent.
query = " Please find all beers that have the number_apperance 2.5 and return their names and the summary of texts for these beers"

# Run the full pipeline; the result is a dict that always has "answer" and "success".
result = agent.process_query(query)

print(f"\nAnswer: {result['answer']}\n")
# The analysis details are only present in the result when processing succeeded.
if result['success']:
    print(f"Analysis type: {result['analysis_type']}")
    print(f"Columns analyzed: {', '.join(result['columns_analyzed'])}")
    print(f"Rows analyzed: {result['rows_analyzed']}")


query_analyzer printout: response:{
    "needed_columns": ["string_name", "text"],
    "analysis_type": "filtering",
    "filter_conditions": {"number_appearance": 2.5},
    "aggregation_function": "none",
    "sort_by": null,
    "sort_order": null,
    "requires_text_search": false,
    "search_term": null,
    "query": "Please find all beers that have the number_apperance 2.5 and return their names and the summary of texts for these beers"
}
Column Selector Printout: Selected columns: string_name, text, number_appearance
Data Extractor Printout:                    string_name  \
0                 Sausa Weizen   
14       Caldera Oatmeal Stout   
15       Caldera Oatmeal Stout   
76            Caldera Pale Ale   
190           Caldera Pale Ale   
200           Caldera Pale Ale   
257           Vas Deferens Ale   
272  Old Growth Imperial Stout   

                                                  text  number_appearance  
0    A lot of foam. But a lot.\tIn the smell some b...        

# Code Refactoring

In [None]:
#Service Classes
class CSVReader:
    """Loads CSV files from disk as pandas dataframes."""

    def read_csv(self, file_path: str) -> Tuple[pd.DataFrame, str]:
        """
        Load a CSV file and describe what was loaded.

        Any failure is logged and reported through the returned message
        instead of being raised; in that case the dataframe is empty.

        Args:
            file_path: Path to the CSV file

        Returns:
            Tuple of (dataframe, message)
        """
        try:
            frame = pd.read_csv(file_path)

            # One "name (dtype)" entry per column, in file order.
            described = []
            for name in frame.columns:
                described.append(f"{name} ({frame[name].dtype})")

            n_rows, n_cols = len(frame), len(frame.columns)
            summary = (
                f"Successfully loaded CSV with {n_rows} rows and "
                f"{n_cols} columns: {', '.join(described)}"
            )
            logger.info(summary)
        except Exception as exc:
            failure = f"Error reading CSV file: {exc}"
            logger.error(failure)
            return pd.DataFrame(), failure

        return frame, summary

class Workflow:
    """
    Tools to analyze queries and determine information needs;
          to determine which columns are needed for a query;
          to extract relevant data from the dataframe;
          to generate and search text embeddings;
          to generate answers using Claude 3.7."""

    def __init__(self, api_key: Optional[str] = None, model_name: str = "all-MiniLM-L6-v2"):
        """
        Set up the Anthropic client and the sentence-embedding model.

        Args:
            api_key: Anthropic API key; when falsy, the ANTHROPIC_API_KEY
                environment variable is used instead
            model_name: Name of the SentenceTransformer embedding model to load
        """
        resolved_key = api_key if api_key else os.environ.get("ANTHROPIC_API_KEY")
        self.api_key = resolved_key

        self.client = anthropic.Anthropic(api_key=resolved_key)
        self.model = SentenceTransformer(model_name)


    @staticmethod
    def analyze_query(query: str, columns: List[str], api_key = api_key) -> Dict[str, Any]:
        """
        Analyze the query to determine information needs.

        Declared as a static method because the signature takes no instance:
        it can be called on the class or on an instance, and it builds its
        own Anthropic client from api_key. Never raises: when the request
        fails or the reply contains no decodable JSON object, a generic
        "general_info" analysis covering all columns is returned.

        Args:
            query: The user's query
            columns: List of available columns in the dataframe
            api_key: Anthropic API key (defaults to the module-level api_key)

        Returns:
            Dict with analysis results
        """
        import json

        fallback = {
            # Copy so later edits to the analysis never touch the caller's list.
            "needed_columns": list(columns),
            "analysis_type": "general_info",
            "requires_text_search": False
        }

        try:
            prompt = f"""
                <columns>
                {', '.join(columns)}
                </columns>

                You are an AI assistant that analyzes queries about a dataset. Based on the user query, determine:
                1. Which columns from the dataset are needed to answer the query
                2. What type of analysis is required (filtering, aggregation, etc.)
                3. Whether any specific values or conditions are mentioned

                User query: {query}

                Respond in JSON format like this:
                {{
                    "needed_columns": ["column1", "column2"],
                    "analysis_type": "one of: filtering, aggregation, sorting, comparison, general_info, semantic_search",
                    "filter_conditions": {{"column_name": "filter_value"}},
                    "aggregation_function": "one of: count, sum, average, min, max, none",
                    "sort_by": "column_name or null",
                    "sort_order": "ascending or descending or null",
                    "requires_text_search": true/false,
                    "search_term": "term to search for in text or null",
                    "query": "original query"
                }}

                Make sure all column names exactly match the provided list. If a column is not mentioned or needed, don't include it.
                """
            # The Anthropic client takes api_key as a keyword argument.
            client = anthropic.Anthropic(api_key=api_key)
            response = client.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=1000,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            analysis_text = response.content[0].text
            print(f'query_analyzer printout: response:{analysis_text}')

            # Extract JSON from response: decode the object that starts at the
            # first '{'. raw_decode stops at the end of that object, so
            # trailing prose does not break parsing.
            start = analysis_text.find("{")
            if start == -1:
                logger.error("Failed to extract JSON from Claude's response")
                return fallback

            analysis, _ = json.JSONDecoder().raw_decode(analysis_text, start)
            logger.info(f"Query analysis completed: {str(analysis)}")
            return analysis

        except Exception as e:
            logger.error(f"Error analyzing query: {str(e)}")
            return fallback

    @staticmethod
    def select_columns(df: pd.DataFrame, analysis: Dict[str, Any]) -> List[str]:
        """
        Select columns needed to answer the query.

        Declared as a static method because the signature takes no instance.
        Candidates are gathered in this order: the analysis' "needed_columns",
        the "text" column when a text search is required, the keys of
        "filter_conditions", and the "sort_by" column. Names that do not exist
        in the dataframe are dropped and duplicates are removed. The analysis
        dict is not modified.

        Args:
            df: The dataframe
            analysis: Query analysis results

        Returns:
            List of column names to use (all columns when nothing usable
            was requested)
        """
        all_columns = df.columns.tolist()

        # JSON null arrives as None, so fall back with `or` rather than a
        # .get() default; copy so the caller's list is never mutated.
        requested = analysis.get("needed_columns") or []
        if isinstance(requested, str):
            requested = [requested]
        candidates = list(requested)

        if analysis.get("requires_text_search", False):
            candidates.append("text")

        filter_conditions = analysis.get("filter_conditions") or {}
        if isinstance(filter_conditions, dict):
            candidates.extend(filter_conditions)

        sort_by = analysis.get("sort_by")
        if sort_by:
            candidates.append(sort_by)

        needed_columns = []
        unknown = []
        for col in candidates:
            # List membership (not a set) so unhashable junk cannot raise.
            if col not in all_columns:
                unknown.append(col)
            elif col not in needed_columns:
                needed_columns.append(col)

        if unknown:
            logger.warning(f"Ignoring columns not present in the data: {unknown}")

        # If no columns were determined, return all columns
        if not needed_columns:
            logger.warning("No specific columns determined, using all columns")
            needed_columns = all_columns

        joined = ", ".join(map(str, needed_columns))
        logger.info(f"Selected columns: {joined}")
        print(f"Column Selector Printout: Selected columns: {joined}")
        return needed_columns

    @staticmethod
    def extract_data(df: pd.DataFrame, analysis: Dict[str, Any], selected_columns: List[str]) -> pd.DataFrame:
        """
        Extract relevant data based on query analysis.

        Declared as a static method because the signature takes no instance.
        Steps, in order: filter rows (only for the "filtering" and
        "comparison" analysis types), keep the selected columns, sort, and
        aggregate (only for the "aggregation" analysis type). Grouping uses
        the first selected column.

        Args:
            df: The dataframe
            analysis: Query analysis results
            selected_columns: Columns to include

        Returns:
            Filtered dataframe. If any step raises, the error is logged and
            the unfiltered selected columns are returned instead.
        """
        # Aggregation names used in the query analysis -> pandas aggregation names.
        aggregations = {
            "count": "count",
            "sum": "sum",
            "average": "mean",
            "min": "min",
            "max": "max",
        }

        # Selected names missing from df would raise KeyError both in the main
        # path and in the error fallback, so drop them up front.
        columns = [col for col in selected_columns if col in df.columns]
        if not columns:
            columns = df.columns.tolist()

        try:
            analysis_type = analysis.get("analysis_type")

            # Apply filtering if specified. Filter on the full frame so a
            # filter column does not have to be part of the selection.
            filtered = df
            filter_conditions = analysis.get("filter_conditions")
            if isinstance(filter_conditions, dict) and analysis_type in ("filtering", "comparison"):
                for col, value in filter_conditions.items():
                    if col not in filtered.columns:
                        continue
                    series = filtered[col]
                    if isinstance(value, dict):
                        # Range or comparison filter
                        if "min" in value and "max" in value:
                            mask = (series >= value["min"]) & (series <= value["max"])
                        elif "min" in value:
                            mask = series >= value["min"]
                        elif "max" in value:
                            mask = series <= value["max"]
                        elif "not_equal" in value:
                            mask = series != value["not_equal"]
                        else:
                            continue
                    elif isinstance(value, list):
                        # List of values
                        mask = series.isin(value)
                    else:
                        # Simple equality
                        mask = series == value
                    filtered = filtered[mask]

            result_df = filtered[columns].copy()

            # Apply sorting if specified. sort_order is often JSON null
            # (None); anything other than "descending" sorts ascending.
            sort_by = analysis.get("sort_by")
            sort_order = analysis.get("sort_order")
            if sort_by and sort_by in result_df.columns:
                descending = isinstance(sort_order, str) and sort_order.strip().lower() == "descending"
                result_df = result_df.sort_values(by=sort_by, ascending=not descending)

            # Apply aggregation if specified
            agg_function = analysis.get("aggregation_function")
            method = aggregations.get(agg_function) if isinstance(agg_function, str) else None
            if method is not None and analysis_type == "aggregation":
                group_col = columns[0]
                numeric_cols = [
                    col for col in result_df.columns
                    if col != "text" and pd.api.types.is_numeric_dtype(result_df[col])
                ]
                # Grouping a column by itself raises in pandas, so the value
                # column must differ from the group key.
                value_cols = [col for col in numeric_cols if col != group_col]
                if value_cols:
                    result_df = result_df.groupby(group_col)[value_cols[0]].agg(method).reset_index()
                elif numeric_cols:
                    # The only numeric column is the group key itself:
                    # aggregate the whole column into a single-row frame.
                    agg_col = numeric_cols[0]
                    result_df = pd.DataFrame({agg_col: [result_df[agg_col].agg(method)]})

            logger.info(f"Extracted {len(result_df)} rows of data")
            print(f"Data Extractor Printout: {result_df}")
            return result_df

        except Exception as e:
            logger.error(f"Error extracting data: {str(e)}")
            # Return original data with selected columns
            return df[columns].copy()



    def search_similar_texts(self, df: pd.DataFrame, query: str,
                             model: Optional[SentenceTransformer] = None,
                             text_column: str = "text", top_k: int = 5) -> pd.DataFrame:
        """
        Find the rows whose text is most similar to the query.

        The texts and the query are embedded with the same model and ranked by
        cosine similarity.

        Args:
            df: Dataframe with text column
            query: Search query
            model: Embedding model exposing ``encode``; ``self.model`` is used
                when this is None
            text_column: Column containing text
            top_k: Number of results to return

        Returns:
            A copy of the top_k best-matching rows with an added
            "similarity_score" column, sorted by descending score. The input
            dataframe is returned unchanged when the text column is missing,
            when there is nothing to search for, or when embedding fails.
        """
        if text_column not in df.columns:
            logger.error(f"Text column '{text_column}' not found in dataframe")
            return df

        # An empty frame or an empty/None query cannot be ranked; skip the
        # embedding calls instead of letting them fail.
        if df.empty or not query:
            logger.warning("Skipping similarity search: no rows or no search query")
            return df

        try:
            encoder = model if model is not None else self.model

            # Missing texts are embedded as empty strings so every row gets a score
            texts = df[text_column].fillna("").tolist()
            text_embeddings = encoder.encode(texts)
            query_embedding = encoder.encode(query)

            # cosine_similarity expects 2-D inputs; [0] selects the single query row
            similarities = cosine_similarity(
                query_embedding.reshape(1, -1),
                text_embeddings
            )[0]

            # Score a copy so the caller's dataframe is not modified
            result_df = df.copy()
            result_df["similarity_score"] = similarities
            result_df = result_df.sort_values("similarity_score", ascending=False).head(top_k)

            logger.info(f"Found {len(result_df)} similar texts")
            return result_df

        except Exception as e:
            # Best effort: log the full traceback and fall back to the unranked data
            logger.exception(f"Error searching similar texts: {str(e)}")
            return df

    def generate_answer(self, query: str, data_df: pd.DataFrame, analysis: Dict[str, Any],
                        client: Optional[anthropic.Anthropic] = None) -> str:
        """
        Generate an answer to the query with Claude (claude-3-haiku-20240307).

        Args:
            query: User query
            data_df: Dataframe with relevant data
            analysis: Query analysis results
            client: Anthropic client to use; ``self.client`` is used when this
                is None

        Returns:
            Generated answer, or a message describing the error if the
            request fails
        """
        try:
            llm_client = client if client is not None else self.client

            # Convert dataframe to string representation
            data_str = data_df.to_string(index=False) if not data_df.empty else "No data found"

            # Create prompt for Claude. The prompt text keeps its original
            # leading whitespace so the request sent to the model is unchanged.
            prompt = f"""
                <data>
                {data_str}
                </data>

                <query_analysis>
                {str(analysis)}
                </query_analysis>

                User query: {query}

                Based on the provided data and analysis of the query, please provide a comprehensive answer to the user's question.
                Include specific details from the data where appropriate. If the data doesn't contain information needed to answer the query,
                state that clearly.

                Answer the query directly and concisely. If appropriate, include any relevant statistics from the data.
                """

            response = llm_client.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=2000,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            # The reply is a list of content blocks; the answer is the first block's text
            answer = response.content[0].text
            logger.info(f"Generated answer of length {len(answer)}")
            return answer

        except Exception as e:
            logger.error(f"Error generating answer: {str(e)}")
            return f"I encountered an error while generating the answer: {str(e)}"

In [None]:
# Potential new tools:
# 1. Concept Extraction
import yake
# YAKE Config
# Extractor built with the library defaults; the corpus keywords below are
# produced by `custom_kw_extractor`, which is configured from these settings.
kw_extractor = yake.KeywordExtractor()
language = 'en'  # passed to YAKE as `lan`
max_ngram_size = 2  # passed to YAKE as `n`
deduplication_threshold = 0.9  # passed to YAKE as `dedupLim`
numOfKeywords = 50  # passed to YAKE as `top`
#get the document corpus (assumes that the text is in the "text" column):

def text_input(file = 'alpha_test.csv'):
    """Load a CSV file and return its "text" column as a list of tab-free strings.

    Rows whose "text" value is not a string (for example missing values) are
    dropped; every tab in the remaining values is replaced by one space.
    """
    frame = pd.read_csv(file)
    is_text = frame['text'].apply(lambda value: isinstance(value, str))
    text_rows = frame[is_text]
    return [entry.replace("\t", " ") for entry in text_rows['text']]


#Keyword for the corpus a.k.a Global Concepts
# NOTE(review): in the visible part of the notebook `corpus` is only assigned in
# a later cell (`corpus = ' '.join(texts_cleaned)`) - confirm it is defined
# before this cell runs.
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
# The entries are indexed with [0] by keywords_number() to obtain the keyword.
keywords = custom_kw_extractor.extract_keywords(corpus)
#select a number of keywords to work with
def keywords_number(n = len(keywords), input=keywords):
    """Return element ``[0]`` of each of the first ``n`` entries of ``input``.

    Both defaults are evaluated once, when the function is defined, so they
    reflect the value of ``keywords`` at that moment.
    """
    leading_entries = input[:n]
    return [entry[0] for entry in leading_entries]

In [None]:
#logger = logging.getLogger(__name__)

class CSVAgent_Rfct:
    """Agent that analyzes CSV data to answer queries.

    The query pipeline (query analysis, column selection, data extraction,
    optional text-similarity search, answer generation) is delegated to a
    Workflow instance.
    """

    def __init__(self, csv_path, api_key=api_key):
        """
        Initialize the agent and load the CSV file.

        Args:
            csv_path: Path to the CSV file
            api_key: Anthropic API key (optional; defaults to the module-level
                key as it was when the class was defined)
        """
        self.csv_path = csv_path
        self.api_key = api_key

        # Refactoring prep: process_query drives everything through the workflow
        self.workflow = Workflow(api_key=self.api_key)

        # Initialize tools
        self.csv_reader = CSVReader()
        self.query_analyzer = QueryAnalyzer(api_key=self.api_key)
        self.column_selector = ColumnSelector()
        self.data_extractor = DataExtractor()
        self.text_embedder = TextEmbedder()
        self.answer_generator = AnswerGenerator(api_key=self.api_key)

        # Load CSV data. read_csv logs the outcome itself (info on success,
        # error on failure), so the message is not logged a second time here.
        self.df, _ = self.csv_reader.read_csv(csv_path)

        # Store column information
        self.columns = list(self.df.columns) if not self.df.empty else []

    def process_query(self, query):
        """
        Process a user query and generate an answer.

        Args:
            query: User query string

        Returns:
            dict: On success, the answer plus the columns analyzed, the number
            of rows analyzed and the analysis type, with "success" True. On
            failure, an explanatory "answer" with "success" False.
        """
        logger.info(f"Processing query: {query}")

        # An empty dataframe means the CSV could not be loaded (or has no rows)
        if self.df.empty:
            return {
                "answer": "Unable to analyze the CSV file. Please check the file path and format.",
                "success": False
            }

        try:
            # Step 1: Analyze the query
            analysis = self.workflow.analyze_query(query, self.columns)

            # Step 2: Select relevant columns
            selected_columns = self.workflow.select_columns(self.df, analysis)

            # Step 3: Extract relevant data
            extracted_data = self.workflow.extract_data(self.df, analysis, selected_columns)

            # Step 4: Apply text search if needed
            if analysis.get("requires_text_search", False) and "text" in self.columns:
                # Prefer the analyzer's search term, then its "query" entry, and
                # finally the raw user query so the search never receives None.
                search_term = analysis.get("search_term") or analysis.get("query") or query
                extracted_data = self.workflow.search_similar_texts(
                    extracted_data,
                    search_term,
                    text_column="text",
                    top_k=10
                )

            # Step 5: Generate answer
            answer = self.workflow.generate_answer(query, extracted_data, analysis)

            return {
                "answer": answer,
                "columns_analyzed": selected_columns,
                "rows_analyzed": len(extracted_data),
                "analysis_type": analysis.get("analysis_type", "unknown"),
                "success": True
            }
        except Exception as e:
            # Top-level boundary: keep the traceback in the log, return a readable error
            logger.exception(f"Error processing query: {str(e)}")
            return {
                "answer": f"An error occurred while processing your query: {str(e)}",
                "success": False
            }

In [None]:
%%capture
# Initialize the agent
# The constructor reads 'alpha_test.csv' immediately and uses the default API key.
agent = CSVAgent_Rfct('alpha_test.csv')

# Experimenting with "Thinking" Tools

In [30]:
%%capture
!pip install anthropic

In [15]:
import pandas as pd

In [None]:
import pandas as pd
import openai
from openai import OpenAI
import anthropic

In [5]:
from google.colab import userdata
# One API key per provider, looked up by secret name through Colab's userdata.
# NOTE(review): the Anthropic secret is requested as 'Antropic' - confirm this
# matches the name under which the secret is actually stored.
api_key_openAI = userdata.get('OpenAI')
api_key_anthropic = userdata.get('Antropic')
api_key_gemini = userdata.get('google')

In [16]:
#Dataset to be used:
# NOTE(review): this `df` is not referenced again in the visible cells;
# text_input() reads the file itself - confirm whether this load is still needed.
df = pd.read_csv('Mejuri_texts.csv')
#Text cleanup
def text_input(file = 'alpha_test.csv'):
    """Load a CSV file and return its "Text" column as a list of tab-free strings.

    Rows whose "Text" value is not a string (for example missing values) are
    dropped; every tab in the remaining values is replaced by one space.
    """
    frame = pd.read_csv(file)
    is_text = frame['Text'].apply(lambda value: isinstance(value, str))
    text_rows = frame[is_text]
    return [entry.replace("\t", " ") for entry in text_rows['Text']]

In [17]:
# Cleaned review texts (string rows only, tabs replaced by spaces)
texts_cleaned = text_input('Mejuri_texts.csv')
# Single document made of all texts separated by one space
corpus = ' '.join(texts_cleaned)

In [31]:
# Truncated corpus: at most the first 200000 whitespace-separated tokens,
# re-joined with single spaces.
test1 = ' '.join(corpus.split()[:200000])

In [21]:
#Thinking Tool Gemini
import google.generativeai as genai

# Configure the API key
genai.configure(api_key=api_key_gemini)

# Initialize the Gemini 2.0 Flash model
model_gemini = genai.GenerativeModel('gemini-2.0-flash')

# Shared by think_then_answer_gemini (both of its calls) and by the
# summarization cell, so the 256-token cap applies to each of those responses.
generation_config = {
    "temperature": 0.7,  # Controls the randomness of the output (0.0 - 1.0)
    "max_output_tokens": 256,  # Limits the maximum number of tokens in the generated response
    # You can also include other parameters here like top_p, top_k, stop_sequences, etc.
}


def think_then_answer_gemini(question: str, model=model_gemini) -> dict:
    """
    Run a two-step 'think then answer' exchange against a Gemini model.

    The first request asks for step-by-step reasoning about the question; the
    second request asks for a final answer and is given only that reasoning.

    Parameters:
        question (str): The user question.
        model: Object exposing ``generate_content`` (defaults to ``model_gemini``).

    Returns:
        dict: A dictionary with 'thoughts' and 'final_answer'.
    """

    def _ask(prompt_text):
        # Both steps use the module-level generation_config.
        reply = model.generate_content(
            contents=prompt_text,
            generation_config=generation_config
        )
        return reply.text

    # Step 1: Model thinks
    thoughts = _ask(f"""You're a thoughtful assistant. Before answering the user's question, write out your reasoning step by step.
Question: {question}
Your internal thoughts:""")

    # Step 2: Model answers using its own thoughts
    final_answer = _ask(f"""Based on your reasoning below, provide a clear and final answer to the user's question.
Reasoning: {thoughts}
Final Answer:""")

    return {"thoughts": thoughts, "final_answer": final_answer}

In [32]:
# Summarization request over the truncated corpus (test1), answered with the
# two-step Gemini helper.
question = "could you provide me a summary of the following text: " + test1
answer = think_then_answer_gemini(question)

In [33]:
answer

{'thoughts': "Okay, I need to summarize a lot of customer feedback about Mejuri stores. Here's how I'll approach it:\n\n1.  **Identify Key Themes:** I'll look for recurring topics in the feedback, both positive and negative. This includes aspects like customer service, product availability, store environment, pricing, and the piercing experience.\n2.  **Quantify (roughly):** I'll note which themes are most frequent to understand what customers are talking about the most.\n3.  **Summarize Positives:** Condense the positive feedback into a few concise sentences.\n4.  **Summarize Negatives:** Condense the negative feedback and areas for improvement into a few concise sentences.\n5.  **Present a Balanced Overview:** Make sure the summary reflects both the strengths and weaknesses mentioned in the text.\n\nHere's the summary:\n\nMejuri customers generally praise the store's aesthetic, the quality and design of the jewelry, and the helpfulness and friendliness of many staff members, highligh

In [36]:
#summarization
# Single-call baseline: summarize the full corpus directly, without the
# intermediate "thinking" step. Uses the same generation_config as above.
question = "could you provide me a summary of the following text: " + corpus
summary_response = model_gemini.generate_content(
    contents=question,
    generation_config=generation_config
)

summary = summary_response.text


In [37]:
summary

"Okay, here's a summary of the provided text focusing on key themes and points:\n\n**Overall Sentiment:**\n\nThe reviews are largely positive, praising the quality of Mejuri jewelry, the store's aesthetic, and the friendliness and helpfulness of the staff. However, there are also recurring criticisms regarding inventory issues (items out of stock), inconsistent customer service (some stylists are great, others are aloof or rude), and inefficiencies in the in-store checkout process. There are also some concerns about the limited selection of certain metals (white gold, silver), and the lack of clear pricing on displayed items.\n\n**Key Positives:**\n\n*   **High-Quality Jewelry:**  Many reviewers consistently praise the quality, beauty, and unique style of Mejuri's jewelry, especially the solid gold pieces and minimalist designs.\n*   **Friendly and Knowledgeable Staff:** A significant number of reviewers highlight positive interactions with specific stylists who were helpful, patient, 

In [26]:
#code example: OpenAI

# Make sure your API key is set in your environment or replace below
openai.api_key = api_key_openAI


# Module-level client used by think_then_answer(); the name `client` is rebound
# to an Anthropic client by the Anthropic example cell further down.
client = OpenAI(api_key=openai.api_key)  # or set your key directly

def think_then_answer(question: str, model: str = "gpt-4.1-2025-04-14") -> dict:
    """
    Executes a two-step 'think then answer' reasoning pattern with OpenAI.

    The first request asks for step-by-step reasoning about the question; the
    second request asks for a final answer and is given only that reasoning.
    Requests go through the module-level ``client``, which must be an OpenAI
    client when this function is called.

    Parameters:
        question (str): The user question.
        model (str): The OpenAI model ID (defaults to "gpt-4.1-2025-04-14").

    Returns:
        dict: A dictionary with 'thoughts' and 'final_answer', both stripped
        strings (empty when the model returned no text).
    """

    def _complete(prompt_text: str) -> str:
        # One single-turn chat completion; both steps share the same settings.
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt_text}],
            temperature=0.7
        )
        # message.content can be None (no text returned); treat that as an
        # empty reply instead of failing on .strip().
        return (response.choices[0].message.content or "").strip()

    # Step 1: Model thinks
    thoughts = _complete(f"""You're a thoughtful assistant. Before answering the user's question, write out your reasoning step by step.
Question: {question}
Your internal thoughts:""")

    # Step 2: Model answers using its own thoughts
    final_answer = _complete(f"""Based on your reasoning below, provide a clear and final answer to the user's question.
Reasoning: {thoughts}
Final Answer:""")

    return {
        "thoughts": thoughts,
        "final_answer": final_answer
    }

In [56]:
#code example: Anthropic


# Rebinds the module-level `client` (previously the OpenAI client) to an
# Anthropic client; think_then_answer_anthropic() reads this global.
client = anthropic.Anthropic(api_key=api_key_anthropic)

def think_then_answer_anthropic(question: str, model: str = "claude-3-7-sonnet-20250219") -> dict:
    """
    Executes a two-step 'think then answer' reasoning pattern with Anthropic.

    The first request asks for step-by-step reasoning about the question; the
    second request asks for a final answer and is given only that reasoning.
    Requests go through the module-level ``client``, which must be an
    Anthropic client when this function is called.

    Parameters:
        question (str): The user question.
        model (str): The Anthropic model ID.

    Returns:
        dict: A dictionary with 'thoughts' and 'final_answer'. Each value is
        the raw ``content`` list of blocks from the corresponding response;
        read the text with e.g. ``result['final_answer'][0].text``.
    """

    # Step 1: Model thinks
    think_prompt = f"""You're a thoughtful assistant. Before answering the user's question, write out your reasoning step by step.
Question: {question}
Your internal thoughts:"""

    thoughts_response = client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[{"role": "user", "content": think_prompt}]
    )

    thoughts = thoughts_response.content

    # `thoughts` is a list of content blocks. Interpolating it directly would
    # put its repr ("[TextBlock(...)]") into the prompt, so pass only the text.
    thoughts_text = "".join(getattr(part, "text", "") or "" for part in thoughts)

    # Step 2: Model answers using its own thoughts
    final_prompt = f"""Based on your reasoning below, provide a clear and final answer to the user's question.
Reasoning: {thoughts_text}
Final Answer:"""

    final_response = client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.7
    )
    final_answer = final_response.content

    return {
        "thoughts": thoughts,
        "final_answer": final_answer
    }

In [58]:
# Summarization request over the truncated corpus (test1)
question = "could you provide me a summary of the following text: " + test1

In [None]:
# Two sequential Claude requests: reasoning first, then the final answer
answer = think_then_answer_anthropic(question)

In [55]:
# 'final_answer' holds the response's content list; print the first item's text
print(answer['final_answer'][0].text)

Based on the user's detailed feedback about their experiences shopping at Mejuri stores, the key themes and takeaways I would summarize are:

Staff Responsiveness and Customer Service:
The user had mixed experiences with Mejuri's store staff. While some employees were described as very helpful and friendly, the user also encountered less attentive or responsive staff at times. Maintaining a high level of customer service across all store locations appears to be an area for potential improvement.

Product Selection and Inventory Availability:
The user commented on the availability and selection of Mejuri's products, noting both positive experiences finding the items they were looking for, as well as instances where certain products were out of stock or unavailable. Ensuring consistent product inventory and selection could help enhance the overall shopping experience.

Ease of Browsing and Checkout:
The user provided feedback on the in-store browsing and checkout processes. While the che

**Thinking tools code refactoring**

In [None]:
from abc import ABC, abstractmethod

class LLMProvider(ABC):
    """Abstract base for LLM back ends.

    Stores the model name, API key and optional client, and declares the two
    hooks every concrete provider must implement.
    """

    def __init__(self, model_name: str, api_key: str, client = None):
        # `client` stays None unless the caller supplies one.
        self.model_name, self.api_key = model_name, api_key
        self.client = client

    @abstractmethod
    def call_model(self, prompt: str) -> object:
        """Send ``prompt`` to the model and return the provider's raw response."""

    @abstractmethod
    def extract_text(self, response: object) -> str:
        """Return the text carried by a response produced by ``call_model``."""

In [None]:
class OpenAIProvider(LLMProvider):
    """LLMProvider implementation backed by the OpenAI chat completions API."""

    def __init__(self, model_name: str, api_key: str, client = None):
        """
        Args:
            model_name: OpenAI model ID used for every request
            api_key: OpenAI API key
            client: Ready-made OpenAI client; one is created from ``api_key``
                when omitted
        """
        if client is None:
            # Go through the `openai` module so the class does not depend on
            # whichever object the bare name `OpenAI` is currently bound to.
            client = openai.OpenAI(api_key=api_key)
        super().__init__(model_name, api_key, client)

    def call_model(self, prompt: str) -> object:
        """Send ``prompt`` as a single user message and return the raw completion."""
        # Use the instance's own client and model rather than module globals
        return self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
        )

    def extract_text(self, response: object) -> str:
        """Return the stripped text of the first choice ('' when it has no text)."""
        # message.content can be None; treat that as an empty reply
        return (response.choices[0].message.content or "").strip()

In [None]:
# GPT-4.1 provider: model ID, API key and a ready OpenAI client, passed positionally
openAI_gpt_4_1 = OpenAIProvider("gpt-4.1-2025-04-14", api_key_openAI, OpenAI(api_key=openai.api_key))

# Raptor Implementation

In [10]:
%%capture
!pip install llama-index-packs-raptor
!pip install llama-index

In [17]:
import pandas as pd
from llama_index.packs.raptor import RaptorPack
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Document

In [14]:
import nest_asyncio

# Patch asyncio to allow nested event loops.
# NOTE(review): presumably needed because the notebook already runs an event
# loop and the Raptor pack drives async code - confirm.
nest_asyncio.apply()

In [33]:
# Use only the first 400 cleaned texts for the Raptor experiment
text_list = texts_cleaned[:400]
# One llama-index Document per text
documents = [Document(text=t) for t in text_list]

In [26]:
import os
# Export the key so the OpenAI LLM/embedding objects below, which are created
# without an explicit key, can find it in the environment.
os.environ["OPENAI_API_KEY"] = api_key_openAI  # key comes from the Colab secret loaded earlier

In [None]:
# Build the RAPTOR pack over the documents. The inline notes describe each
# argument; no vector_store is passed (that line is commented out).
raptor_pack = RaptorPack(
    documents,
    embed_model=OpenAIEmbedding(
        model="text-embedding-3-small"
    ),  # used for embedding clusters
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),  # used for generating summaries
    #vector_store=vector_store,  # used for storage
    similarity_top_k=2,  # top k for each layer, or overall top-k for collapsed
    mode="collapsed",  # sets default mode
    transformations=[
        SentenceSplitter(chunk_size=400, chunk_overlap=50)
    ],  # transformations applied for ingestion
)

In [35]:
# Query the pack in collapsed mode; with similarity_top_k=2 the recorded run
# returned two nodes.
nodes = raptor_pack.run("What customers think about Mejuri?", mode="collapsed")
print(len(nodes))
# Last expression of the cell: the notebook displays the first node's text
nodes[0].text

2


"Customers have shared their experiences with Mejuri, a jewelry brand, expressing satisfaction with the products and customer service. Many customers appreciate the helpful and kind staff, with some even sharing personal stories and connections with the stylists. However, there have been instances where loyal customers felt uncomfortable due to perceived profiling by security. While some customers praise Mejuri for its inclusivity and accessibility, others have expressed frustration over being asked to wait outside the store, feeling it goes against the brand's image of being relatable and unpretentious. Despite these issues, overall, customers seem to value their interactions with Mejuri and the quality of the jewelry offered."

# Models Connectivity Tests

In [34]:
# Initialize the Anthropic client
client = anthropic.Anthropic(api_key=api_key_anthropic)

try:
    # Make a simple API request
    response = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=50,
        messages=[{"role": "user", "content": "Hello! Can you confirm you're working?"}]
    )

    # Print the response
    # response.content is printed as-is, i.e. as a list of content blocks
    # rather than plain text (see the recorded output below).
    print("Claude's Response:", response.content)

# Errors reported by the API are shown separately from any other failure
except anthropic.APIStatusError as e:
    print(f"API returned an error: {e}")

except Exception as e:
    print(f"An error occurred: {e}")

Claude's Response: [TextBlock(citations=None, text="Yes, I'm working and ready to assist you! How can I help you today?", type='text')]


**Gemini Model**

In [None]:
import google.generativeai as genai
import os

# Supply the Gemini API key through the environment. The placeholder is only
# used when GOOGLE_API_KEY is not set yet, so a real key is never overwritten.
os.environ.setdefault("GOOGLE_API_KEY", "YOUR_API_KEY")

# google.generativeai is configured at module level; it has no Client class
# (that belongs to the separate google-genai SDK).
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Specify the Gemini 2.0 Flash model
model = genai.GenerativeModel("gemini-2.0-flash")

# Create a prompt
prompt = "How does AI work?"

# Generate content using the model
response = model.generate_content(prompt)

# Print the response
print(response.text)

In [2]:
%%capture
!pip install --upgrade google-generativeai

In [11]:
import google.generativeai as genai

# Configure the API key
genai.configure(api_key=api_key_gemini) # key comes from the Colab secret loaded earlier

# Initialize the Gemini 2.0 Flash model
model = genai.GenerativeModel('gemini-2.0-flash')

# Rebinds the module-level generation_config with the same values as before
generation_config = {
    "temperature": 0.7,  # Controls the randomness of the output (0.0 - 1.0)
    "max_output_tokens": 256,  # Limits the maximum number of tokens in the generated response
    # You can also include other parameters here like top_p, top_k, stop_sequences, etc.
}


# Prepare the prompt for the model
prompt = "Write a short poem about the ocean."

# Generate content using the model
response = model.generate_content(
    contents=prompt,
    #safety_settings=safety_settings,
    generation_config=generation_config
)

# Print the generated text
print(response.text)


Blue giant, breathing deep,
Secrets held in currents sleep.
Salty kiss upon the shore,
Waves that crash and ever roar.

Sunlight dances on the foam,
A wild and watery home.
Mysteries within its heart,
A world forever set apart.

