### Dependencies

In [39]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

from typing import Annotated, TypedDict, List, Dict, Any, Optional
from pydantic import BaseModel, Field

In [12]:
# from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage

In [36]:
from langgraph.graph import StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition   # used by a Node to decide whether to use a tool
from langchain.agents import Tool # required to conver function to tool

from langgraph.checkpoint.memory import MemorySaver

In [16]:
# Load variables from a .env file into the process environment so the os.getenv lookups below can read them.
load_dotenv()

True

### Test LangChain OpenAI Integration

OPENAI API USAGE

In [23]:
# # Create an LLM instance
# llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# # Make a simple inference
# user_prompt = """
# What is the capital of the US

# """
# messages = [ HumanMessage(content=user_prompt), 
#              SystemMessage(content="Make sure to present your response in bullet point without markdown format")
#            ]

# response = llm.invoke(messages)

# print(response.content)

GOOGLE GENERATIVE API USAGE

In [42]:
# Read the Gemini API key from the environment; os.getenv returns None when the variable is not set.
gemini_api_key = os.getenv("GEMINI_API_KEY")

In [66]:
# Minimal end-to-end check that the Gemini chat model is reachable and follows instructions.
user_prompt = """
What is the capital of the US

"""

# The system instruction goes first, ahead of the user turn. With the system message placed
# after the human message, the saved reply still used markdown bold, which suggests the
# trailing instruction was not being applied by the Gemini integration.
messages = [
    SystemMessage(content="Make sure to present your response in bullet point without markdown format and extra character."),
    HumanMessage(content=user_prompt),
]


# temperature=0 keeps the reply as repeatable as the service allows.
google_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    google_api_key=gemini_api_key
)

print(google_llm.invoke(messages).content)

The capital of the US is **Washington, D.C.**


### Assistant Evaluator

In [72]:
class EvaluatorResponse(BaseModel):
    # Structured verdict requested from the evaluator LLM via with_structured_output.
    # The Field descriptions are part of the schema handed to the model, so they must state
    # each condition unambiguously. Documented with comments rather than a class docstring
    # so the schema text itself is otherwise unchanged.

    # Free-text critique of the assistant's latest response.
    feedback: str = Field(description="critical feedback on the assistant response")
    # Whether the response satisfies the success criteria for the user's request.
    success_criteria_met: bool = Field(description="Whether the success criteria has been met")
    # Two distinct triggers: the user must clarify something, or the assistant is stuck.
    user_input_needed: bool = Field(description="True if more input is needed from the user for more clarity, or if the LLM gets stuck")

In [68]:
# Separate Gemini client used only for evaluating the assistant's answers.
evaluator_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    google_api_key=gemini_api_key,
)

# Wrap the client so its replies are returned as EvaluatorResponse instances.
evaluator_llm_with_output = evaluator_llm.with_structured_output(EvaluatorResponse)

In [69]:
# Smoke-test the structured-output evaluator on the sample `messages` defined earlier.
# NOTE(review): the saved output below shows a `reason` field that EvaluatorResponse does not
# declare, so it presumably predates the current class definition; re-run to refresh it.
evaluator_llm_with_output.invoke(messages)

EvaluatorResponse(feedback='The assistant should directly answer the question about the capital of the US.', success_criteria_met=False, user_input_needed=False, reason='The model did not answer the question directly.')

### Data

In [27]:
# Load the customer table; the path is relative to the notebook's working directory.
df = pd.read_csv("Customer.csv")

# Preview the first rows.
df.head()

Unnamed: 0,customer_Id,DOB,Gender,city_code
0,268408,02-01-1970,M,4.0
1,269696,07-01-1970,F,8.0
2,268159,08-01-1970,F,8.0
3,270181,10-01-1970,F,2.0
4,268073,11-01-1970,M,1.0


### Customised Tools (Node)

This tool can be used as a node in a graph

In [28]:
def data_summary(df: pd.DataFrame) -> str:
    """
    Build a plain-text overview of a dataset.

    The report contains, in order:
    - Shape (rows, columns)
    - Per-column dtype, missing-value count and non-null count
    - Descriptive statistics for numeric columns (section omitted when there are none)
    - count / unique / top / freq for object and categorical columns
      (section omitted when there are none)

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to describe. It is only read, never modified.

    Returns
    -------
    str
        The report sections joined with newlines.
    """
    summary = [f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.\n"]

    # Column info
    col_info = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "missing_values": df.isnull().sum(),
        "non_null_count": df.notnull().sum()
    })
    summary.extend(["Column Information:\n", col_info.to_string(), "\n"])

    # Numeric stats.
    # Select the columns first: describe(include=...) raises ValueError when no column
    # matches, so an all-categorical frame would otherwise crash here.
    numeric_cols = df.select_dtypes(include="number")
    if numeric_cols.shape[1] > 0:
        summary.extend([
            "Numeric Column Statistics:\n",
            numeric_cols.describe().transpose().to_string(),
            "\n",
        ])

    # Categorical stats, with the same guard so an all-numeric frame does not crash.
    categorical_cols = df.select_dtypes(include=[object, "category"])
    if categorical_cols.shape[1] > 0:
        summary.extend([
            "Categorical Column Summary:\n",
            categorical_cols.describe().transpose().to_string(),
            "\n",
        ])

    return "\n".join(summary)


# Sanity-check the summary on the loaded customer data.
print(data_summary(df))


Dataset contains 5647 rows and 4 columns.

Column Information:

               dtype  missing_values  non_null_count
customer_Id    int64               0            5647
DOB           object               0            5647
Gender        object               2            5645
city_code    float64               2            5645


Numeric Column Statistics:

              count           mean          std       min       25%       50%       75%       max
customer_Id  5647.0  271037.281034  2451.261711  266783.0  268912.0  271028.0  273180.0  275265.0
city_code    5645.0       5.472631     2.859918       1.0       3.0       5.0       8.0      10.0


Categorical Column Summary:

       count unique         top  freq
DOB     5647   4056  27-12-1988     7
Gender  5645      2           M  2892




In [29]:
# `Tool` wraps a single-input function: when the model calls the tool, its one text argument
# is passed to `func`. `data_summary` needs a DataFrame rather than text, so the wrapper
# ignores that argument and summarizes the DataFrame loaded in `df`. `df` is looked up when
# the tool runs, so it reflects whatever is loaded at that point.
tool_summary = Tool(
    name="summarize_data",
    func=lambda _tool_input="": data_summary(df),
    description="Returns the summary of uploaded data including statistics for numerical and non-numerical fields.",
)

### Agents and List of Tools

In [30]:
# Tools made available to the agent LLM (bound to the model with bind_tools below).
tools = [tool_summary]

In [31]:
# agent_llm =  ChatOpenAI(model="gpt-4o-mini", temperature=0)
# NOTE(review): this re-creates `google_llm` with the same model and temperature as the earlier test cell.
google_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    google_api_key=os.getenv("GEMINI_API_KEY")
)
# bind tools to LLM
# NOTE(review): the variable name contains a double underscore (`google_llm__with_tools`).
google_llm__with_tools = google_llm.bind_tools(tools)

### Nodes

In [None]:
def worker(state: "State") -> Dict[str, Any]:
    """
    Worker node: run the tool-enabled LLM on the conversation so far.

    Builds the system prompt from the task's success criteria, appends the
    evaluator's feedback when a previous answer was rejected, and invokes the
    tool-bound Gemini model (`google_llm__with_tools`).

    Parameters
    ----------
    state : State
        Graph state. Reads `success_criteria`, `messages` and, when present,
        `feedback_on_work`. The annotation is a string so this cell can be
        defined before the cell that declares `State`.

    Returns
    -------
    Dict[str, Any]
        ``{"messages": [response]}`` holding the model's reply, as a state update
        for the `messages` field.

    Raises
    ------
    KeyError
        If `success_criteria` or `messages` is missing from the state.
    """
    system_message = f"""
            You are a data analyst assistant that can use tools to complete analysis tasks.
            
            You specialize in:
            - Return summary of the dataset (Statistics, data description)
            - Exploring datasets (especially CSV files provided by the user)
            - Answering business or technical questions from the data
            - Explaining insights clearly in plain language
            - Asking clarifying questions if the task is ambiguous
            
            You keep working on a task until either:
            1. You have a question or clarification for the user, OR
            2. The success criteria is met.
            
            This is the success criteria:
            {state['success_criteria']}

            You should reply either with:
            - A question for the user about this assignment (if more context or data is needed), OR
            - Your final analysis/answer if you have enough information.
            
            If you have a question for the user, reply by clearly stating it. For example:
            Question: please provide the CSV file containing sales data for analysis.
            
            If you have finished, reply only with the final analysis/answer — do not ask a question.
    """

    # Add rejection feedback (if any)
    if state.get("feedback_on_work"):
        system_message += f"""
            Previously, you thought you completed the assignment, but your reply was rejected because the success criteria was not met.
            Here is the feedback on why this was rejected:
            {state['feedback_on_work']}
            With this feedback, please continue the assignment, ensuring that you meet the success criteria or ask a question to the user if 
            the data is missing.
        """

    # Build a new list instead of editing message objects held in the graph state:
    # drop any earlier system message and put the current prompt first, so the model
    # always sees exactly one up-to-date system instruction at the start.
    history = [m for m in state["messages"] if not isinstance(m, SystemMessage)]
    messages = [SystemMessage(content=system_message)] + history

    # Invoke the LLM with tools (the tool-bound model created in the agents section)
    response = google_llm__with_tools.invoke(messages)

    # Return updated state
    return {
        "messages": [response],
    }

### State Graph

In [70]:
class State(TypedDict):
    # Shared state passed between the graph's nodes.
    # `messages` is the conversation history, e.g. entries like
    # {"role": "user", "content": "Capital of Nig?"} or message objects.
    # It is the only field declared with a reducer (`add_messages`).
    # The other fields are plain values that are overwritten on each state change,
    # presumably from the evaluator LLM's verdict (TODO confirm once the evaluator node exists).
    messages: Annotated[list, add_messages]
    # Criteria the answer must satisfy; interpolated into the worker's system prompt.
    success_criteria: str
    # Feedback from a rejected attempt; the worker appends it to its prompt when set.
    feedback_on_work: Optional[str]
    # Same names as the EvaluatorResponse fields.
    success_criteria_met: bool
    user_input_needed: bool
    

In [71]:
# Start building the graph, using State as its state schema.
graph_builder = StateGraph(State)