# **Starting with the *experiments***

### Installation of the necessary libraries

In [None]:
!pip install langchain_core langgraph  langchain  langchain_groq langchain_community pydantic

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage,AIMessage, BaseMessage
from langchain_groq import ChatGroq
from langchain_community.document_loaders import CSVLoader
from pydantic import BaseModel
from typing_extensions import TypedDict
from langgraph.graph import StateGraph
from langgraph.graph.message import add_messages
from langgraph.graph import START ,END
from langgraph.checkpoint.memory import InMemorySaver
from langgraph.prebuilt import ToolNode
from langchain_core.tools import tool
from langchain.tools import tool_node

In [15]:
# Read diabetes.csv into a DataFrame. Note: the graph nodes below do not use
# `data`; each of them re-reads the CSV from state['csv_path'].
data = pd.read_csv("diabetes.csv")

In [17]:
"""
Node 1: The Data Profiler (The "Eyes")

Action: Reads the CSV (using Pandas).

Efficiency: It doesn't send the whole CSV to the LLM. It sends: df.dtypes, df.isnull().sum(), and df.describe().
          Goal: Provide the LLM with enough context to understand the shape of the data.


Node 2: The Visualization Strategist (The "Architect")
Action: This node doesn't write code yet. It reasons.

      Logic: "I see 'Date' and 'Revenue'. I should create a time-series line chart with a rolling average to identify the trend."

Output: A natural language plan for the next node.


Node 3: The Coder & Executor (The "Hands")

    Action: Uses the plan to write Seaborn/Matplotlib code.

Robustness: It attempts to execute the code.

If success: Move to the next node.

If failure: It catches the Exception, adds it to error_log, and loops back to try again.


Node 4: The Insight Reporter (The "Analyst")

       Action: This is the most complex prompt. It looks at the Statistics + the Chart Strategy + the User Question.

Output: A professional Markdown report including:

     The "What": Summary of data.

     The "Why": Why the graph shows a dip or a spike.

     The "So What": Actionable business insights.
"""

'\nNode 1: The Data Profiler (The "Eyes")\n\nAction: Reads the CSV (using Pandas).\n\nEfficiency: It doesn\'t send the whole CSV to the LLM. It sends: df.dtypes, df.isnull().sum(), and df.describe().  \n          Goal: Provide the LLM with enough context to understand the shape of the data.\n\n\nNode 2: The Visualization Strategist (The "Architect")\nAction: This node doesn\'t write code yet. It reasons.\n \n      Logic: "I see \'Date\' and \'Revenue\'. I should create a time-series line chart with a rolling average to identify the trend."\n\nOutput: A natural language plan for the next node.\n\n\nNode 3: The Coder & Executor (The "Hands")\n\n    Action: Uses the plan to write Seaborn/Matplotlib code.\n\nRobustness: It attempts to execute the code.\n\nIf success: Move to the next node.\n  \nIf failure: It catches the Exception, adds it to error_log, and loops back to try again.\n\n\nNode 4: The Insight Reporter (The "Analyst")\n\n       Action: This is the most complex prompt. It looks

In [19]:
from typing import TypedDict, List, Annotated
import pandas as pd
from langgraph.graph.message import add_messages

class AgentState(TypedDict):
    """State dictionary shared by the first-draft graph nodes in the next cells.

    NOTE(review): a second ``AgentState`` with a different set of keys is
    defined later in this notebook; once that cell runs, it replaces this class.
    """
    # Message list annotated with the `add_messages` reducer from langgraph.
    messages: Annotated[list, add_messages]
    csv_path: str
    data_summary: str      # Text summary of the data
    columns: List[str]     # List of column names
    viz_plan: str          # The strategy for the chart
    viz_code: str          # The generated python code
    viz_path: str          # Where the image is saved
    error_log: str         # Any errors from the code execution
    final_report: str      # The final markdown analysis

In [20]:
def profiler_node(state: AgentState):
    """Describe the CSV at ``state['csv_path']`` as text instead of raw rows.

    Returns a partial state update with two keys:
      - ``data_summary``: shape, dtypes, missing-value counts and the
        ``describe()`` table rendered into one string.
      - ``columns``: the column names as a plain list.
    """
    frame = pd.read_csv(state['csv_path'])
    column_names = list(frame.columns)

    profile_text = f"""
    Dataset Shape: {frame.shape}
    Columns & Types: {frame.dtypes.to_dict()}
    Missing Values: {frame.isnull().sum().to_dict()}
    Statistical Summary:
    {frame.describe().to_string()}
    """
    return {"data_summary": profile_text, "columns": column_names}



In [22]:
import os
from getpass import getpass

# No cell in this notebook defines GROQ_API_KEY, so on a fresh kernel this cell
# used to stop with a NameError. Read the key from the environment (falling back
# to an interactive prompt) so "Restart & Run All" works and the secret never
# has to be written into the notebook itself.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("GROQ_API_KEY: ")

# Shared chat model used by strategist_node / coder_executor_node in the next cells.
llm = ChatGroq(model="openai/gpt-oss-120b", api_key=GROQ_API_KEY)

In [23]:
def strategist_node(state: AgentState):
    """Ask the shared ``llm`` for a one-paragraph visualization plan.

    The prompt embeds ``state['data_summary']``; the text of the model's reply
    is returned as the ``viz_plan`` state update.
    """
    strategy_prompt = f"""
    You are a Data Strategy Expert. Based on this data summary:
    {state['data_summary']}

    Identify the most important business trend or correlation.
    Describe a Seaborn visualization that would reveal this insight.
    Output only the plan in one paragraph.
    """
    plan_text = llm.invoke(strategy_prompt).content
    return {"viz_plan": plan_text}

In [24]:
import matplotlib.pyplot as plt
import seaborn as sns

def coder_executor_node(state: AgentState):
    """Have the LLM write plotting code for ``state['viz_plan']`` and run it.

    Returns:
        On success, a state update with ``viz_code`` (the code that was run),
        ``viz_path`` ('static/analysis_plot.png') and an empty ``error_log``.
        On failure, a state update containing only ``error_log`` with the
        exception message.
    """
    import os  # local import: `os` is not imported by any earlier cell

    # Prompt LLM to write code based on state['viz_plan']
    code_prompt = f"""
    Write Python code using Seaborn to create the following chart: {state['viz_plan']}
    - The dataframe is already loaded as 'df'.
    - Use sns.set_theme().
    - Save the plot to 'static/analysis_plot.png' using plt.savefig().
    - Do not use plt.show().
    """
    raw_reply = llm.invoke(code_prompt).content
    # Chat models commonly wrap code in markdown fences, which are a
    # SyntaxError for exec(); strip them before running.
    generated_code = raw_reply.replace("```python", "").replace("```", "").strip()

    df = pd.read_csv(state['csv_path'])
    # plt.savefig() does not create missing directories.
    os.makedirs("static", exist_ok=True)
    # A single dict acts as both globals and locals. With separate dicts the
    # code runs as if inside a class body, so any function or comprehension in
    # the generated script could not see `df`, `plt` or `sns`.
    namespace = {"df": df, "plt": plt, "sns": sns}

    try:
        # SECURITY: this executes model-generated code with the full privileges
        # of the notebook process; only run it on data and prompts you trust.
        exec(generated_code, namespace)
        return {"viz_code": generated_code, "viz_path": "static/analysis_plot.png", "error_log": ""}
    except Exception as e:
        return {"error_log": str(e)}

In [38]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import TypedDict, List, Annotated, Dict
from langgraph.graph import StateGraph, END
from langchain_groq import ChatGroq
from langchain_core.messages import SystemMessage, HumanMessage


class AgentState(TypedDict):
    """State dictionary passed between the nodes of the EDA graph in this cell."""
    csv_path: str                          # CSV file read by profiler_node and executor_node
    data_summary: str                      # text profile produced by profiler_node
    analysis_plan: List[Dict[str, str]]    # plot specs parsed from the strategist's JSON reply
    viz_results: List[Dict[str, str]]      # title/path/goal entries for plots found on disk
    viz_code: str                          # plotting script returned by coder_node
    final_report: str                      # report text returned by analyst_node
    error_log: str                         # message of the last execution error, "" when none
    retry_count: int                       # incremented by executor_node on every run


def profiler_node(state: AgentState):
    """Build a text profile of the CSV and reset all other state fields.

    The profile lists the frame's shape, all column names, the numeric and
    object-typed columns, and the ``describe()`` table. Every remaining state
    key is returned with an empty/zero value so later nodes can read it.
    """
    frame = pd.read_csv(state['csv_path'])
    numeric_cols = list(frame.select_dtypes(include=['number']).columns)
    categorical_cols = list(frame.select_dtypes(include=['object']).columns)
    stats_table = frame.describe().to_string()

    profile = f"""
    SHAPE: {frame.shape}
    COLUMNS: {list(frame.columns)}
    NUMERIC COLS: {numeric_cols}
    CATEGORICAL COLS: {categorical_cols}
    STATS: \n{stats_table}
    """

    update = {"data_summary": profile, "analysis_plan": [], "viz_results": []}
    update.update(viz_code="", final_report="", error_log="", retry_count=0)
    return update

def _extract_json_list(text: str) -> list:
    """Return the JSON list embedded in ``text``, ignoring fences or prose around it."""
    cleaned = text.replace("```json", "").replace("```", "").strip()
    # Keep only the outermost [...] span so chatter before/after the list, or
    # an object wrapped around it, does not break json.loads().
    start, end = cleaned.find("["), cleaned.rfind("]")
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]
    parsed = json.loads(cleaned)
    if not isinstance(parsed, list):
        raise ValueError(f"Expected a JSON list of plot specs, got {type(parsed).__name__}")
    return parsed


def strategist_node(state: AgentState):
    """The 'Brain' - Demands a full suite of EDA plots.

    Sends ``state['data_summary']`` to the model with instructions to plan
    7-10 visualizations and returns the parsed plan as ``analysis_plan``.

    Raises:
        json.JSONDecodeError: if the reply contains no parseable JSON.
        ValueError: if the parsed JSON is not a list.
    """
    llm = ChatGroq(model="openai/gpt-oss-120b", api_key=GROQ_API_KEY)

    system_msg = """You are a Lead Data Analyst. Your goal is to replace a human analyst.
    Plan 7-10 distinct visualizations to explore this dataset fully.
    You MUST include at least one of each:
    - Distribution (Histogram/KDE)
    - Correlation (Scatter/Relational/Heatmap)
    - Comparison (Bar/Count)
    - Composition (Boxplot/Violin)

    Output ONLY a JSON list: [{"title": "...", "plot_type": "...", "columns": [], "insight_goal": "..."}]"""

    response = llm.invoke([SystemMessage(content=system_msg), HumanMessage(content=state['data_summary'])])

    return {"analysis_plan": _extract_json_list(response.content)}

def coder_node(state: AgentState):
    """The 'Programmer' - Writes the multi-plot Seaborn script.

    Builds a prompt from ``state['analysis_plan']`` (plus the previous error,
    if ``state['error_log']`` is non-empty) and returns the model's reply,
    with markdown fences removed, as ``viz_code``.
    """
    llm = ChatGroq(model="openai/gpt-oss-120b", api_key=GROQ_API_KEY)

    plan = state['analysis_plan']
    tasks = json.dumps(plan)
    # executor_node only picks up files named static/plot_<i>.png with a
    # zero-based index, so name them explicitly instead of the ambiguous
    # 'static/plot_n.png' the prompt used to ask for.
    expected_files = ", ".join(f"static/plot_{i}.png" for i in range(len(plan)))
    error_context = f"\nFIX PREVIOUS ERROR: {state['error_log']}" if state['error_log'] else ""

    system_msg = f"""Write Python code using Seaborn/Matplotlib.
    - Dataframe is 'df'.
    - Generate {len(plan)} separate plots.
    - Save the plot for task i (0-based, in the order of Tasks) as 'static/plot_<i>.png'. Expected files, exactly: {expected_files}
    - Use plt.close('all') after each save.
    - Tasks: {tasks}
    {error_context}
    OUTPUT ONLY RAW PYTHON CODE."""

    response = llm.invoke([SystemMessage(content=system_msg), HumanMessage(content="Write the EDA script.")])

    return {"viz_code": response.content.replace("```python", "").replace("```", "").strip()}


def executor_node(state: AgentState):
    """The 'Engine' - Runs the code and prepares the visual evidence.

    Executes ``state['viz_code']`` with ``df``, ``plt``, ``sns`` and ``pd``
    available, then records every ``static/plot_<i>.png`` that exists for the
    tasks in ``state['analysis_plan']``.

    Returns:
        On success: ``viz_results`` (title/path/goal per plot found), an empty
        ``error_log`` and the incremented ``retry_count``.
        On failure: ``error_log`` with the exception message and the
        incremented ``retry_count``.
    """
    df = pd.read_csv(state['csv_path'])
    os.makedirs("static", exist_ok=True)
    # One dict serves as both globals and locals. With separate dicts the
    # script runs as if inside a class body, so functions or comprehensions in
    # the generated code could not see `df`, `plt`, `sns` or `pd`.
    namespace = {"df": df, "plt": plt, "sns": sns, "pd": pd}

    try:
        plot_paths = [f"static/plot_{i}.png" for i in range(len(state['analysis_plan']))]
        # Remove plots left over from an earlier run or attempt; otherwise a
        # stale file would be reported as if this script had produced it.
        for stale_path in plot_paths:
            if os.path.exists(stale_path):
                os.remove(stale_path)

        plt.close('all')
        # SECURITY: this executes model-generated code with the full privileges
        # of the notebook process; only run it on data and prompts you trust.
        exec(state['viz_code'], namespace)

        results = [
            {"title": task['title'], "path": path, "goal": task.get('insight_goal', '')}
            for task, path in zip(state['analysis_plan'], plot_paths)
            if os.path.exists(path)
        ]

        return {"viz_results": results, "error_log": "", "retry_count": state['retry_count'] + 1}
    except Exception as e:
        return {"error_log": str(e), "retry_count": state['retry_count'] + 1}

def analyst_node(state: AgentState):
    """The 'Reporter' - turns the data summary and plot metadata into a report.

    Sends ``state['data_summary']`` and the JSON-encoded ``state['viz_results']``
    to the model and returns its reply text as ``final_report``.
    """
    chat = ChatGroq(model="openai/gpt-oss-120b", api_key=GROQ_API_KEY)

    plots_json = json.dumps(state['viz_results'])
    journalist_brief = """You are a World-Class Data Journalist.
    You have a full suite of visualizations. For EVERY plot provided:
    1. Give it a clear Header.
    2. Explain the visual evidence.
    3. Identify the 'Underlying Story'.
    4. Provide a concrete business recommendation."""

    evidence = f"Data Summary: {state['data_summary']}\nPlots Created: {plots_json}"
    conversation = [
        SystemMessage(content=journalist_brief),
        HumanMessage(content=evidence),
    ]
    reply = chat.invoke(conversation)

    return {"final_report": reply.content}


def router(state: AgentState):
    """Pick the node that follows the executor.

    Returns "coder" while there is an error to fix and fewer than 3 executor
    runs have been counted; otherwise returns "analyst".
    """
    # No error recorded: nothing to retry.
    if not state['error_log']:
        return "analyst"
    # Retry budget used up: move on even though the last run failed.
    if state['retry_count'] >= 3:
        return "analyst"
    return "coder"

def _build_workflow():
    """Assemble the graph: profiler -> strategist -> coder -> executor -> (coder | analyst) -> END."""
    graph = StateGraph(AgentState)

    node_table = (
        ("profiler", profiler_node),
        ("strategist", strategist_node),
        ("coder", coder_node),
        ("executor", executor_node),
        ("analyst", analyst_node),
    )
    for node_name, node_fn in node_table:
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("profiler")

    # Fixed, unconditional hand-offs.
    for source, target in (
        ("profiler", "strategist"),
        ("strategist", "coder"),
        ("coder", "executor"),
    ):
        graph.add_edge(source, target)

    # After execution, router() chooses between another coding attempt and the report.
    graph.add_conditional_edges(
        "executor",
        router,
        {"coder": "coder", "analyst": "analyst"},
    )
    graph.add_edge("analyst", END)
    return graph


workflow = _build_workflow()
app = workflow.compile()

In [39]:
# Only csv_path has to be supplied: profiler_node returns a value for every
# other state key on its first step.
inputs = dict(csv_path="diabetes.csv")
result = app.invoke(inputs)


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

In [45]:
# Show the report text that analyst_node stored under 'final_report'.
print(result["final_report"])

## 1️⃣ Kernel Density Estimate of BMI  
**Header:** *How is Body-Mass-Index (BMI) distributed across the cohort?*  

**What the plot shows**  
- A smooth, unimodal curve that peaks around **≈ 31 kg/m²** (the mode).  
- The left-hand tail extends to **≈ 0**, reflecting implausibly low recorded BMIs (data-entry errors or missing values coded as 0).  
- The right-hand tail stretches out to **≈ 67 kg/m²**, indicating a small group of severely obese patients.  
- The distribution is **right-skewed** (longer high-BMI tail) rather than perfectly normal.

**Underlying story**  
The majority of patients sit in the **overweight/obese range (BMI > 30)** – a known risk factor for type-2 diabetes. The presence of many zeroes suggests a data-quality issue that could bias any model that treats BMI as continuous.

**Business recommendation**  
- **Clean the BMI column**: replace 0 values with `NaN` and consider imputation (e.g., median of similar age/sex g