In [1]:
!pip install groq langgraph langchain pandas matplotlib seaborn python-dotenv



In [6]:
import os
from dotenv import load_dotenv
load_dotenv()
# Optionally set the environment variable in the process if found in .env
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")


In [7]:
from groq import Groq
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from langgraph.graph import StateGraph, END

  from scipy.stats import gaussian_kde


In [8]:
pip install groq numpy pytz seaborn langgraph


Note: you may need to restart the kernel to use updated packages.


## Data parser node


In [9]:
import pandas as pd 
import seaborn as sns

df = sns.load_dataset("tips")
df.head(10)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [10]:
def data_parser_agent(df):
    """Analyze the data and returns the basic summmary"""
    info = {
        "num_rows": len(df),
        "num_columns": len(df.columns),
        "columns": list(df.columns),
        "missing_values":df.isnull().sum().to_dict(),
        "dtypes": df.dtypes.astype(str).to_dict(),
        "summmary_stats": df.describe().to_dict()
    }
    return info 
 

In [11]:
summary = data_parser_agent(df)
summary

{'num_rows': 244,
 'num_columns': 7,
 'columns': ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'],
 'missing_values': {'total_bill': 0,
  'tip': 0,
  'sex': 0,
  'smoker': 0,
  'day': 0,
  'time': 0,
  'size': 0},
 'dtypes': {'total_bill': 'float64',
  'tip': 'float64',
  'sex': 'category',
  'smoker': 'category',
  'day': 'category',
  'time': 'category',
  'size': 'int64'},
 'summmary_stats': {'total_bill': {'count': 244.0,
   'mean': 19.78594262295082,
   'std': 8.902411954856856,
   'min': 3.07,
   '25%': 13.3475,
   '50%': 17.795,
   '75%': 24.127499999999998,
   'max': 50.81},
  'tip': {'count': 244.0,
   'mean': 2.99827868852459,
   'std': 1.3836381890011822,
   'min': 1.0,
   '25%': 2.0,
   '50%': 2.9,
   '75%': 3.5625,
   'max': 10.0},
  'size': {'count': 244.0,
   'mean': 2.569672131147541,
   'std': 0.9510998047322344,
   'min': 1.0,
   '25%': 2.0,
   '50%': 2.0,
   '75%': 3.0,
   'max': 6.0}}}

## Insights generator node

In [12]:
import numpy as np

def statistical_insights(df):
    insights = {}

    # 1. correlation
    num_df = df.select_dtypes(include=np.number)
    if not num_df.empty:
        corr = num_df.corr()
        top_corr = (
            corr.unstack()
            .sort_values(ascending=False)
            .drop_duplicates()
            .head(5)
            .to_dict()
        )
        insights["top_correlations"] = top_corr
    else: 
        insights["top_correlations"] = {}

    # 2. missing values percentage
    missing_pct = (df.isnull().sum() / len(df) * 100).to_dict()
    insights["missing_percentage"] = missing_pct  

    # 3.basic numeric stats
    insights["numeric_summary"] = df.describe().to_dict()
    return insights      


In [13]:
stat_summary = statistical_insights(df)
stat_summary


{'top_correlations': {('total_bill', 'total_bill'): 1.0,
  ('tip', 'total_bill'): 0.6757341092113648,
  ('total_bill', 'size'): 0.5983151309049014,
  ('tip', 'size'): 0.48929877523035714},
 'missing_percentage': {'total_bill': 0.0,
  'tip': 0.0,
  'sex': 0.0,
  'smoker': 0.0,
  'day': 0.0,
  'time': 0.0,
  'size': 0.0},
 'numeric_summary': {'total_bill': {'count': 244.0,
   'mean': 19.78594262295082,
   'std': 8.902411954856856,
   'min': 3.07,
   '25%': 13.3475,
   '50%': 17.795,
   '75%': 24.127499999999998,
   'max': 50.81},
  'tip': {'count': 244.0,
   'mean': 2.99827868852459,
   'std': 1.3836381890011822,
   'min': 1.0,
   '25%': 2.0,
   '50%': 2.9,
   '75%': 3.5625,
   'max': 10.0},
  'size': {'count': 244.0,
   'mean': 2.569672131147541,
   'std': 0.9510998047322344,
   'min': 1.0,
   '25%': 2.0,
   '50%': 2.0,
   '75%': 3.0,
   'max': 6.0}}}

In [14]:
from groq import Groq
import os

client = Groq(api_key=os.environ["GROQ_API_KEY"])

def llm_generate_insights(stat_summary):
    prompt = f"""
    You are a data analysis expert. Analyze the following summary statistics and provide clear,
    human-readable insights about the data:

    {stat_summary}

    Focus on trends, strong correlations, and any potential data quality issues.
    """

    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": "You are a helpful data analyst."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content

insights_text = llm_generate_insights(stat_summary)
print(insights_text)


**Data Analysis Insights:**

**Missing Data Analysis:**  
No missing data issues were found in the provided dataset. All columns ('total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size') have a missing percentage of 0.0.

**Numeric Summary Statistics:**

1. **Total Bill ('total_bill'):**
	- Mean: $19.79
	- Standard Deviation (SD): $8.90
	- Median: $17.79
	- 25th Percentile (Q1): $13.35
	- 75th Percentile (Q3): $24.13
	- Minimum: $3.07
	- Maximum: $50.81

	There are some potential outliers at the high-end of the total bill distribution, but not to a concerning extent.

2. **Tip ('tip'):**
	- Mean: $3.00
	- Standard Deviation (SD): $1.38
	- Median: $2.90
	- 25th Percentile (Q1): $2.00
	- 75th Percentile (Q3): $3.56
	- Minimum: $1.00
	- Maximum: $10.00

	There are some potential outliers at the high-end of the tip distribution, which warrants further investigation.

3. **Table Size ('size'):**
	- Mean: 2.57
	- Standard Deviation (SD): 0.95
	- Median: 2.0
	- 25th Percentile (Q1): 2.0
	

## Visualization node

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import io
import base64

def create_visualizations_node(data):
    # Convert to DataFrame if not already
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    numeric_cols = data.select_dtypes(include=["float64", "int64"]).columns.tolist()

    visualizations = {}

    # --- 1. Correlation Heatmap ---
    plt.figure(figsize=(8, 6))
    sns.heatmap(data[numeric_cols].corr(), annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap")
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    visualizations["correlation_heatmap"] = base64.b64encode(buf.read()).decode("utf-8")
    plt.close()

    # --- 2. Distribution Plots ---
    for col in numeric_cols:
        plt.figure(figsize=(6, 4))
        sns.histplot(data[col], kde=True)
        plt.title(f"Distribution of {col}")
        buf = io.BytesIO()
        plt.savefig(buf, format="png")
        buf.seek(0)
        visualizations[f"{col}_distribution"] = base64.b64encode(buf.read()).decode("utf-8")
        plt.close()

    # --- 3. Pairplot (optional if small dataset) ---
    if len(numeric_cols) <= 5:
        sns.pairplot(data[numeric_cols])
        plt.suptitle("Pairwise Relationships", y=1.02)
        buf = io.BytesIO()
        plt.savefig(buf, format="png")
        buf.seek(0)
        visualizations["pairplot"] = base64.b64encode(buf.read()).decode("utf-8")
        plt.close()

    return {
        "message": "Visualizations created successfully!",
        "visualizations": visualizations
    }

print(create_visualizations_node(df))

{'message': 'Visualizations created successfully!', 'visualizations': {'correlation_heatmap': 'iVBORw0KGgoAAAANSUhEUgAAAyAAAAJYCAYAAACadoJwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABWLUlEQVR4nO3deVxU9f7H8fcMywAqoKLgQuCW+1KYimZWWmaWmd2y5aaSmpVbUb+MNG2xuN3KtNyyXFpuN1vN0jRzK9NwS1Nz39ACBBUUULY5vz+8To1gqThnOPh6Ph7n8XC+8z1fPmc6D+Izn+/3e2yGYRgCAAAAABPYvR0AAAAAgEsHCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQgAAAAA05CAAAAAADANCQiAcmPWrFmy2Wzat2/fRRtz3759stlsmjVr1kUbEwCASxkJCIC/tHv3bg0aNEh169ZVQECAgoOD1aFDB

In [20]:
import base64
from PIL import Image
from io import BytesIO

def show_image(base64_string):
    img = Image.open(BytesIO(base64.b64decode(base64_string)))
    img.show()
print(show_image(create_visualizations_node(df)["visualizations"]["correlation_heatmap"]))    

None


In [21]:
pip install fpdf pillow


Note: you may need to restart the kernel to use updated packages.


## Report generator Node

In [16]:
from fpdf import FPDF
import base64
from io import BytesIO
from PIL import Image

def report_generator_node(summary_text, statistical_insights, visualizations):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "Automated Data Analysis Report", ln=True, align="C")

    # --- Section 1: Summary ---
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, "Summary Overview", ln=True)
    pdf.set_font("Arial", "", 12)
    pdf.multi_cell(0, 8, summary_text)
    pdf.ln(5)

    # --- Section 2: Statistical Insights ---
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, "Statistical Insights", ln=True)
    pdf.set_font("Arial", "", 12)
    pdf.multi_cell(0, 8, statistical_insights)
    pdf.ln(5)

    # --- Section 3: Visualizations ---
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, "Visualizations", ln=True)
    pdf.set_font("Arial", "", 12)

    for title, b64_img in visualizations.items():
        image_data = base64.b64decode(b64_img)
        image = Image.open(BytesIO(image_data))
        img_path = f"{title}.png"
        image.save(img_path)
        pdf.image(img_path, x=10, w=180)
        pdf.ln(5)

    output_path = "data_analysis_report.pdf"
    pdf.output(output_path)

    return {
        "message": "Report generated successfully!",
        "report_path": output_path
    }


In [24]:
summary_text = """The provided summary statistics suggest a well-structured dataset..."""
statistical_insights = """There is a strong positive correlation between tip and total bill (0.67)..."""

# If you already ran the visualization node, use its output dictionary
# For testing, you can create a dummy one like this:
visualizations = create_visualizations_node(df)["visualizations"]



In [25]:
result = report_generator_node(summary_text, statistical_insights, visualizations)
print(result)

{'message': 'Report generated successfully!', 'report_path': 'data_analysis_report.pdf'}
