<a href="https://colab.research.google.com/github/mdey26/DataSense_AI_Agent/blob/main/DataSense_AI_Agent_Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, sys

# If the notebook is run from a repo clone in /content or from Drive, set REPO_ROOT appropriately.
# Users can override REPO_ROOT env var if needed.
repo_root = os.getenv("REPO_ROOT", os.path.abspath("."))

# If running in Colab and the repo is cloned to /content/DataSense_AI_Agent,
# users can set REPO_ROOT environment to that path before running the notebook:
# %env REPO_ROOT=/content/DataSense_AI_Agent

# Add repo root to sys.path so "from services..." imports work
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

print("Using repo_root:", repo_root)
# Create a portable OUTPUT_DIR for artifacts
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "/tmp/datasense_outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("OUTPUT_DIR:", OUTPUT_DIR)


Sometimes if you get the error while running the One-cell then just Cell 3 and
then restart the session.                                                  
(Runtime -> Restart Session or Ctrl + M )


In [None]:
# üîµ ONE-CELL DEMO with interactive upload (Colab)
from IPython.display import FileLink, display
import time, os, sys
from google.colab import files   # works only in Colab

# --- relative import for repo usage ---
# Explicitly set repo_root to the expected path of the DataSense_AI_Agent repository
repo_root = "/content/DataSense_AI_Agent" # Changed line
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from services.session_service import SessionService
from utils.logger import AgentLogger
from services.memory_bank import MemoryBank
from agents.ingest_agent import IngestAgent
from agents.analysis_agent import AnalysisAgent
from agents.anomaly_agent import AnomalyAgent
from agents.reporter_agent import ReporterAgent
from config import Config

# Ensure OUTPUT_DIR exists (if not defined earlier)
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "/tmp/datasense_outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("üöÄ Starting ONE-CELL FULL PIPELINE DEMO (Upload or fallback to sample)\n")

# Initialize services
logger = AgentLogger(Config.LOGS_PATH)
session_service = SessionService()
memory = MemoryBank()

print(f"Session created: {session_service.session_id}")
print("Memory Bank initialized.\n")

# --- Upload prompt ---
print("üîΩ Upload a CSV file now (or cancel to use bundled sample):")
uploaded = files.upload()   # opens file picker in Colab

if uploaded:
    # take the first uploaded file
    uploaded_name = list(uploaded.keys())[0]
    saved_path = os.path.join(OUTPUT_DIR, uploaded_name)
    # write bytes to OUTPUT_DIR
    with open(saved_path, "wb") as f:
        f.write(uploaded[uploaded_name])
    sample_file_path = saved_path
    print(f"Uploaded file saved to: {sample_file_path}")
else:
    # fallback to sample included in the repo
    # This path assumes Config.BASE_PATH is correctly defined within the DataSense_AI_Agent project.
    sample_file_path = f"{Config.BASE_PATH}/sample_data/sales.csv"
    print("No file uploaded. Falling back to bundled sample:", sample_file_path)

# Validate path
if not os.path.exists(sample_file_path):
    raise FileNotFoundError(f"Dataset path not found: {sample_file_path}")

# --- Pipeline Steps ---
print("\n============================================================")
print("Step 1: Running Ingest Agent...")
print("============================================================")
start_ingest = time.time()
ingest_agent = IngestAgent(session_service, logger)
# Try to use ingest_csv(path). If your IngestAgent expects a DataFrame, read and call ingest_dataframe.
try:
    profile = ingest_agent.ingest_csv(sample_file_path)
except AttributeError:
    # fallback: read as pandas and call ingest_dataframe if available
    import pandas as pd
    df_local = pd.read_csv(sample_file_path)
    if hasattr(ingest_agent, "ingest_dataframe"):
        profile = ingest_agent.ingest_dataframe(df_local)
    else:
        # as last resort, store df in session and continue
        session_service.store(session_service.session_id, "df", df_local)
        profile = {"filename": os.path.basename(sample_file_path), "quality_metrics": {"completeness_score": None, "overall_quality": None}}
        print("Warning: IngestAgent.ingest_csv not available; stored DataFrame directly in session.")

df = ingest_agent.get_dataframe() if hasattr(ingest_agent, "get_dataframe") else session_service.get(session_service.session_id, "df")
ingest_time = time.time() - start_ingest
print(f"Ingestion done ({ingest_time:.2f}s). Dataset: {profile.get('filename')} ({df.shape[0]} rows)")

print("\n============================================================")
print("Step 2: Running Analysis Agent...")
print("============================================================")
start_analysis = time.time()
analysis_agent_instance = AnalysisAgent(session_service, logger)
analysis_res = analysis_agent_instance.analyze(df)
analysis_time = time.time() - start_analysis
print(f"Analysis done ({analysis_time:.2f}s). Insights: {len(analysis_res.get('insights', []))}, Visualizations: {len(analysis_res.get('visualizations', []))}")

print("\n============================================================")
print("Step 3: Running Anomaly Agent...")
print("============================================================")
start_anomaly = time.time()
anomaly_agent_instance = AnomalyAgent(session_service, logger)
anomaly_res = anomaly_agent_instance.detect_anomalies(df)
anomaly_time = time.time() - start_anomaly
print(f"Anomaly detection done ({anomaly_time:.2f}s). Total Anomalies: {anomaly_res.get('total_anomalies', 0)}")

# Store analysis in memory
analysis_summary = {
    'shape': df.shape,
    'columns': df.columns.tolist(),
    'anomalies_count': anomaly_res.get('total_anomalies', 0),
    'quality_score': profile.get('quality_metrics', {}).get('completeness_score'),
    'key_findings': [
        f"Detected {anomaly_res.get('total_anomalies', 0)} anomalies",
        f"Quality: {profile.get('quality_metrics', {}).get('overall_quality')}",
        f"Generated {len(analysis_res.get('visualizations', []))} visualizations"
    ]
}
memory_id = memory.store_analysis(profile.get('filename'), analysis_summary)
print(f"Analysis stored in Memory Bank with ID: {memory_id}")

print("\n============================================================")
print("Step 4: Compiling Report...")
print("============================================================")
start_report = time.time()
reporter_agent = ReporterAgent(session_service, logger)
report = reporter_agent.generate_report()
report_time = time.time() - start_report
print(f"Report generated ({report_time:.2f}s). File: {report.get('report_filename')}")

# shows download link for convenience
if report.get('report_path') and os.path.exists(report.get('report_path')):
    print(f"\n‚úÖ Full Pipeline Demo Complete! Report available at: {report.get('report_path')}")
    display(FileLink(report.get("report_path")))
else:
    # shows files in OUTPUT_DIR to help debugging
    print("\n‚ùå Report generation failed or report not found. Listing OUTPUT_DIR for debugging:")
    print(os.listdir(OUTPUT_DIR))

print("\n============================================================")
print("TOTAL PIPELINE DURATION: ", round(ingest_time + analysis_time + anomaly_time + report_time, 2), "s")
print("============================================================")


In [15]:
# üîµ ONE-CELL DEMO (robust & portable)
from IPython.display import FileLink, display
import time, os, sys, traceback

# try to import google.colab.files only in Colab
IN_COLAB = False
try:
    from google.colab import files as colab_files
    IN_COLAB = True
except Exception:
    IN_COLAB = False

# Determine repo_root: prefer cloned path, fallback to cwd
repo_root_candidates = [
    "/content/DataSense_AI_Agent",
    os.getcwd()
]
repo_root = None
for p in repo_root_candidates:
    if os.path.isdir(p):
        repo_root = p
        break
if repo_root is None:
    repo_root = os.getcwd()

if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

print("Using repo_root:", repo_root)
print("Python sys.path head:", sys.path[:3])

# Try imports and provide helpful errors if missing
missing = []
try:
    from services.session_service import SessionService
except Exception as e:
    missing.append("services.session_service")

try:
    from utils.logger import AgentLogger
except Exception as e:
    missing.append("utils.logger")

try:
    from services.memory_bank import MemoryBank
except Exception as e:
    missing.append("services.memory_bank")

try:
    from agents.ingest_agent import IngestAgent
except Exception as e:
    missing.append("agents.ingest_agent")

try:
    from agents.analysis_agent import AnalysisAgent
except Exception as e:
    missing.append("agents.analysis_agent")

try:
    from agents.anomaly_agent import AnomalyAgent
except Exception as e:
    missing.append("agents.anomaly_agent")

try:
    from agents.reporter_agent import ReporterAgent
except Exception as e:
    missing.append("agents.reporter_agent")

try:
    from config import Config
except Exception as e:
    missing.append("config (Config)")

if missing:
    print("ERROR: The following modules are missing or failed to import:")
    for m in missing:
        print(" -", m)
    print("\nMake sure the repository is in repo_root and those files exist. Aborting demo.")
    raise ImportError("Missing modules: " + ", ".join(missing))

# Prepare OUTPUT_DIR
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "/tmp/datasense_outputs")
OUTPUT_DIR = os.path.abspath(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("OUTPUT_DIR:", OUTPUT_DIR)

print("\nüöÄ Starting ONE-CELL FULL PIPELINE DEMO (Upload or fallback to sample)\n")

# Initialize services
logger = AgentLogger(getattr(Config, "LOGS_PATH", OUTPUT_DIR))
session_service = SessionService()
memory = MemoryBank()

print(f"Session created: {getattr(session_service, 'session_id', 'unknown-session')}")
print("Memory Bank initialized.\n")

# --- Upload prompt (Colab only) ---
sample_file_path = None
if IN_COLAB:
    print("üîΩ Upload a CSV file now (or cancel to use bundled sample):")
    uploaded = colab_files.upload()
    if uploaded:
        uploaded_name = list(uploaded.keys())[0]
        saved_path = os.path.join(OUTPUT_DIR, uploaded_name)
        with open(saved_path, "wb") as f:
            f.write(uploaded[uploaded_name])
        sample_file_path = saved_path
        print(f"Uploaded file saved to: {sample_file_path}")
    else:
        print("No file uploaded in Colab upload dialog.")
else:
    print("Not running in Colab ‚Äî skipping interactive upload. Will use bundled sample if available.")

# Fallback to sample in repo
if not sample_file_path:
    sample_file_path = os.path.join(repo_root, getattr(Config, "BASE_PATH", "sample_data"), "sales.csv")
    print("Using fallback sample path:", sample_file_path)

# Validate dataset path
if not os.path.exists(sample_file_path):
    print("\nERROR: Dataset path not found:", sample_file_path)
    print("Please upload a CSV or ensure sample_data/sales.csv exists in repo.")
    raise FileNotFoundError(sample_file_path)

# --- Pipeline Steps with robust error handling ---
def safe_call(step_name, fn, *args, **kwargs):
    print(f"\n== {step_name} ==")
    start = time.time()
    try:
        res = fn(*args, **kwargs)
        elapsed = time.time() - start
        print(f"{step_name} completed ({elapsed:.2f}s).")
        return res, elapsed
    except Exception as e:
        print(f"ERROR during {step_name}: {e}")
        traceback.print_exc()
        raise

# Step 1: Ingest
ingest_agent = IngestAgent(session_service, logger)
try:
    profile = None
    if hasattr(ingest_agent, "ingest_csv"):
        profile, t_ing = safe_call("Ingest Agent (ingest_csv)", ingest_agent.ingest_csv, sample_file_path)
        # some implementations return just profile, not (profile, time)
        if isinstance(profile, tuple) and len(profile) == 2 and isinstance(profile[1], (int,float)):
            profile = profile[0]
    else:
        # fallback to reading DataFrame and calling ingest_dataframe / storing DF
        import pandas as pd
        df_local = pd.read_csv(sample_file_path)
        if hasattr(ingest_agent, "ingest_dataframe"):
            profile, t_ing = safe_call("Ingest Agent (ingest_dataframe)", ingest_agent.ingest_dataframe, df_local)
        else:
            session_service.store(getattr(session_service,'session_id'), "df", df_local)
            profile = {"filename": os.path.basename(sample_file_path), "quality_metrics": {"completeness_score": None, "overall_quality": None}}
            t_ing = 0.0
            print("Warning: ingest_agent has no ingest_csv/ingest_dataframe; stored DataFrame in session.")
except Exception as e:
    raise RuntimeError("Ingest failed; fix ingest_agent or dataset.") from e

# Get DataFrame
if hasattr(ingest_agent, "get_dataframe"):
    df = ingest_agent.get_dataframe()
else:
    df = session_service.get(getattr(session_service,'session_id'), "df")
if df is None:
    raise RuntimeError("Could not obtain DataFrame from ingest step. Check ingest_agent implementation.")

print(f"Ingestion result: file={profile.get('filename')}, rows={getattr(df, 'shape', ('?', '?'))[0]}")

# Step 2: Analysis
analysis_agent_instance = AnalysisAgent(session_service, logger)
analysis_res, t_analysis = safe_call("Analysis Agent (analyze)", analysis_agent_instance.analyze, df)
insights = analysis_res.get('insights', [])
visuals = analysis_res.get('visualizations', [])

print(f"Insights count: {len(insights)}, Visualizations count: {len(visuals)}")

# Step 3: Anomaly detection
anomaly_agent_instance = AnomalyAgent(session_service, logger)
anomaly_res, t_anom = safe_call("Anomaly Agent (detect_anomalies)", anomaly_agent_instance.detect_anomalies, df)
total_anomalies = anomaly_res.get('total_anomalies', 0)
print(f"Detected anomalies: {total_anomalies}")

# Step 4: Store in Memory
analysis_summary = {
    'shape': getattr(df, "shape", None),
    'columns': list(getattr(df, "columns", [])),
    'anomalies_count': total_anomalies,
    'quality_score': profile.get('quality_metrics', {}).get('completeness_score'),
    'key_findings': [
        f"Detected {total_anomalies} anomalies",
        f"Quality: {profile.get('quality_metrics', {}).get('overall_quality')}",
        f"Generated {len(visuals)} visualizations"
    ]
}
if hasattr(memory, "store_analysis"):
    memory_id = memory.store_analysis(profile.get('filename'), analysis_summary)
    print(f"Analysis stored in Memory Bank with ID: {memory_id}")
else:
    print("MemoryBank.store_analysis not found. Skipping memory storage.")
    memory_id = None

# Step 5: Report
reporter_agent = ReporterAgent(session_service, logger)
report, t_report = None, 0.0
if hasattr(reporter_agent, "generate_report"):
    report, t_report = safe_call("Reporter Agent (generate_report)", reporter_agent.generate_report)
else:
    print("ReporterAgent.generate_report not implemented. Skipping report generation.")

# Show report link if available
if isinstance(report, dict):
    report_path = report.get('report_path') or report.get('report_filename')
    if report_path and os.path.exists(report_path):
        print(f"\n‚úÖ Full Pipeline Demo Complete! Report available at: {report_path}")
        display(FileLink(report_path))
    else:
        # try OUTPUT_DIR listing
        print("\n‚ùå Report path not found in report dict or file missing. OUTPUT_DIR listing for debugging:")
        print(os.listdir(OUTPUT_DIR))
else:
    print("\n‚ùå Reporter did not return a report dict. Reporter output:", report)

total_time = round((t_ing if 't_ing' in locals() else 0) + (t_analysis if 't_analysis' in locals() else 0) + (t_anom if 't_anom' in locals() else 0) + (t_report if 't_report' in locals() else 0), 2)
print("\n============================================================")
print("TOTAL PIPELINE DURATION: ", total_time, "s")
print("============================================================")


Using repo_root: /content
Python sys.path head: ['/content/drive/MyDrive/DataSense_AI_Agent', '/content/DataSense_AI_Agent', '/content']
OUTPUT_DIR: /tmp/datasense_outputs

üöÄ Starting ONE-CELL FULL PIPELINE DEMO (Upload or fallback to sample)

üìå Session 20251203_150940 created
üìö Memory Bank initialized: /content/drive/MyDrive/DataSense_AI_Agent/outputs/memory
Session created: 20251203_150940
Memory Bank initialized.

üîΩ Upload a CSV file now (or cancel to use bundled sample):


  No API_KEY or ADC found. Please either:
    - Set the `GOOGLE_API_KEY` environment variable.
    - Manually pass the key with `genai.configure(api_key=my_api_key)`.
    - Or set up Application Default Credentials, see https://ai.google.dev/gemini-api/docs/oauth for more information.


Saving synthetic_dataset.csv to synthetic_dataset.csv
Uploaded file saved to: /tmp/datasense_outputs/synthetic_dataset.csv
ü§ñ IngestAgent state updated

== Ingest Agent (ingest_csv) ==
ü§ñ IngestAgent state updated

ü§ñ INGEST AGENT STARTING
üìÇ Loading: /tmp/datasense_outputs/synthetic_dataset.csv
‚úì Loaded 4,362 rows √ó 5 columns

üìä Creating dataset profile...
  ‚Ä¢ Analyzing structure...
  ‚Ä¢ Analyzing missing values...
  ‚Ä¢ Analyzing numeric columns...
  ‚Ä¢ Analyzing categorical columns...
  ‚Ä¢ Detecting date columns...
  ‚Ä¢ Creating sample data...
  ‚Ä¢ Computing quality metrics...
üìä Dataset info stored: synthetic_dataset.csv
‚úì Profile created and stored in session

ü§ñ Getting AI insights...
ü§ñ IngestAgent state updated
‚úì Completed in 0.02s

Ingest Agent (ingest_csv) completed (0.02s).
Ingestion result: file=synthetic_dataset.csv, rows=4362
ü§ñ AnalysisAgent state updated

== Analysis Agent (analyze) ==
ü§ñ AnalysisAgent state updated
ü§ñ AnalysisAgent 


TOTAL PIPELINE DURATION:  2.78 s


In [None]:
# Cell 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Cell 2: Setup API Key (SECURE)
from google.colab import userdata

# Now code can use it:
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
print("‚úÖ API Key loaded securely!")

In [None]:
# Cell 3: Install Required Packages

!pip install -q google-generativeai  # Google's Gemini AI
!pip install -q langgraph            # For multi-agent orchestration
!pip install -q pandas               # For data manipulation
!pip install -q matplotlib           # For creating charts
!pip install -q seaborn              # For beautiful charts
!pip install -q fpdf                 # For creating PDF reports
!pip install -q scikit-learn         # For data analysis tools
!pip install -q scipy                # For scientific computing

print("‚úÖ All packages installed!")


In [None]:
# Cell 5: Initialize Gemini
import google.generativeai as genai
from google.colab import userdata

# Load API key from secrets
API_KEY = userdata.get('GEMINI_API_KEY')

# Configure the AI
genai.configure(api_key=API_KEY)

# Test it works
model = genai.GenerativeModel('gemini-2.5-flash')
response = model.generate_content("Say 'Hello! I'm ready to help analyze data!'")
print(response.text)


In [None]:
# Cell 6: Create Sample Sales Dataset
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')
from config import Config

# Set random seed for reproducibility (same data every time)
np.random.seed(42)

# Create date range (Jan 1 to Nov 25, 2024)
dates = pd.date_range(start='2024-01-01', end='2024-11-25', freq='D')
num_days = len(dates)

print(f"Creating {num_days} days of sales data...")

# Create the dataset
sales_data = pd.DataFrame({
    # Date column
    'date': dates,

    # Product IDs (5 different products)
    'product_id': np.random.choice(
        ['P001', 'P002', 'P003', 'P004', 'P005'],
        num_days
    ),

    # Sales amount (normally distributed around $5000)
    # np.random.normal(mean, std_deviation, count)
    'sales_amount': np.random.normal(5000, 1500, num_days),

    # Quantity sold (Poisson distribution, average 50 units)
    'quantity_sold': np.random.poisson(50, num_days),

    # Region (4 regions)
    'region': np.random.choice(
        ['North', 'South', 'East', 'West'],
        num_days
    ),

    # Customer segment
    'customer_segment': np.random.choice(
        ['Retail', 'Wholesale', 'Online'],
        num_days
    )
})

# ‚ñÑ Add realistic anomalies (this is what we'll detect!)
print("Adding anomalies...")

# Pick 15 random days to be anomalies
anomaly_indices = np.random.choice(num_days, 15, replace=False)

# Create spike anomalies (sudden sales increase)
spike_indices = anomaly_indices[:10]
sales_data.loc[spike_indices, 'sales_amount'] *= 3
print(f"  ‚ÜóÔ∏è  Added {len(spike_indices)} spike anomalies (3x normal sales)")

# Create drop anomalies (sudden sales decrease)
drop_indices = anomaly_indices[10:]
sales_data.loc[drop_indices, 'sales_amount'] *= 0.1
print(f"  ‚ÜòÔ∏è  Added {len(drop_indices)} drop anomalies (10% normal sales)")

# Add missing values (realistic data quality issues)
missing_indices = np.random.choice(num_days, 10, replace=False)
sales_data.loc[missing_indices, 'customer_segment'] = np.nan
print(f"  ‚ùì Added {len(missing_indices)} missing values")

# Save to Google Drive
output_path = f'{Config.BASE_PATH}/sample_data/sales.csv'
sales_data.to_csv(output_path, index=False)

print(f"\n‚úÖ Created: sales.csv")
print(f"   Shape: {sales_data.shape[0]} rows √ó {sales_data.shape[1]} columns")
print(f"   Saved to: {output_path}")

# Show preview
print("\nüìä Preview (first 5 rows):")
print(sales_data.head())


In [None]:
# Cell 7: Create Transaction Dataset
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')
from config import Config

print("Creating transaction dataset...")

transactions = pd.DataFrame({
    'transaction_id': [f'T{i:05d}' for i in range(1, 1001)],  # T00001, T00002, ...
    'timestamp': pd.date_range(start='2024-01-01', periods=1000, freq='h'),
    'user_id': np.random.randint(1000, 5000, 1000),
    'amount': np.random.exponential(100, 1000),  # Most small, few large
    'payment_method': np.random.choice(
        ['Credit', 'Debit', 'UPI', 'Cash'],
        1000
    ),
    'status': np.random.choice(
        ['Success', 'Failed', 'Pending'],
        1000,
        p=[0.9, 0.08, 0.02]  # 90% success, 8% failed, 2% pending
    )
})

transactions.to_csv(f'{Config.BASE_PATH}/sample_data/transactions.csv', index=False)
print(f"‚úÖ Created: transactions.csv ({transactions.shape[0]} rows)")

In [None]:
# Cell 8: Create Customer Dataset
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')
from config import Config
import pandas as pd
import numpy as np

print("Creating customer dataset...")

customers = pd.DataFrame({
    'customer_id': range(1, 501),
    'age': np.random.randint(18, 75, 500),
    'income': np.random.lognormal(10.5, 0.5, 500),  # Income distribution
    'credit_score': np.random.randint(300, 850, 500),
    'tenure_months': np.random.randint(1, 120, 500),
    'churn': np.random.choice([0, 1], 500, p=[0.85, 0.15])  # 15% churned
})

customers.to_csv(f'{Config.BASE_PATH}/sample_data/customer_data.csv', index=False)
print(f"‚úÖ Created: customer_data.csv ({customers.shape[0]} rows)")

print("\nüéâ All sample datasets created!")

In [None]:
# Cell 9: Test Config
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')

from config import Config

print("Testing configuration...")
print(f"‚úì Base path: {Config.BASE_PATH}")
print(f"‚úì Model: {Config.GEMINI_MODEL}")
print(f"‚úì Anomaly threshold: {Config.ANOMALY_THRESHOLD}")
print("\n‚úÖ Config works!")


In [None]:
# Cell 10: Test Logger
from utils.logger import AgentLogger
from config import Config

# Create logger
logger = AgentLogger(Config.LOGS_PATH)

# Simulate agent activity
logger.log_agent_start('TestAgent', {'test': 'data'})
import time
time.sleep(1)  # Simulate work
logger.log_agent_end('TestAgent', {'result': 'success'}, 1.0)

# Simulate tool call
logger.log_tool_call('TestTool', {'param': 'value'}, 'Tool output')

# Print summary
logger.print_summary()


In [None]:
# Cell 11: Test Session Service

# --- relative import for repo usage ---
import os, sys
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService


# Create a session
session = SessionService()

# Simulate Ingest Agent storing dataset info
session.store_dataset_info({
    'filename': 'sales.csv',
    'shape': (330, 6),
    'missing_percentage': {'date': 0, 'sales_amount': 0.5}
})

# Simulate Anomaly Agent finding issues
session.add_anomaly({
    'type': 'spike',
    'column': 'sales_amount',
    'row': 42,
    'value': 15000,
    'severity': 'high'
})

session.add_anomaly({
    'type': 'outlier',
    'column': 'quantity_sold',
    'row': 105,
    'value': 500,
    'severity': 'medium'
})

# Simulate Analysis Agent storing insights
session.add_insight(
    'North region has 2.5x higher average sales',
    'regional_analysis'
)

# Print summary
print("\n" + "="*60)
print("SESSION SUMMARY")
print("="*60)
summary = session.get_session_summary()
for key, value in summary.items():
    print(f"  {key}: {value}")


In [None]:
# Cell 12: Test Agent Communication Protocol
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')

from agents import AgentRole, AgentMessage, BaseAgent

# Create a test message
message = AgentMessage(
    sender=AgentRole.INGEST,
    receiver=AgentRole.ANALYSIS,
    message_type="request",
    payload={
        'dataset_name': 'sales.csv',
        'rows': 330,
        'columns': ['date', 'sales', 'region']
    },
    timestamp="2024-11-27T01:23:45",
    session_id="20241127_012345"
)

# Show the message
print("Message created:")
print(message.to_json())

# Convert back from dict
message_dict = message.to_dict()
reconstructed = AgentMessage.from_dict(message_dict)

print("\n‚úÖ Message conversion successful!")
print(f"Sender: {reconstructed.sender.value}")
print(f"Receiver: {reconstructed.receiver.value}")
print(f"Payload: {reconstructed.payload}")


In [None]:
# Cell 13: Test Complete Pipeline

# --- relative import for repo usage ---
# Ensure the 'services' folder is placed in the same folder as this notebook in the repo.
import os, sys
# Add the repo root (not Drive) so Python can import 'services'
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService

from config import Config
from utils.logger import AgentLogger
from agents.ingest_agent import IngestAgent

print("üöÄ Starting Integration Test\n")

# ===== Step 1: Initialize Services =====
print("Step 1: Initializing services...")
logger = AgentLogger(Config.LOGS_PATH)
session = SessionService()
print("‚úì Logger and Session created\n")

# ===== Step 2: Create Ingest Agent =====
print("Step 2: Creating Ingest Agent...")
ingest_agent = IngestAgent(session, logger)
print("‚úì Ingest Agent created\n")

# ===== Step 3: Ingest Dataset =====
print("Step 3: Ingesting dataset...")
profile = ingest_agent.ingest_csv(
    f'{Config.BASE_PATH}/sample_data/sales.csv'
)
print("‚úì Dataset ingested\n")

# ===== Step 4: Display Results =====
print("="*60)
print("INGEST AGENT RESULTS")
print("="*60)

print(f"\nüìä Dataset: {profile['filename']}")
print(f"üìè Shape: {profile['shape'][0]:,} rows √ó {profile['shape'][1]} columns")
print(f"üíæ Memory: {profile['memory_usage_mb']} MB")

print(f"\n‚úÖ Quality: {profile['quality_metrics']['overall_quality']}")
print(f"üìà Completeness: {profile['quality_metrics']['completeness_score']}%")
print(f"üîÑ Duplicates: {profile['quality_metrics']['duplicate_rows']}")

print(f"\nüìã Columns: {', '.join(profile['columns'])}")
print(f"üìÖ Date Columns: {', '.join(profile['date_columns'])}")

print(f"\nü§ñ AI Analysis:")
print(f"   Type: {profile['llm_insights']['dataset_type']}")
print(f"   Recommended: {', '.join(profile['llm_insights']['recommended_analysis'][:2])}")

if profile['llm_insights']['potential_issues']:
    print(f"   Issues: {', '.join(profile['llm_insights']['potential_issues'][:2])}")

# ===== Step 5: Show Session State =====
print("\n" + "="*60)
print("SESSION STATE")
print("="*60)

summary = session.get_session_summary()
print(f"\n Session ID: {summary['session_id']}")
print(f"   Duration: {summary['duration_seconds']:.2f}s")
print(f"   Dataset: {summary['dataset_name']}")
print(f"   Agents: {', '.join(summary['agents_executed'])}")

# ===== Step 6: Show Metrics =====
print("\n" + "="*60)
print("EXECUTION METRICS")
print("="*60)

metrics = logger.get_metrics_summary()
logger.print_summary()

print("\n‚úÖ Integration Test Complete!")


In [None]:
# Cell 14: Test Code Executor
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')

import pandas as pd
import numpy as np
from utils.logger import AgentLogger
from tools.code_executor import CodeExecutor
from config import Config

# Create logger
logger = AgentLogger(Config.LOGS_PATH)

# Create executor
executor = CodeExecutor(logger)

# Create sample dataframe
df = pd.DataFrame({
    'Product': ['A', 'B', 'C', 'A', 'B'],
    'Sales': [1000, 1500, 2000, 1200, 1800],
    'Region': ['North', 'South', 'East', 'North', 'South']
})

# Test 1: Simple calculation
print("=" * 60)
print("TEST 1: Simple Calculation")
print("=" * 60)

code1 = """
print("Total Sales:", df['Sales'].sum())
print("Average Sales:", df['Sales'].mean())
print("Max Sales:", df['Sales'].max())

result = df.groupby('Region')['Sales'].sum()
"""

output1 = executor.run_code(code1, extra_context={'df': df})

print(f"‚úÖ Success: {output1['success']}")
print(f"Output:\n{output1['stdout']}")
if output1['result'] is not None:
    print(f"Result:\n{output1['result']}")

# Test 2: Create visualization
print("\n" + "=" * 60)
print("TEST 2: Create Visualization")
print("=" * 60)

code2 = """
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.bar(df['Product'], df['Sales'], color='steelblue')
plt.title('Sales by Product')
plt.xlabel('Product')
plt.ylabel('Sales')
plt.grid(axis='y', alpha=0.3)
"""

output2 = executor.run_code(
    code2,
    extra_context={'df': df},
    plot_filename='test_bar_chart.png'
)

print(f"‚úÖ Success: {output2['success']}")
print(f"Plot saved at: {output2['plot_path']}")

# Test 3: Error handling
print("\n" + "=" * 60)
print("TEST 3: Error Handling")
print("=" * 60)

code3 = """
# This will cause an error
result = df['NonExistentColumn'].sum()
"""

output3 = executor.run_code(code3, extra_context={'df': df})

print(f"‚ùå Success: {output3['success']}")
print(f"Error:\n{output3['error']}")

print("\n‚úÖ Code Executor tests complete!")


In [None]:
# Cell 15: Test Analysis Agent

# --- relative import for repo usage ---
# Ensure the 'services' folder is placed in the same folder as this notebook in the repo.
import os, sys
# Add the repo root (not Drive) so Python can import 'services'
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService

import pandas as pd
from utils.logger import AgentLogger
from agents.ingest_agent import IngestAgent
from agents.analysis_agent import AnalysisAgent
from config import Config

print("üöÄ Testing Analysis Agent\n")

# Initialize services
logger = AgentLogger(Config.LOGS_PATH)
session = SessionService()

# Load data with Ingest Agent
print("Step 1: Loading data...")
ingest_agent = IngestAgent(session, logger)
profile = ingest_agent.ingest_csv(f'{Config.BASE_PATH}/sample_data/sales.csv')
df = ingest_agent.get_dataframe()

print("\nStep 2: Running analysis...")
analysis_agent = AnalysisAgent(session, logger)
analysis_results = analysis_agent.analyze(df)

# Display results
print("\n" + "="*60)
print("ANALYSIS RESULTS")
print("="*60)

print(f"\nüìä Dataset: {analysis_results['dataset_shape']}")
print(f"üìà Numeric columns: {', '.join(analysis_results['numeric_columns'][:3])}")
print(f"üìù Categorical columns: {', '.join(analysis_results['categorical_columns'][:2])}")

print(f"\nüìä Visualizations created: {len(analysis_results['visualizations'])}")
for viz in analysis_results['visualizations']:
    print(f"  ‚Ä¢ {viz['type']}: {viz['description']}")

print(f"\nüí° Insights: {len(analysis_results['insights'])}")
for insight in analysis_results['insights']:
    print(f"  ‚Ä¢ [{insight['severity']}] {insight['text']}")

# Show session state
print("\n" + "="*60)
print("SESSION STATE")
print("="*60)
summary = session.get_session_summary()
print(f"\nTotal insights: {summary['total_insights']}")
print(f"Total plots: {summary['total_plots']}")
print(f"Agents executed: {', '.join(summary['agents_executed'])}")

print("\n‚úÖ Analysis Agent test complete!")


In [None]:
# Cell 16: Integration Test

# --- relative import for repo usage ---
# Ensure the 'services' folder is placed in the same folder as this notebook in the repo.
import os, sys
# Add the repo root (not Drive) so Python can import 'services'
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService


import pandas as pd
from utils.logger import AgentLogger
from agents.ingest_agent import IngestAgent
from agents.analysis_agent import AnalysisAgent
from tools.alert_tool import AlertTool, AlertSeverity
from config import Config

print("üöÄ Integration Test\n")

# ===== Initialize all services =====
print("Initializing services...")
logger = AgentLogger(Config.LOGS_PATH)
session = SessionService()
alert_tool = AlertTool(logger)

# ===== Step 1: Ingest Data =====
print("\n" + "="*60)
print("STEP 1: Data Ingestion")
print("="*60)

ingest_agent = IngestAgent(session, logger)
profile = ingest_agent.ingest_csv(f'{Config.BASE_PATH}/sample_data/sales.csv')
df = ingest_agent.get_dataframe()

alert_tool.send_alert(
    "Data Loaded Successfully",
    f"Loaded {df.shape[0]:,} rows √ó {df.shape[1]} columns",
    AlertSeverity.INFO,
    {'dataset': 'sales.csv', 'shape': df.shape}
)

# ===== Step 2: Analyze Data =====
print("\n" + "="*60)
print("STEP 2: Data Analysis")
print("="*60)

analysis_agent = AnalysisAgent(session, logger)
analysis_results = analysis_agent.analyze(df)

# Alert if quality issues found
quality = profile['quality_metrics']
if quality['overall_quality'] != 'Good':
    alert_tool.send_alert(
        "Data Quality Alert",
        f"Quality level: {quality['overall_quality']}",
        AlertSeverity.WARNING
    )

# ===== Step 3: Generate Report =====
print("\n" + "="*60)
print("FINAL REPORT")
print("="*60)

summary = session.get_session_summary()

print(f"\nüìä Session Summary:")
print(f"   ID: {summary['session_id']}")
print(f"   Duration: {summary['duration_seconds']:.2f}s")
print(f"   Dataset: {summary['dataset_name']} ({summary['dataset_shape']})")

print(f"\nüìà Analysis Performed:")
print(f"   Anomalies detected: {summary['total_anomalies']}")
print(f"   Plots generated: {summary['total_plots']}")
print(f"   Insights found: {summary['total_insights']}")

print(f"\nü§ñ Agents Executed:")
for agent in summary['agents_executed']:
    print(f"   ‚Ä¢ {agent}")

print(f"\nüîî Alerts Sent: {len(alert_tool.get_all_alerts())}")
for alert in alert_tool.get_all_alerts():
    print(f"   ‚Ä¢ [{alert['severity']}] {alert['title']}")

print(f"\n{'='*60}")
print("‚úÖ Day 2 Integration Complete!")
print(f"{'='*60}")


In [None]:
# Cell 17: Test Parallel Anomaly Detection

# --- relative import for repo usage ---
# Ensure the 'services' folder is placed in the same folder as this notebook in the repo.
import os, sys
# Add the repo root (not Drive) so Python can import 'services'
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService

import pandas as pd
import numpy as np
from utils.logger import AgentLogger
from agents.ingest_agent import IngestAgent
from agents.analysis_agent import AnalysisAgent
from agents.anomaly_agent import AnomalyAgent
from config import Config

print("üöÄ Testing Parallel Anomaly Detection\n")

# Initialize services
logger = AgentLogger(Config.LOGS_PATH)
session = SessionService()

# Load data
print("Step 1: Loading data...")
ingest_agent = IngestAgent(session, logger)
profile = ingest_agent.ingest_csv(f'{Config.BASE_PATH}/sample_data/sales.csv')
df = ingest_agent.get_dataframe()

# Run analysis
print("\nStep 2: Running analysis...")
analysis_agent = AnalysisAgent(session, logger)
analysis_results = analysis_agent.analyze(df)

# Run anomaly detection (PARALLEL)
print("\nStep 3: Detecting anomalies (PARALLEL execution)...")
anomaly_agent = AnomalyAgent(session, logger)
anomaly_results = anomaly_agent.detect_anomalies(df)

# Display results
print("\n" + "="*60)
print("ANOMALY DETECTION RESULTS")
print("="*60)

print(f"\nüéØ Total Anomalies: {anomaly_results['total_anomalies']}")

print(f"\nüìä By Detection Method:")
for method, count in anomaly_results['anomalies_by_method'].items():
    print(f"   {method}: {count}")

print(f"\n‚ö†Ô∏è  By Severity:")
for severity, count in anomaly_results['severity_breakdown'].items():
    if count > 0:
        print(f"   {severity.upper()}: {count}")

print(f"\nüìã Top 5 Anomalies:")
for i, anomaly in enumerate(anomaly_results['anomalies'][:5], 1):
    print(f"   {i}. [{anomaly['severity']}] {anomaly['type']} - {anomaly.get('description', 'N/A')}")

# Show session state
print("\n" + "="*60)
print("SESSION STATE")
print("="*60)
summary = session.get_session_summary()
print(f"\nTotal anomalies stored: {summary['total_anomalies']}")
print(f"Agents executed: {', '.join(summary['agents_executed'])}")

print("\n‚úÖ Parallel anomaly detection test complete!")


In [None]:
# Cell 18: Test Memory Bank
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')

from services.memory_bank import MemoryBank
from config import Config

print("üöÄ Testing Memory Bank\n")

# Create memory bank
memory = MemoryBank()

# Store an analysis
analysis_1 = {
    'shape': (330, 6),
    'columns': ['date', 'product_id', 'sales_amount', 'quantity_sold', 'region'],
    'anomalies_count': 25,
    'quality_score': 96.97,
    'key_findings': [
        'Strong correlation between sales and quantity',
        'North region has higher sales',
        '25 anomalies detected'
    ]
}

memory_id_1 = memory.store_analysis('sales.csv', analysis_1)
print(f"‚úì Stored analysis: {memory_id_1}")

# Simulate a second analysis
print("\nWaiting 2 seconds...")
import time
time.sleep(2)

analysis_2 = {
    'shape': (330, 6),
    'columns': ['date', 'product_id', 'sales_amount', 'quantity_sold', 'region'],
    'anomalies_count': 18,
    'quality_score': 97.5,
    'key_findings': [
        'Similar patterns to previous analysis',
        'Fewer anomalies this time',
        'Improved data quality'
    ]
}

memory_id_2 = memory.store_analysis('sales.csv', analysis_2)
print(f"‚úì Stored analysis: {memory_id_2}")

# Find similar analyses
print("\n" + "="*60)
print("MEMORY BANK SEARCH")
print("="*60)

similar = memory.find_similar_analyses('sales.csv', max_results=5)
print(f"\nFound {len(similar)} previous analyses of sales.csv")
for i, mem in enumerate(similar, 1):
    print(f"  {i}. {mem['memory_id']} ({mem['created_at']})")

# Compare
print("\n" + "="*60)
print("ANALYSIS COMPARISON")
print("="*60)

comparison = memory.get_comparison_summary(analysis_2, 'sales.csv')
print(f"\nCurrent anomalies: {comparison['current_anomalies']}")
print(f"Previous analyses: {comparison['previous_analyses']}")

for comp in comparison['comparisons']:
    print(f"\nPrevious ({comp['date']}):")
    print(f"  Anomalies: {comp['previous_anomalies']}")
    print(f"  Change: {comp['change']:+d} ({comp['trend']})")

print("\n‚úÖ Memory Bank test complete!")


In [None]:
# Cell 19: Integration Test

# --- relative import for repo usage ---
# Ensure the 'services' folder is placed in the same folder as this notebook in the repo.
import os, sys
# Add the repo root (not Drive) so Python can import 'services'
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService

import pandas as pd
from utils.logger import AgentLogger
from services.memory_bank import MemoryBank
from agents.ingest_agent import IngestAgent
from agents.analysis_agent import AnalysisAgent
from agents.anomaly_agent import AnomalyAgent
from tools.alert_tool import AlertTool, AlertSeverity
from config import Config

print("üöÄ Integration Test\n")

# Initialize all services
logger = AgentLogger(Config.LOGS_PATH)
session = SessionService()
memory = MemoryBank()
alert_tool = AlertTool(logger)

# ===== PHASE 1: Data Ingestion =====
print("="*60)
print("PHASE 1: Data Ingestion")
print("="*60)

ingest_agent = IngestAgent(session, logger)
profile = ingest_agent.ingest_csv(f'{Config.BASE_PATH}/sample_data/sales.csv')
df = ingest_agent.get_dataframe()

alert_tool.send_alert(
    "Dataset Loaded",
    f"{df.shape[0]} rows √ó {df.shape[1]} columns",
    AlertSeverity.INFO
)

# ===== PHASE 2: Analysis =====
print("\n" + "="*60)
print("PHASE 2: Data Analysis")
print("="*60)

analysis_agent = AnalysisAgent(session, logger)
analysis_results = analysis_agent.analyze(df)

# ===== PHASE 3: Anomaly Detection (PARALLEL) =====
print("\n" + "="*60)
print("PHASE 3: Anomaly Detection (PARALLEL)")
print("="*60)

anomaly_agent = AnomalyAgent(session, logger)
anomaly_results = anomaly_agent.detect_anomalies(df)

# Send alerts for high-severity anomalies
high_severity = [a for a in anomaly_results['anomalies']
                 if a.get('severity') in ['critical', 'high']]

if high_severity:
    alert_tool.send_alert(
        f"High-Severity Anomalies Detected",
        f"Found {len(high_severity)} critical/high anomalies",
        AlertSeverity.WARNING,
        {'count': len(high_severity), 'sample': high_severity[0]}
    )

# ===== PHASE 4: Memory Storage =====
print("\n" + "="*60)
print("PHASE 4: Storing Analysis in Memory")
print("="*60)

analysis_summary = {
    'shape': df.shape,
    'columns': df.columns.tolist(),
    'anomalies_count': len(anomaly_results['anomalies']),
    'quality_score': profile['quality_metrics']['completeness_score'],
    'key_findings': [
        f"Detected {len(anomaly_results['anomalies'])} anomalies",
        f"Quality: {profile['quality_metrics']['overall_quality']}",
        f"Generated {len(analysis_results['visualizations'])} visualizations"
    ]
}

memory_id = memory.store_analysis('sales.csv', analysis_summary)
print(f"‚úì Analysis stored in memory: {memory_id}")

# ===== FINAL REPORT =====
print("\n" + "="*60)
print("FINAL REPORT - DAY 3 COMPLETE")
print("="*60)

summary = session.get_session_summary()

print(f"\nüìä Session: {summary['session_id']}")
print(f"   Duration: {summary['duration_seconds']:.2f}s")
print(f"   Dataset: {summary['dataset_name']} {summary['dataset_shape']}")

print(f"\nüìà Analysis Results:")
print(f"   Insights: {summary['total_insights']}")
print(f"   Plots: {summary['total_plots']}")

print(f"\n‚ö†Ô∏è  Anomaly Detection:")
print(f"   Total: {anomaly_results['total_anomalies']}")
for severity, count in anomaly_results['severity_breakdown'].items():
    if count > 0:
        print(f"   {severity.upper()}: {count}")

print(f"\nü§ñ Agents Executed:")
for agent in summary['agents_executed']:
    print(f"   ‚úì {agent}")

print(f"\nüìé Alerts Sent: {len(alert_tool.get_all_alerts())}")

print(f"\nüíæ Memory Bank:")
print(f"   Total memories: {len(memory.get_all_memories())}")
print(f"   Latest: {memory_id}")

metrics = logger.get_metrics_summary()
print(f"\nüìä Performance Metrics:")
print(f"   Total agent calls: {metrics['total_agent_calls']}")
print(f"   Total tool calls: {metrics['total_tool_calls']}")
print(f"   Execution time: {sum(metrics['avg_execution_times'].values()):.2f}s")

print(f"\n{'='*60}")
print("‚úÖ Integration Complete!")
print("‚úÖ Parallel execution working!")
print("‚úÖ Memory Bank storing analyses!")
print(f"{'='*60}")


In [None]:
# Cell 20: Test Reporter Agent

# --- relative import for repo usage ---
# Ensure the 'services' folder is placed in the same folder as this notebook in the repo.
import os, sys
# Add the repo root (not Drive) so Python can import 'services'
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService

import pandas as pd
from utils.logger import AgentLogger
from agents.ingest_agent import IngestAgent
from agents.analysis_agent import AnalysisAgent
from agents.anomaly_agent import AnomalyAgent
from agents.reporter_agent import ReporterAgent
from config import Config

print("üöÄ Testing Reporter Agent\n")

# Initialize services
logger = AgentLogger(Config.LOGS_PATH)
session = SessionService()

# ===== Step 1: Ingest =====
print("Step 1: Loading data...")
ingest_agent = IngestAgent(session, logger)
profile = ingest_agent.ingest_csv(f'{Config.BASE_PATH}/sample_data/sales.csv')
df = ingest_agent.get_dataframe()

# ===== Step 2: Analyze =====
print("\nStep 2: Running analysis...")
analysis_agent = AnalysisAgent(session, logger)
analysis_results = analysis_agent.analyze(df)

# ===== Step 3: Detect Anomalies =====
print("\nStep 3: Detecting anomalies...")
anomaly_agent = AnomalyAgent(session, logger)
anomaly_results = anomaly_agent.detect_anomalies(df)

# ===== Step 4: Generate Report =====
print("\nStep 4: Generating PDF report...")
reporter_agent = ReporterAgent(session, logger)
report_result = reporter_agent.generate_report()

# ===== Display Results =====
print("\n" + "="*60)
print("REPORT GENERATION RESULTS")
print("="*60)

print(f"\n‚úÖ Report Generated Successfully!")
print(f"\nüìÑ Report Details:")
print(f"   Filename: {report_result['report_filename']}")
print(f"   Path: {report_result['report_path']}")
print(f"   Pages: {report_result['pages']}")
print(f"   Generated: {report_result['generated_at']}")

print(f"\nüìä Report Contents:")
print(f"   Dataset: {report_result['dataset']}")
print(f"   Anomalies: {report_result['total_anomalies']}")
print(f"   Insights: {report_result['total_insights']}")
print(f"   Visualizations: {report_result['total_plots']}")

print(f"\n‚úÖ PDF Report successfully created!")
print(f"{'='*60}")

# Check if file exists
import os
if os.path.exists(report_result['report_path']):
    file_size = os.path.getsize(report_result['report_path']) / 1024
    print(f"\nüì¶ File size: {file_size:.2f} KB")
    print(f"‚úì Report is ready for download!")
else:
    print(f"\n‚ö†Ô∏è  Report file not found at: {report_result['report_path']}")


In [None]:
# Cell 21: Integration Test - Full Pipeline

# --- relative import for repo usage ---
# Ensure the 'services' folder is placed in the same folder as this notebook in the repo.
import os, sys
# Add the repo root (not Drive) so Python can import 'services'
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
# Now import from services as usual
from services.session_service import SessionService

import pandas as pd
from utils.logger import AgentLogger
from services.memory_bank import MemoryBank
from agents.ingest_agent import IngestAgent
from agents.analysis_agent import AnalysisAgent
from agents.anomaly_agent import AnomalyAgent
from agents.reporter_agent import ReporterAgent
from tools.alert_tool import AlertTool, AlertSeverity
from config import Config
import time

print("üöÄ END-TO-END INTEGRATION TEST\n")
print("="*60)
print("DataSense AI - Full Multi-Agent Analysis Pipeline")
print("="*60)

# Initialize all services
logger = AgentLogger(Config.LOGS_PATH)
session = SessionService()
memory = MemoryBank()
alert_tool = AlertTool(logger)

# ===== PHASE 1: DATA INGESTION =====
print("\n" + "="*60)
print("PHASE 1: DATA INGESTION")
print("="*60)

start_phase1 = time.time()

ingest_agent = IngestAgent(session, logger)
profile = ingest_agent.ingest_csv(f'{Config.BASE_PATH}/sample_data/sales.csv')
df = ingest_agent.get_dataframe()

phase1_time = time.time() - start_phase1

alert_tool.send_alert(
    "‚úÖ Dataset Loaded",
    f"{df.shape[0]:,} rows √ó {df.shape[1]} columns loaded successfully",
    AlertSeverity.INFO,
    {'quality': profile['quality_metrics']['overall_quality']}
)

# ===== PHASE 2: DATA ANALYSIS =====
print("\n" + "="*60)
print("PHASE 2: DATA ANALYSIS (EDA)")
print("="*60)

start_phase2 = time.time()

analysis_agent = AnalysisAgent(session, logger)
analysis_results = analysis_agent.analyze(df)

phase2_time = time.time() - start_phase2

alert_tool.send_alert(
    "‚úÖ Analysis Complete",
    f"Generated {len(analysis_results['visualizations'])} plots, {len(analysis_results['insights'])} insights",
    AlertSeverity.INFO
)

# ===== PHASE 3: ANOMALY DETECTION (PARALLEL) =====
print("\n" + "="*60)
print("PHASE 3: ANOMALY DETECTION (PARALLEL EXECUTION)")
print("="*60)

start_phase3 = time.time()

anomaly_agent = AnomalyAgent(session, logger)
anomaly_results = anomaly_agent.detect_anomalies(df)

phase3_time = time.time() - start_phase3

# Alert for anomalies
high_severity = [a for a in anomaly_results['anomalies'] if a.get('severity') in ['critical', 'high']]
if high_severity:
    alert_tool.send_alert(
        "‚ö†Ô∏è  High-Severity Anomalies",
        f"Detected {len(high_severity)} critical/high anomalies",
        AlertSeverity.WARNING
    )

# ===== PHASE 4: REPORT GENERATION =====
print("\n" + "="*60)
print("PHASE 4: REPORT GENERATION")
print("="*60)

start_phase4 = time.time()

reporter_agent = ReporterAgent(session, logger)
report_result = reporter_agent.generate_report()

phase4_time = time.time() - start_phase4

# Store in memory
analysis_summary = {
    'shape': df.shape,
    'columns': df.columns.tolist(),
    'anomalies_count': len(anomaly_results['anomalies']),
    'quality_score': profile['quality_metrics']['completeness_score'],
    'key_findings': [
        f"Detected {len(anomaly_results['anomalies'])} anomalies",
        f"Quality: {profile['quality_metrics']['overall_quality']}",
        f"Generated {len(analysis_results['visualizations'])} visualizations"
    ]
}

memory_id = memory.store_analysis('sales.csv', analysis_summary)

alert_tool.send_alert(
    "‚úÖ Report Generated",
    f"Created {report_result['pages']}-page PDF report",
    AlertSeverity.INFO,
    {'filename': report_result['report_filename']}
)

# ===== FINAL SUMMARY =====
print("\n" + "="*60)
print("FINAL SUMMARY - ALL PHASES COMPLETE")
print("="*60)

total_time = phase1_time + phase2_time + phase3_time + phase4_time

print(f"\n‚è±Ô∏è  EXECUTION TIMES:")
print(f"   Phase 1 (Ingest):        {phase1_time:.2f}s")
print(f"   Phase 2 (Analysis):      {phase2_time:.2f}s")
print(f"   Phase 3 (Anomalies):     {phase3_time:.2f}s")
print(f"   Phase 4 (Report):        {phase4_time:.2f}s")
print(f"   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print(f"   TOTAL:                   {total_time:.2f}s")

# Session summary
session_summary = session.get_session_summary()

print(f"\nüìä ANALYSIS SUMMARY:")
print(f"   Session ID: {session_summary['session_id']}")
print(f"   Dataset: {session_summary['dataset_name']} ({session_summary['dataset_shape']})")

print(f"\nüìà FINDINGS:")
print(f"   Total Insights: {session_summary['total_insights']}")
print(f"   Total Anomalies: {session_summary['total_anomalies']}")
print(f"   Total Plots: {session_summary['total_plots']}")

print(f"\n‚ö†Ô∏è  SEVERITY BREAKDOWN:")
for severity, count in session_summary['anomalies_by_severity'].items():
    if count > 0:
        print(f"   {severity.upper()}: {count}")

print(f"\nü§ñ AGENTS EXECUTED:")
for agent in session_summary['agents_executed']:
    print(f"   ‚úì {agent}")

print(f"\nüìÑ REPORT DETAILS:")
print(f"   Filename: {report_result['report_filename']}")
print(f"   Pages: {report_result['pages']}")
print(f"   Path: {report_result['report_path']}")

print(f"\nüíæ MEMORY BANK:")
print(f"   Analysis ID: {memory_id}")
print(f"   Total Stored: {len(memory.get_all_memories())}")

# Metrics
metrics = logger.get_metrics_summary()

print(f"\nüìä PERFORMANCE METRICS:")
print(f"   Total Agent Calls: {metrics['total_agent_calls']}")
print(f"   Total Tool Calls: {metrics['total_tool_calls']}")
print(f"   Total Errors: {metrics['total_errors']}")

print(f"\nüîî ALERTS SENT: {len(alert_tool.get_all_alerts())}")

print(f"\n{'='*60}")
print("‚úÖ COMPLETE END-TO-END PIPELINE SUCCESSFUL!")
print("‚úÖ All 4 agents working together perfectly!")
print("‚úÖ Professional PDF report generated!")
print(f"{'='*60}\n")


In [None]:
# Cell: Final Test Run
import subprocess
import sys

sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')

test_files = [
    'test_ingest_agent.py',
    'test_analysis_agent.py',
    'test_anomaly_agent.py',
    'test_end_to_end.py'
]

print("="*60)
print("RUNNING ALL TESTS (FINAL)")
print("="*60)

passed = 0
failed = 0

for test_file in test_files:
    print(f"\nRunning: {test_file}")

    result = subprocess.run(
        [sys.executable, f'/content/drive/MyDrive/DataSense_AI_Agent/tests/{test_file}'],
        capture_output=True,
        text=True,
        timeout=120
    )

    if result.returncode == 0:
        passed += 1
        print(f"‚úÖ PASSED")
    else:
        failed += 1
        print(f"‚ùå FAILED")

print("\n" + "="*60)
print(f"RESULTS: {passed} PASSED, {failed} FAILED out of {len(test_files)}")
print("="*60)

if failed == 0:
    print("\nüéâ ALL TESTS PASSED! Ready for Day 6!")


In [None]:
# Cell 24: Evaluation Framework
import sys
sys.path.append('/content/drive/MyDrive/DataSense_AI_Agent')

import pandas as pd
from datetime import datetime

class EvaluationFramework:
    """
    Framework for collecting and analyzing human feedback
    """

    def __init__(self):
        self.evaluations = []

    def add_evaluation(self,
                      dataset_name: str,
                      dataset_clarity: int,  # 1-5
                      insight_usefulness: int,  # 1-5
                      anomaly_relevance: int,  # 1-5
                      report_clarity: int,  # 1-5
                      overall_satisfaction: int,  # 1-5
                      comments: str = ""):
        """
        Record one person's evaluation

        Args:
            dataset_name: Name of dataset tested
            dataset_clarity: 1-5 rating
            insight_usefulness: 1-5 rating
            anomaly_relevance: 1-5 rating
            report_clarity: 1-5 rating
            overall_satisfaction: 1-5 rating
            comments: Optional feedback
        """

        evaluation = {
            'timestamp': datetime.now().isoformat(),
            'dataset': dataset_name,
            'dataset_clarity': dataset_clarity,
            'insight_usefulness': insight_usefulness,
            'anomaly_relevance': anomaly_relevance,
            'report_clarity': report_clarity,
            'overall_satisfaction': overall_satisfaction,
            'comments': comments
        }

        self.evaluations.append(evaluation)
        print(f"‚úÖ Evaluation recorded for {dataset_name}")

    def get_summary(self) -> dict:
        """
        Calculate statistics from all evaluations
        """
        if not self.evaluations:
            return {'error': 'No evaluations recorded'}

        df = pd.DataFrame(self.evaluations)

        # Calculate averages
        metrics = [
            'dataset_clarity',
            'insight_usefulness',
            'anomaly_relevance',
            'report_clarity',
            'overall_satisfaction'
        ]

        summary = {
            'total_evaluations': len(self.evaluations),
            'average_scores': {},
            'min_scores': {},
            'max_scores': {}
        }

        for metric in metrics:
            scores = df[metric].values
            summary['average_scores'][metric] = round(sum(scores) / len(scores), 2)
            summary['min_scores'][metric] = min(scores)
            summary['max_scores'][metric] = max(scores)

        return summary

    def print_report(self):
        """Print evaluation report"""
        summary = self.get_summary()

        if 'error' in summary:
            print(summary['error'])
            return

        print("\n" + "="*60)
        print("HUMAN EVALUATION REPORT")
        print("="*60)

        print(f"\nüìä Total Evaluations: {summary['total_evaluations']}")

        print(f"\nüìà Average Scores (out of 5):")

        for metric, avg_score in summary['average_scores'].items():
            # Visual bar
            filled = int(avg_score)
            bar = "‚ñà" * filled + "‚ñë" * (5 - filled)

            metric_label = metric.replace('_', ' ').title()
            print(f"   {metric_label:.<30} {bar} {avg_score:.1f}/5")

        # Overall average
        overall_avg = summary['average_scores']['overall_satisfaction']
        print(f"\n   {'Overall Satisfaction':.<30} {overall_avg:.1f}/5")

        print(f"\n{'='*60}")


# Example usage: Add sample evaluations
print("Creating Evaluation Framework...\n")

evaluator = EvaluationFramework()

# Simulated evaluations
sample_evaluations = [
    ('sales.csv', 5, 4, 4, 5, 4, 'Great tool, very clear insights'),
    ('customer_data.csv', 4, 4, 3, 4, 4, 'Good but more granular anomalies would help'),
    ('transactions.csv', 5, 5, 4, 5, 5, 'Excellent! Would definitely use again'),
]

print("Recording sample evaluations...\n")
for dataset, clarity, usefulness, relevance, report, overall, comment in sample_evaluations:
    evaluator.add_evaluation(
        dataset_name=dataset,
        dataset_clarity=clarity,
        insight_usefulness=usefulness,
        anomaly_relevance=relevance,
        report_clarity=report,
        overall_satisfaction=overall,
        comments=comment
    )

# Print report
evaluator.print_report()

# Export to CSV
evaluations_df = pd.DataFrame(evaluator.evaluations)
evaluations_df.to_csv(
    '/content/drive/MyDrive/DataSense_AI_Agent/outputs/evaluations.csv',
    index=False
)
print("\n‚úÖ Evaluations saved to: outputs/evaluations.csv")
