In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Symptom Standardization with CTCAE\n",
    "\n",
    "This notebook demonstrates the symptom standardization pipeline, which uses retrieval-augmented generation (RAG) together with CTCAE (Common Terminology Criteria for Adverse Events) terminology."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import json\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Add parent directory to path\n",
    "sys.path.insert(0, os.path.abspath('..')) \n",
    "\n",
    "# Import our modules\n",
    "from src.extractor import SymptomExtractor\n",
    "from src.standardizer import SymptomStandardizer\n",
    "from src.utils import load_clinical_notes, load_ctcae_data, save_results_to_csv\n",
    "from src.vectorstore import setup_iris_vectorstore, add_symptoms_to_vectorstore"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Create Vector Store with CTCAE Terms\n",
    "\n",
    "First, let's create a vector store with CTCAE terms for retrieval."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Read the CTCAE mapping and report how many terms it contains\n",
    "ctcae_data = load_ctcae_data('../config/ctcae_mapping.json')\n",
    "ctcae_terms = ctcae_data.get('terms', [])\n",
    "print(f\"Loaded {len(ctcae_terms)} CTCAE terms\")\n",
    "\n",
    "# Set up the symptoms collection (reset_collection=True is for demo purposes)\n",
    "vectorstore = setup_iris_vectorstore(\n",
    "    collection_name=\"symptoms\",\n",
    "    reset_collection=True,\n",
    ")\n",
    "\n",
    "# Flatten the mapping into one document per (term, grade) pair;\n",
    "# term-level fields are repeated on every grade of that term\n",
    "symptoms = [\n",
    "    {\n",
    "        \"id\": term.get(\"id\", \"\"),\n",
    "        \"name\": term.get(\"name\", \"\"),\n",
    "        \"description\": grade.get(\"description\", \"\"),\n",
    "        \"grade\": grade.get(\"grade\", \"\"),\n",
    "        \"category\": term.get(\"category\", \"\"),\n",
    "    }\n",
    "    for term in ctcae_terms\n",
    "    for grade in term.get('grades', [])\n",
    "]\n",
    "\n",
    "# Hand the documents to the vector store and print the count it returns\n",
    "count = add_symptoms_to_vectorstore(vectorstore, symptoms)\n",
    "print(f\"Added {count} symptom documents to vector store\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Extract Symptoms from Clinical Notes\n",
    "\n",
    "Now let's extract symptoms from sample clinical notes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Load clinical notes\n",
    "documents = load_clinical_notes('../data/sample_notes')\n",
    "print(f\"Loaded {len(documents)} clinical notes\")\n",
    "\n",
    "# Preview the first note for reference; the ellipsis is appended only when\n",
    "# the text is actually longer than the preview window\n",
    "PREVIEW_CHARS = 500\n",
    "if documents:\n",
    "    first_note = documents[0]\n",
    "    note_text = first_note.page_content\n",
    "    print(f\"Example note ({first_note.metadata.get('filename')}):\\n\")\n",
    "    print(note_text[:PREVIEW_CHARS] + (\"...\" if len(note_text) > PREVIEW_CHARS else \"\"))\n",
    "\n",
    "# Run the extractor over every loaded note\n",
    "extractor = SymptomExtractor()\n",
    "extracted_symptoms = extractor.batch_extract(documents)\n",
    "print(f\"\\nExtracted {len(extracted_symptoms)} symptoms\")\n",
    "\n",
    "# Display first few symptoms\n",
    "extracted_symptoms[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Standardize Symptoms to CTCAE\n",
    "\n",
    "Next, let's standardize the extracted symptoms to CTCAE terminology."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Create standardizer\n",
    "standardizer = SymptomStandardizer(\n",
    "    ctcae_mapping_path=\"../config/ctcae_mapping.json\",\n",
    "    collection_name=\"symptoms\"\n",
    ")\n",
    "\n",
    "# Choose the text to standardize for each extracted symptom: prefer its\n",
    "# description, and fall back to the symptom name when the description is\n",
    "# missing, None, or empty (a plain dict.get default only covers a missing key)\n",
    "symptom_texts = [\n",
    "    s.get(\"description\") or s.get(\"symptom\") or \"\"\n",
    "    for s in extracted_symptoms\n",
    "]\n",
    "\n",
    "# Standardize only the first DEMO_LIMIT symptoms for the demo\n",
    "DEMO_LIMIT = 5\n",
    "standardized_symptoms = standardizer.batch_standardize(symptom_texts[:DEMO_LIMIT])\n",
    "\n",
    "# Display results as a table\n",
    "pd.DataFrame(standardized_symptoms)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Save Results\n",
    "\n",
    "Let's save the standardized results to JSON and CSV files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Output locations for the two export formats\n",
    "output_path = '../data/demo_results.json'\n",
    "csv_path = '../data/demo_results.csv'\n",
    "\n",
    "# JSON export\n",
    "with Path(output_path).open('w') as json_file:\n",
    "    json.dump(standardized_symptoms, json_file, indent=2)\n",
    "print(f\"Results saved to {output_path}\")\n",
    "\n",
    "# CSV export through the project helper\n",
    "save_results_to_csv(standardized_symptoms, csv_path)\n",
    "print(f\"Results saved to {csv_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}