In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Diabetes Data Exploration\n",
    "\n",
    "This notebook explores the diabetes dataset for insulin dose prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Set style\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "sns.set_palette(\"husl\")\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load data\n",
    "from utils.data_generator import DiabetesDataGenerator\n",
    "\n",
    "# Seed before generating so that Restart & Run All rebuilds the same dataset.\n",
    "# NOTE(review): assumes DiabetesDataGenerator draws from NumPy's global RNG;\n",
    "# if it keeps its own generator, pass the seed to it instead - TODO confirm.\n",
    "SEED = 42\n",
    "N_PATIENTS = 1000\n",
    "np.random.seed(SEED)\n",
    "\n",
    "generator = DiabetesDataGenerator()\n",
    "df = generator.generate_patient_data(N_PATIENTS)\n",
    "\n",
    "print(\"Dataset Shape:\", df.shape)\n",
    "print(\"\\nFirst 5 rows:\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Basic information: structural overview first, then summary statistics\n",
    "print(\"Dataset Information:\", \"=\" * 50, sep=\"\\n\")\n",
    "df.info()\n",
    "\n",
    "# The describe() frame is the cell's last expression, so it is rendered richly.\n",
    "print(\"\\n\\nDescriptive Statistics:\", \"=\" * 50, sep=\"\\n\")\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for missing values: report only the columns that contain any\n",
    "print(\"Missing Values:\", \"=\" * 50, sep=\"\\n\")\n",
    "missing = df.isna().sum()\n",
    "print(missing.loc[missing.gt(0)])\n",
    "\n",
    "# Column dtypes, for a quick sanity check of how each field was stored\n",
    "print(\"\\n\\nData Types:\", \"=\" * 50, sep=\"\\n\")\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distribution of key variables (2x3 grid: five histograms and one pie chart)\n",
    "fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n",
    "\n",
    "# One entry per histogram panel:\n",
    "#   (grid position, column, title, x-axis label, bar color, reference lines)\n",
    "# A bar color of None keeps matplotlib's default color; each reference line\n",
    "# is (x position, line color, legend label).\n",
    "hist_panels = [\n",
    "    ((0, 0), 'glucose', 'Blood Glucose Distribution', 'Glucose (mg/dL)', None,\n",
    "     [(140, 'red', 'Target (140)'), (180, 'orange', 'High (180)')]),\n",
    "    ((0, 1), 'hba1c', 'HbA1c Distribution', 'HbA1c (%)', 'green',\n",
    "     [(6.5, 'red', 'Target (6.5%)'), (7.0, 'orange', 'High (7.0%)')]),\n",
    "    ((0, 2), 'recommended_dose', 'Recommended Insulin Dose Distribution',\n",
    "     'Dose (units)', 'purple', []),\n",
    "    ((1, 1), 'age', 'Age Distribution', 'Age (years)', 'brown', []),\n",
    "    ((1, 2), 'weight', 'Weight Distribution', 'Weight (kg)', 'teal', []),\n",
    "]\n",
    "\n",
    "for position, column, title, xlabel, bar_color, ref_lines in hist_panels:\n",
    "    ax = axes[position]\n",
    "    ax.hist(df[column], bins=30, edgecolor='black', alpha=0.7, color=bar_color)\n",
    "    for x_value, line_color, line_label in ref_lines:\n",
    "        ax.axvline(x=x_value, color=line_color, linestyle='--', label=line_label)\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Frequency')\n",
    "    if ref_lines:\n",
    "        # Only panels with reference lines have labelled artists to list.\n",
    "        ax.legend()\n",
    "\n",
    "# Diabetes type distribution.\n",
    "# value_counts() orders its rows by frequency, so the wedge labels are built\n",
    "# from its index; a fixed label list would mislabel the wedges whenever the\n",
    "# frequency order differs from the order of that list.\n",
    "type_names = {1: 'Type 1', 2: 'Type 2'}\n",
    "type_counts = df['diabetes_type'].value_counts()\n",
    "type_labels = [type_names.get(code, f'Type {code}') for code in type_counts.index]\n",
    "axes[1, 0].pie(type_counts.values, labels=type_labels, autopct='%1.1f%%')\n",
    "axes[1, 0].set_title('Diabetes Type Distribution')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation heatmap over the numeric columns\n",
    "numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
    "corr_matrix = df.loc[:, numeric_cols].corr()\n",
    "\n",
    "# Heatmap appearance gathered in one place so it is easy to tweak.\n",
    "heatmap_style = {\n",
    "    'annot': True, 'fmt': '.2f', 'cmap': 'coolwarm', 'center': 0,\n",
    "    'square': True, 'linewidths': 1, 'cbar_kws': {'shrink': 0.8},\n",
    "}\n",
    "plt.figure(figsize=(12, 8))\n",
    "sns.heatmap(corr_matrix, **heatmap_style)\n",
    "plt.title('Correlation Matrix of Numerical Features')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Correlations with recommended_dose, sorted from highest to lowest\n",
    "print(\"Top correlations with Recommended Dose:\", \"=\" * 50, sep=\"\\n\")\n",
    "dose_corr = corr_matrix.loc[:, 'recommended_dose'].sort_values(ascending=False)\n",
    "print(dose_corr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plots: each key feature against the recommended dose\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "\n",
    "# (column, x-axis label, title, extra scatter keyword arguments), listed in\n",
    "# row-major panel order; an empty dict leaves the marker color at its default.\n",
    "scatter_panels = [\n",
    "    ('glucose', 'Glucose (mg/dL)', 'Glucose vs Insulin Dose', {}),\n",
    "    ('weight', 'Weight (kg)', 'Weight vs Insulin Dose', {'color': 'green'}),\n",
    "    ('hba1c', 'HbA1c (%)', 'HbA1c vs Insulin Dose', {'color': 'red'}),\n",
    "    ('carbs', 'Carbs (grams)', 'Carbs vs Insulin Dose', {'color': 'purple'}),\n",
    "]\n",
    "\n",
    "for ax, (column, xlabel, title, extra) in zip(axes.flat, scatter_panels):\n",
    "    ax.scatter(df[column], df['recommended_dose'], alpha=0.6, **extra)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Recommended Dose (units)')\n",
    "    ax.set_title(title)\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive plot with Plotly: glucose and HbA1c against dose in 3D\n",
    "scatter_options = {\n",
    "    'x': 'glucose', 'y': 'hba1c', 'z': 'recommended_dose',\n",
    "    'color': 'diabetes_type', 'size': 'weight',\n",
    "    'hover_data': ['age', 'carbs', 'activity'],\n",
    "    'title': '3D Relationship: Glucose, HbA1c, and Insulin Dose',\n",
    "    'labels': {'diabetes_type': 'Diabetes Type'},\n",
    "}\n",
    "fig = px.scatter_3d(df, **scatter_options)\n",
    "\n",
    "# Axis titles with units replace the raw column names on the 3D scene.\n",
    "axis_titles = {\n",
    "    'xaxis_title': 'Glucose (mg/dL)',\n",
    "    'yaxis_title': 'HbA1c (%)',\n",
    "    'zaxis_title': 'Insulin Dose (units)',\n",
    "}\n",
    "fig.update_layout(scene=axis_titles)\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Box plots for categorical comparisons\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "\n",
    "# Every plot-only helper column is added to a copy, never to `df` itself:\n",
    "# `df` is written to CSV later in this notebook and should keep only the\n",
    "# generated columns, not these display groupings.\n",
    "df_type = df.copy()\n",
    "df_type['diabetes_type_str'] = df_type['diabetes_type'].map({1: 'Type 1', 2: 'Type 2'})\n",
    "# pd.cut builds right-closed intervals by default: (0, 18], (18, 40], ...\n",
    "df_type['age_group'] = pd.cut(df_type['age'], bins=[0, 18, 40, 60, 100],\n",
    "                              labels=['<18', '18-40', '40-60', '>60'])\n",
    "df_type['glucose_category'] = pd.cut(df_type['glucose'], bins=[0, 100, 140, 180, 300, 500],\n",
    "                                     labels=['Low', 'Normal', 'Elevated', 'High', 'Very High'])\n",
    "\n",
    "# Diabetes type vs Dose\n",
    "sns.boxplot(x='diabetes_type_str', y='recommended_dose', data=df_type, ax=axes[0,0])\n",
    "axes[0,0].set_title('Insulin Dose by Diabetes Type')\n",
    "axes[0,0].set_xlabel('Diabetes Type')\n",
    "axes[0,0].set_ylabel('Recommended Dose (units)')\n",
    "\n",
    "# Activity level vs Dose (the explicit order fixes the left-to-right arrangement)\n",
    "activity_order = ['sedentary', 'light', 'moderate', 'active', 'very_active']\n",
    "sns.boxplot(x='activity', y='recommended_dose', data=df_type, ax=axes[0,1],\n",
    "            order=activity_order)\n",
    "axes[0,1].set_title('Insulin Dose by Activity Level')\n",
    "axes[0,1].set_xlabel('Activity Level')\n",
    "axes[0,1].set_ylabel('Recommended Dose (units)')\n",
    "axes[0,1].tick_params(axis='x', rotation=45)\n",
    "\n",
    "# Age groups vs Dose\n",
    "sns.boxplot(x='age_group', y='recommended_dose', data=df_type, ax=axes[1,0])\n",
    "axes[1,0].set_title('Insulin Dose by Age Group')\n",
    "axes[1,0].set_xlabel('Age Group')\n",
    "axes[1,0].set_ylabel('Recommended Dose (units)')\n",
    "\n",
    "# Glucose categories vs Dose\n",
    "sns.boxplot(x='glucose_category', y='recommended_dose', data=df_type, ax=axes[1,1])\n",
    "axes[1,1].set_title('Insulin Dose by Glucose Category')\n",
    "axes[1,1].set_xlabel('Glucose Category')\n",
    "axes[1,1].set_ylabel('Recommended Dose (units)')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairplot for selected features (KDE on the diagonal)\n",
    "selected_features = ['glucose', 'hba1c', 'weight', 'age', 'carbs', 'recommended_dose']\n",
    "pairplot_options = {'diag_kind': 'kde', 'plot_kws': {'alpha': 0.6}}\n",
    "sns.pairplot(df.loc[:, selected_features], **pairplot_options)\n",
    "# y > 1 places the title above the top row of panels.\n",
    "plt.suptitle('Pairplot of Selected Features', y=1.02)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the dataset\n",
    "df.to_csv('../data/dummy_data.csv', index=False)\n",
    "print(\"✅ Dataset saved to '../data/dummy_data.csv'\")\n",
    "\n",
    "# Summary\n",
    "# Count each diabetes type once and reuse the counts for the percentages.\n",
    "total_patients = len(df)\n",
    "type1_count = int((df['diabetes_type'] == 1).sum())\n",
    "type2_count = int((df['diabetes_type'] == 2).sum())\n",
    "\n",
    "print(\"\\n📊 Dataset Summary:\")\n",
    "print(\"=\"*50)\n",
    "print(f\"Total Patients: {total_patients}\")\n",
    "print(f\"Type 1 Diabetes: {type1_count} ({type1_count / total_patients * 100:.1f}%)\")\n",
    "print(f\"Type 2 Diabetes: {type2_count} ({type2_count / total_patients * 100:.1f}%)\")\n",
    "print(f\"Average Glucose: {df['glucose'].mean():.1f} mg/dL\")\n",
    "print(f\"Average HbA1c: {df['hba1c'].mean():.1f}%\")\n",
    "print(f\"Average Insulin Dose: {df['recommended_dose'].mean():.1f} units\")\n",
    "print(f\"Dose Range: {df['recommended_dose'].min():.1f} - {df['recommended_dose'].max():.1f} units\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Key Insights from Data Exploration\n",
    "\n",
    "1. **Data Quality**: No missing values, all data types appropriate\n",
    "2. **Distributions**:\n",
    "   - Glucose: Mostly between 100-250 mg/dL\n",
    "   - HbA1c: Mostly between 6-9%\n",
    "   - Insulin Dose: Mostly between 5-25 units\n",
    "3. **Correlations**:\n",
    "   - Strongest predictors of insulin dose: Weight, Glucose, Carbs\n",
    "   - HbA1c shows moderate correlation\n",
    "4. **Diabetes Type Differences**:\n",
    "   - Type 1 patients generally require higher insulin doses\n",
    "   - Type 2 shows more variability\n",
    "5. **Patterns**:\n",
    "   - Higher glucose → Higher insulin dose\n",
    "   - Higher weight → Higher insulin dose\n",
    "   - More carbs → Higher insulin dose\n",
    "\n",
    "These insights will guide feature selection and model building."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}