# Data Overview - Clinical Deterioration Prediction

This notebook provides an initial overview and comparison of the two datasets:
- **3-year dataset** (CNUH_3Y.csv): ~317k rows
- **10-year dataset** (10yrs_proc.csv): ~38k rows

## Objectives
1. Load and validate both datasets
2. Compare dataset structures
3. Initial data quality assessment
4. Feature mapping between datasets

In [None]:
import sys
import os
sys.path.insert(0, os.path.join(os.getcwd(), '..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import (
    load_3year_dataset, 
    load_10year_dataset, 
    validate_dataset, 
    get_feature_groups
)

# Pandas display configuration: show every column, cap printed rows,
# let pandas auto-detect the output width, and truncate long cell text.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.width': None,
    'display.max_colwidth': 50,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Record the library versions this run used.
print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 1. Load Datasets

In [None]:
# 3-year dataset: read the CSV through the project loader, then report its size.
print("Loading 3-year dataset...")
df_3year = load_3year_dataset("../data/CNUH_3Y.csv")
rows_3year = df_3year.shape[0]
ncols_3year = df_3year.shape[1]
print(f"✓ Loaded: {rows_3year:,} rows × {ncols_3year} columns")

# 10-year dataset: same procedure with its own loader.
print("\nLoading 10-year dataset...")
df_10year = load_10year_dataset("../data/10yrs_proc.csv")
rows_10year = df_10year.shape[0]
ncols_10year = df_10year.shape[1]
print(f"✓ Loaded: {rows_10year:,} rows × {ncols_10year} columns")

## 2. Dataset Structure Comparison

In [None]:
# Column-name sets for each dataset and the three overlap regions,
# computed once and reused by the summary counts and the listings below.
cols_3year = set(df_3year.columns)
cols_10year = set(df_10year.columns)
shared_cols = cols_3year & cols_10year
only_in_3year = cols_3year - cols_10year
only_in_10year = cols_10year - cols_3year

heavy_rule = "=" * 60
light_rule = "-" * 60

# Summary counts
print(heavy_rule)
print("COLUMN COMPARISON")
print(heavy_rule)
print(f"\n3-year dataset columns: {len(cols_3year)}")
print(f"10-year dataset columns: {len(cols_10year)}")
print(f"\nCommon columns: {len(shared_cols)}")
print(f"Unique to 3-year: {len(only_in_3year)}")
print(f"Unique to 10-year: {len(only_in_10year)}")

# Full alphabetical listing of the columns found in only one dataset
for caption, names in (
    ("Columns unique to 3-year dataset:", only_in_3year),
    ("Columns unique to 10-year dataset:", only_in_10year),
):
    print("\n" + light_rule)
    print(caption)
    print(light_rule)
    for col in sorted(names):
        print(f"  - {col}")

# Shared columns: list the first ten, then summarise the remainder
print("\n" + light_rule)
print("Common columns (first 10):")
print(light_rule)
common_cols = sorted(shared_cols)
for col in common_cols[:10]:
    print(f"  - {col}")
extra_common = len(common_cols) - 10
if extra_common > 0:
    print(f"  ... and {extra_common} more")

## 3. Dataset Validation

In [None]:
# Validate the 3-year dataset and print a short data-quality report.
# validate_dataset() is a project helper; this cell relies only on the keys
# it reads below ('shape', 'memory_usage_mb', 'duplicate_rows',
# 'missing_percentage' and, optionally, 'target_distribution').
print("=" * 60)
print("3-YEAR DATASET VALIDATION")
print("=" * 60)
val_3year = validate_dataset(df_3year, "3-year dataset")
print(f"Shape: {val_3year['shape']}")
print(f"Memory usage: {val_3year['memory_usage_mb']:.2f} MB")
print(f"Duplicate rows: {val_3year['duplicate_rows']:,}")

# The target summary is only reported when the validation result contains it.
if 'target_distribution' in val_3year:
    print("\nTarget distribution:")
    total_rows_3year = val_3year['shape'][0]
    for k, v in val_3year['target_distribution'].items():
        pct = v / total_rows_3year * 100
        print(f"  Class {k}: {v:,} ({pct:.2f}%)")

print("\nMissing values (top 10):")
missing_3year = pd.Series(val_3year['missing_percentage']).sort_values(ascending=False)
# Take the ten largest percentages and keep only those that are actually > 0.
top_missing_3year = missing_3year.head(10)
top_missing_3year = top_missing_3year[top_missing_3year > 0]
if top_missing_3year.empty:
    # State the result explicitly: a bare header is indistinguishable from
    # a report that was cut short.
    print("  (no columns with missing values)")
for col, pct in top_missing_3year.items():
    print(f"  {col}: {pct:.2f}%")

In [None]:
# Validate the 10-year dataset and print a short data-quality report.
# validate_dataset() is a project helper; this cell relies only on the keys
# it reads below ('shape', 'memory_usage_mb', 'duplicate_rows',
# 'missing_percentage' and, optionally, 'target_distribution').
print("=" * 60)
print("10-YEAR DATASET VALIDATION")
print("=" * 60)
val_10year = validate_dataset(df_10year, "10-year dataset")
print(f"Shape: {val_10year['shape']}")
print(f"Memory usage: {val_10year['memory_usage_mb']:.2f} MB")
print(f"Duplicate rows: {val_10year['duplicate_rows']:,}")

# The target summary is only reported when the validation result contains it.
if 'target_distribution' in val_10year:
    print("\nTarget distribution:")
    total_rows_10year = val_10year['shape'][0]
    for k, v in val_10year['target_distribution'].items():
        pct = v / total_rows_10year * 100
        print(f"  Class {k}: {v:,} ({pct:.2f}%)")

print("\nMissing values (top 10):")
missing_10year = pd.Series(val_10year['missing_percentage']).sort_values(ascending=False)
# Take the ten largest percentages and keep only those that are actually > 0.
top_missing_10year = missing_10year.head(10)
top_missing_10year = top_missing_10year[top_missing_10year > 0]
if top_missing_10year.empty:
    # State the result explicitly: a bare header is indistinguishable from
    # a report that was cut short.
    print("  (no columns with missing values)")
for col, pct in top_missing_10year.items():
    print(f"  {col}: {pct:.2f}%")

## 4. Feature Groups Analysis

In [None]:
# Group the columns of each dataset with the project helper. Both lookups
# run up front, before anything is printed.
features_3year = get_feature_groups(df_3year)
features_10year = get_feature_groups(df_10year)

group_reports = [
    ("FEATURE GROUPS - 3-YEAR DATASET", features_3year),
    ("FEATURE GROUPS - 10-YEAR DATASET", features_10year),
]
banner = "=" * 60

for position, (heading, feature_groups) in enumerate(group_reports):
    # Every report after the first is separated from the previous one by a blank line.
    print(banner if position == 0 else "\n" + banner)
    print(heading)
    print(banner)
    for group, features in feature_groups.items():
        if not features:
            continue  # empty groups are omitted from the report
        print(f"\n{group.upper().replace('_', ' ')} ({len(features)} features):")
        for feat in features:
            print(f"  - {feat}")

## 5. Sample Data Preview

In [None]:
# Preview of the 3-year data: leading rows first, then describe() statistics.
divider_3year = "=" * 60

print("3-YEAR DATASET - First 5 rows:")
print(divider_3year)
head_3year = df_3year.head()  # head() returns the first 5 rows by default
display(head_3year)

print("\n3-YEAR DATASET - Basic Statistics:")
print(divider_3year)
stats_3year = df_3year.describe()
display(stats_3year)

In [None]:
# Preview of the 10-year data: leading rows first, then describe() statistics.
divider_10year = "=" * 60

print("10-YEAR DATASET - First 5 rows:")
print(divider_10year)
head_10year = df_10year.head()  # head() returns the first 5 rows by default
display(head_10year)

print("\n10-YEAR DATASET - Basic Statistics:")
print(divider_10year)
stats_10year = df_10year.describe()
display(stats_10year)

## 6. Summary and Key Findings

### Dataset Comparison Summary:
1. **Size**: The 3-year dataset is ~8x larger than the 10-year dataset (~317k vs ~38k rows)
2. **Features**: Both datasets contain similar clinical features (vital signs, lab values)
3. **Target**: Both have a binary target variable (0 = normal, 1 = deterioration)
4. **Temporal**: Both include time-based features for tracking patient measurements

### Next Steps:
- Detailed EDA for each dataset
- Target variable analysis
- Feature importance analysis
- Problem definition