# Exploratory Data Analysis (EDA)

*Notebook 01_EDA*

## 1. 导入库

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 可选：%matplotlib inline

## 2. 加载数据

In [None]:
# Read the dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/custom_covid19.csv')
# Print the first five rows as a quick sanity check on the load.
preview = df.head(n=5)
print(preview)

## 3. 数据概览

In [None]:
# Report the dataset dimensions.
print('Rows, Columns:', df.shape)

# Codes 97, 98 and 99 are treated as "missing" sentinels. They are replaced
# everywhere except AGE, because 97-99 are plausible real ages and a blanket
# replace over the whole frame would silently erase them.
# NOTE(review): assumes AGE is the only column in which 97-99 are genuine
# values - confirm against the data dictionary.
missing_vals = [97, 98, 99]
coded_cols = df.columns.difference(['AGE'])
df[coded_cols] = df[coded_cols].replace(missing_vals, np.nan)
# Count missing values per column.
print(df.isnull().sum())

## 4. 年龄分布

In [None]:
# Histogram of the AGE column (missing values dropped) using 30 bins.
fig, ax = plt.subplots(figsize=(8, 4))
ages = df['AGE'].dropna()
ax.hist(ages, bins=30)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

## 5. 分类变量分布

In [None]:
# Columns treated as categorical; one bar chart is drawn per column.
categorical_cols = [
    'SEX', 'INTUBED', 'PNEUMONIA', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
    'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
    'RENAL_CHRONIC', 'TOBACCO', 'ICU',
]
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    # Keep NaN as its own category so missing values show up in the chart.
    freq = df[col].value_counts(dropna=False)
    bar_labels = freq.index.astype(str)
    ax.bar(bar_labels, freq.values)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()

## 6. 目标变量（死亡）分布

In [None]:
# Binary target: 1 when DATE_DIED holds anything other than the
# '9999-99-99' placeholder, otherwise 0.
# NOTE(review): a missing (NaN) DATE_DIED also compares unequal to the
# placeholder and is therefore counted as a death - confirm this is intended.
df['DIED'] = (df['DATE_DIED'] != '9999-99-99').astype(int)
plt.figure(figsize=(6, 4))
# value_counts() orders by frequency and omits classes that never occur, so
# pin the order to [0, 1] (filling absent classes with 0) to guarantee each
# bar sits under the correct label.
counts = df['DIED'].value_counts().reindex([0, 1], fill_value=0)
plt.bar(['Survived', 'Died'], counts.values)
plt.title('Death Distribution')
plt.grid(True)
plt.show()

## 7. 特征相关性

In [None]:
import seaborn as sns  # used only for the heatmap below
# Recode the categorical columns on a copy: 1 stays 1, 2 becomes 0, and any
# other value (including missing) becomes NaN because map() has no entry
# for it.
num_df = df.copy()
for col in categorical_cols:
    num_df[col] = num_df[col].map({1: 1, 2: 0})

# Compute the correlation matrix over numeric columns only. DATE_DIED is
# compared against a string when DIED is derived, so it is non-numeric and
# would make a plain corr() raise on pandas >= 2.0.
corr = num_df.select_dtypes(include='number').corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(14, 12))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Feature Correlation Heatmap')
plt.show()

## 8. 小结与下一步

- 基于上述分析，考虑特征工程和缺失值处理策略
- 准备进入模型构建阶段 (O1~O3)