# SDOH Data Distribution Analysis

This notebook explores value distributions, missingness, and key relationships between columns in the dashboard's member-view dataset (`../data/model/member_view.csv`).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet to the figures drawn in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Read the member-level view that backs the dashboard, then preview its first rows
member_view_path = '../data/model/member_view.csv'
df = pd.read_csv(member_view_path)
df.head()

In [None]:
# Basic overview: print the frame's summary (columns, dtypes, non-null counts)
df.info()

In [None]:
# Missingness summary: fraction of missing values per column, most incomplete first
na_fraction = df.isna().mean()
missing = na_fraction.sort_values(ascending=False)
missing.head(20)

In [None]:
# Numeric distributions: summary statistics with one row per numeric column
numeric_cols = df.select_dtypes(include='number').columns
numeric_summary = df.loc[:, numeric_cols].describe()
numeric_summary.transpose()

In [None]:
# Risk & lift distribution: one histogram (with KDE overlay) per column, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    ('risk_full', 'Risk with SDOH'),
    ('sdoh_lift', 'SDOH Lift'),
]
for ax, (column, title) in zip(axes, panels):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(title)
plt.tight_layout()

In [None]:
# Categorical distributions: ten most frequent values of each listed column
cat_cols = [
    'plan',
    'race',
    'gender',
    'sdoh_lift_level',
    'contract',
]
for col in cat_cols:
    # Columns absent from this extract are skipped silently.
    if col not in df.columns:
        continue
    top_values = df[col].value_counts().head(10)
    display(top_values)

In [None]:
# Risk tier consistency check
def tier(val, high=2.3, moderate=1.8):
    """Map a risk score to a tier label.

    Parameters
    ----------
    val : scalar
        Risk score to classify. Missing values (anything ``pd.isna`` flags,
        e.g. ``None`` or ``NaN``) are accepted.
    high : float, default 2.3
        Scores strictly greater than this are labelled ``'High'``.
    moderate : float, default 1.8
        Scores greater than or equal to this (and not ``'High'``) are
        labelled ``'Moderate'``.

    Returns
    -------
    str
        One of ``'Unknown'``, ``'High'``, ``'Moderate'`` or ``'Low'``.
    """
    if pd.isna(val):
        return 'Unknown'
    # The upper cut-off is exclusive and the lower one inclusive: a score equal
    # to `high` is 'Moderate', and a score equal to `moderate` is 'Moderate'.
    # NOTE(review): confirm the exclusive upper bound is intended.
    if val > high:
        return 'High'
    if val >= moderate:
        return 'Moderate'
    return 'Low'
# Label every row with a tier computed from its risk score, then preview score and label together
score_column = 'risk_score_x'
df['risk_tier_calc'] = df[score_column].apply(tier)
df[[score_column, 'risk_tier_calc']].head()

In [None]:
# Pairwise correlations (top)
# Absolute correlation between every pair of numeric columns.
corr = df[numeric_cols].corr().abs()
# Keep only the strict upper triangle, so each unordered pair appears exactly
# once and the trivial self-correlations on the diagonal (always 1.0) are left
# out. Masking by position rather than de-duplicating by value keeps distinct
# pairs that happen to share the same coefficient.
pair_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
# dropna() removes the masked-out cells, plus any pair whose correlation is
# undefined (NaN), before ranking.
corr.where(pair_mask).unstack().dropna().sort_values(ascending=False).head(20)

## Next steps
- Identify outliers and inconsistent ranges
- Adjust thresholds for risk and outreach priority
- Validate driver distributions and lift levels