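# Exploring the pre-/post-election survey data

[DSLC stages]: EDA


This notebook conducts an exploratory data analysis (EDA) of the election survey dataset, comparing the paired `PRE_` and `POST_` measurements of each variable in the training set.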

In [5]:
import pandas as pd
import numpy as np

# Read the train / validation / test CSV files, in that order.
train_data, val_data, test_data = map(
    pd.read_csv,
    ('train_data.csv', 'val_data.csv', 'test_data.csv'),
)

# Read the four additional CSV tables stored next to the splits.
data_States, data_register, data_income, data_noVote = map(
    pd.read_csv,
    ('data_States.csv', 'data_register.csv', 'data_income.csv', 'data_noVote.csv'),
)

In [7]:
from scipy.stats import ttest_rel, wilcoxon

# Column-name prefixes that mark the two measurements of the same variable.
PRE_PREFIX = "PRE_"
POST_PREFIX = "POST_"
# Threshold applied to the raw p-values when listing "significant" changes.
# NOTE: the p-values are NOT adjusted for the number of variables tested.
SIGNIFICANCE_LEVEL = 0.05

# Separate PRE and POST columns
pre_columns = [col for col in train_data.columns if col.startswith(PRE_PREFIX)]
post_columns = [col for col in train_data.columns if col.startswith(POST_PREFIX)]

# Match each PRE column to the POST column carrying the same variable name.
# Only the leading prefix is swapped: str.replace() would also rewrite any
# later "PRE_" that happens to occur inside the variable name itself.
post_column_set = set(post_columns)
pre_post_pairs = {}
for pre_name in pre_columns:
    post_name = POST_PREFIX + pre_name[len(PRE_PREFIX):]
    if post_name in post_column_set:
        pre_post_pairs[pre_name] = post_name

# One record per compared variable; turned into a DataFrame after the loop.
result_columns = ["Variable", "PRE_Mean", "POST_Mean", "Test_Type", "T_Statistic", "P_Value"]
results = []

# Compare each PRE-POST pair
for pre_col, post_col in pre_post_pairs.items():
    # Keep only rows observed at both time points: a paired test needs the
    # two samples aligned row by row, so incomplete pairs are dropped once.
    paired_data = pd.DataFrame(
        {"pre": train_data[pre_col], "post": train_data[post_col]}
    ).dropna()
    # Cast to float so boolean/integer indicator columns can be differenced.
    pre_values = paired_data["pre"].astype(float)
    post_values = paired_data["post"].astype(float)

    if len(paired_data) < 2:
        # A paired test is undefined with fewer than two complete pairs;
        # keep the variable in the table but leave its statistics missing.
        t_stat, p_value = float("nan"), float("nan")
        test_type = "not tested"
    else:
        # Paired t-test. NOTE: no normality check is performed here; the
        # Wilcoxon branch runs only if ttest_rel raises ValueError (i.e. it
        # rejects the input), as a best-effort fallback.
        try:
            t_stat, p_value = ttest_rel(pre_values, post_values)
            test_type = "t-test"
        except ValueError:
            t_stat, p_value = wilcoxon(pre_values, post_values)
            test_type = "Wilcoxon"

    # Store results. "T_Statistic" holds the t statistic for "t-test" rows and
    # the Wilcoxon statistic for "Wilcoxon" rows; the means are computed on
    # the complete pairs only.
    results.append({
        "Variable": pre_col[len(PRE_PREFIX):],
        "PRE_Mean": pre_values.mean(),
        "POST_Mean": post_values.mean(),
        "Test_Type": test_type,
        "T_Statistic": t_stat,
        "P_Value": p_value
    })

# Convert results to a DataFrame. Passing the columns explicitly keeps the
# schema (and the filter below) valid even when no PRE/POST pair was found.
comparison_results = pd.DataFrame(results, columns=result_columns)

# Display significant changes (unadjusted p-value below SIGNIFICANCE_LEVEL);
# rows with a missing p-value never pass this filter.
significant_changes = comparison_results[comparison_results["P_Value"] < SIGNIFICANCE_LEVEL]

print("Significant Changes from Pre to Post:")
print(significant_changes)

Significant Changes from Pre to Post:
                      Variable   PRE_Mean  POST_Mean Test_Type  T_Statistic  \
0                  ThermoBiden  49.034334  51.146308    t-test    -7.706926   
1                  ThermoTrump  40.677191  39.789337    t-test     3.522874   
2                 ThermoHarris  47.809179  49.418219    t-test    -5.677093   
3                  ThermoPence  44.386301  47.277778    t-test   -10.487091   
4                 VoteAccuracy   3.074189   2.292616    t-test    31.804574   
5   VotePresident_Donald Trump   0.449620   0.493789    t-test    -9.535147   
6   VotePresident_Jo Jorgensen   0.030711   0.010870    t-test     8.699439   
7  VotePresident_Howie Hawkins   0.007246   0.002588    t-test     4.030211   

         P_Value  
0   1.506775e-14  
1   4.301856e-04  
2   1.436510e-08  
3   1.673482e-25  
4  9.521637e-205  
5   2.146756e-21  
6   4.288633e-18  
7   5.644072e-05  
