# Cleaning & Merging Demographics Datasets

In [None]:
# We will use pandas library
import pandas as pd

In [None]:
# Load the raw demographics .xpt file into a DataFrame and preview the first rows
raw_demo_path = '/content/drive/MyDrive/Healthcare Prediction Task/data/raw/demographics/DEMO_L.xpt'
df_demo = pd.read_sas(raw_demo_path)
df_demo.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHRGND,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,WTINT2YR,WTMEC2YR,SDMVSTRA,SDMVPSU,INDFMPIR
0,130378.0,12.0,2.0,1.0,43.0,,5.0,6.0,2.0,,...,,,,,,50055.450807,54374.463898,173.0,2.0,5.0
1,130379.0,12.0,2.0,1.0,66.0,,3.0,3.0,2.0,,...,,,,,,29087.450605,34084.721548,173.0,2.0,5.0
2,130380.0,12.0,2.0,2.0,44.0,,2.0,2.0,1.0,,...,,,,,,80062.674301,81196.277992,174.0,1.0,1.41
3,130381.0,12.0,2.0,2.0,5.0,,5.0,7.0,1.0,71.0,...,2.0,2.0,2.0,3.0,,38807.268902,55698.607106,182.0,2.0,1.53
4,130382.0,12.0,2.0,1.0,2.0,,3.0,3.0,2.0,34.0,...,2.0,2.0,3.0,1.0,2.0,30607.519774,36434.146346,182.0,2.0,3.6


### Keeping Only Relevant Columns:

In [None]:
# Restrict the table to the respondent identifier plus the demographic
# variables considered relevant to chronic disease prediction
relevant_columns = [
    "SEQN",
    "RIAGENDR",
    "RIDAGEYR",
    "RIDRETH3",
    "DMDEDUC2",
    "DMDMARTZ",
    "INDFMPIR",
    "DMDBORN4",
]
df_demo = df_demo[relevant_columns]


I dropped the following columns because:

| Column                                                     | Reason                                                            |
| ---------------------------------------------------------- | ----------------------------------------------------------------- |
| `SDDSRVYR`                                                 | All same cycle (2021–2023) → no variation                         |
| `RIDAGEMN`                                                 | Only for infants (<2 years) → mostly NaN                          |
| `RIDEXAGM`                                                 | Exam age in months, only recorded for ages ≤19 → NaN for adults   |
| `DMQMILIZ`                                                 | Military service → not relevant to disease prediction here        |
| `RIDEXMON`                                                 | Exam month → irrelevant for chronic disease risk                  |
| `DMDHRGND`, `DMDHRAGZ`, `DMDHREDZ`, `DMDHRMAZ`, `DMDHSEDZ` | Household reference person info → not the patient’s own data      |
| `RIDEXPRG`                                                 | Pregnancy status (narrow population subset, mostly missing)       |
| `WTINT2YR`                                                 | Only needed for survey weighting, not for ML                      |
| `SDMVSTRA`, `SDMVPSU`                                      | Only used for variance estimation in survey analysis              |
| `DMDYRUSR`                                                 | Years in U.S. → rarely used unless focusing on immigration health |


Keep marital status:

| Factor                         | Why it might matter                                                                                                      | Evidence                                                                                                                                                                      |
| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Social & emotional support** | Married individuals often have better emotional and practical support, affecting stress, diet, and healthcare behaviors. | CDC and NIH studies show lower rates of cardiovascular disease and mortality among married individuals ([CDC, 2022](https://www.cdc.gov/nchs/products/databriefs/db463.htm)). |
| **Economic stability**         | Married individuals may have higher combined income and better access to healthcare.                                     | NHIS/CDC analyses show income and marital status jointly influence chronic condition prevalence.                                                                              |
| **Health behavior linkage**    | Lifestyle factors (smoking, alcohol, diet) differ statistically by marital status.                                       | Research in *Journal of Health and Social Behavior* (2018) links marital transitions to changes in health risk.                                                               |


## Renaming columns for readability

In [None]:
# Map the raw survey variable codes to human-readable column names
readable_names = {
    'SEQN': 'ID',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age',
    'RIDRETH3': 'Ethnicity',
    'DMDEDUC2': 'Education',
    'DMDMARTZ': 'Marital_Status',
    'INDFMPIR': 'Income_Ratio',
    'DMDBORN4': 'Birth_Country',
}
df_demo = df_demo.rename(columns=readable_names)


In [None]:
# Preview the first rows to confirm the new column names
df_demo.head()

Unnamed: 0,ID,Gender,Age,Ethnicity,Education,Marital_Status,Income_Ratio,Birth_Country
0,130378.0,1.0,43.0,6.0,5.0,1.0,5.0,2.0
1,130379.0,1.0,66.0,3.0,5.0,1.0,5.0,1.0
2,130380.0,2.0,44.0,2.0,3.0,1.0,1.41,2.0
3,130381.0,2.0,5.0,7.0,,,1.53,1.0
4,130382.0,1.0,2.0,3.0,,,3.6,1.0


In [None]:
# Show dtypes and non-null counts per column before any rows are removed
df_demo.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11933 entries, 0 to 11932
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              11933 non-null  float64
 1   Gender          11933 non-null  float64
 2   Age             11933 non-null  float64
 3   Ethnicity       11933 non-null  float64
 4   Education       7794 non-null   float64
 5   Marital_Status  7792 non-null   float64
 6   Income_Ratio    9892 non-null   float64
 7   Birth_Country   11914 non-null  float64
dtypes: float64(8)
memory usage: 745.9 KB


In [None]:
# Keep adults only: remove participants younger than 20 years old.
# .copy() makes the result an independent DataFrame, so the in-place cleaning
# steps applied to it afterwards do not trigger pandas' SettingWithCopyWarning.

df_demo = df_demo[df_demo['Age'] >= 20].copy()


Participants younger than 20 are excluded because data about infants and children is not relevant for detecting chronic diseases, which typically appear in adulthood (from age 20 onward); including them would also bias the model.

In [None]:
# Count the missing values remaining in each column
df_demo.isna().sum()

Unnamed: 0,0
ID,0
Gender,0
Age,0
Ethnicity,0
Education,15
Marital_Status,17
Income_Ratio,1320
Birth_Country,16


In [None]:
# Drop the rows that are missing any of the categorical demographic fields.
# The result is reassigned instead of using inplace=True: df_demo is a
# row-filtered selection at this point, and mutating such a selection in place
# can trigger pandas' SettingWithCopyWarning.
# NOTE(review): survey data like this often encodes "Refused" / "Don't know"
# as numeric codes (e.g. 7/9 or 77/99) rather than NaN — confirm against the
# DEMO_L codebook whether those rows should be removed here as well.

df_demo = df_demo.dropna(subset=['Education', 'Marital_Status', 'Birth_Country'])


In [None]:
# Fill missing Income_Ratio values with the column median rather than dropping
# the rows; the median is used instead of the mean so outliers do not skew it
income_median = df_demo['Income_Ratio'].median()
df_demo['Income_Ratio'] = df_demo['Income_Ratio'].fillna(income_median)

Missing `Income_Ratio` values are filled with the median. `Income_Ratio` is the ratio of family income to the poverty level (a value of 1 marks the poverty threshold), so imputing with the median keeps the filled-in values at a typical level and avoids biasing the model — unlike filling with 0, which would falsely imply extreme poverty.

In [None]:
# Re-check row count, dtypes and non-null counts after cleaning
df_demo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7792 entries, 0 to 11932
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              7792 non-null   float64
 1   Gender          7792 non-null   float64
 2   Age             7792 non-null   float64
 3   Ethnicity       7792 non-null   float64
 4   Education       7792 non-null   float64
 5   Marital_Status  7792 non-null   float64
 6   Income_Ratio    7792 non-null   float64
 7   Birth_Country   7792 non-null   float64
dtypes: float64(8)
memory usage: 547.9 KB


In [None]:
# Preview the first rows of the cleaned table
df_demo.head()

Unnamed: 0,ID,Gender,Age,Ethnicity,Education,Marital_Status,Income_Ratio,Birth_Country
0,130378.0,1.0,43.0,6.0,5.0,1.0,5.0,2.0
1,130379.0,1.0,66.0,3.0,5.0,1.0,5.0,1.0
2,130380.0,2.0,44.0,2.0,3.0,1.0,1.41,2.0
6,130384.0,1.0,43.0,1.0,2.0,3.0,0.63,2.0
7,130385.0,2.0,65.0,3.0,3.0,1.0,5.0,1.0


In [None]:
# Write the cleaned demographics table to CSV, leaving out the row index
df_demo.to_csv(
    path_or_buf='/content/drive/MyDrive/Healthcare Prediction Task/data/cleaned/demographics.csv',
    index=False,
)