# Import

In [1]:
import pandas as pd
import numpy as np
from functools import reduce

# Load Dataset

In [2]:
# Every NHANES component file is read from SAS transport (XPT) format and
# narrowed straight away to SEQN (respondent sequence number, the join key)
# plus the variables listed below.
def _read_xpt_columns(path, columns):
    """Read the XPT file at ``path`` and return only ``columns``, in that order."""
    return pd.read_sas(path)[columns]


# Variables kept from each component:
#   DEMO (demographics)      - RIAGENDR: gender, RIDAGEYR: age, RIDEXPRG: pregnancy
#   SLQ  (sleep disorders)   - SLQ040: how often do you snort or stop breathing,
#                              SLQ050: ever told doctor had trouble sleeping
#                              (the sleep file also carries SLD012 sleep hours and
#                              SLQ030 snoring frequency; neither is kept)
#   PAQ  (physical activity) - PAQ605: vigorous work activity, PAQ620: moderate work activity
#   SMQ  (cigarette use)     - SMQ040: do you now smoke cigarettes?
#   ALQ  (alcohol use)       - ALQ130: avg alcoholic drinks/day, past 12 months
#   DIQ  (diabetes)          - DIQ010: doctor told you have diabetes
#   BMX  (body measures)     - BMXBMI: body mass index
_demo_columns = ['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDEXPRG']
_sleep_columns = ['SEQN', 'SLQ040', 'SLQ050']
_physical_columns = ['SEQN', 'PAQ605', 'PAQ620']
_smoking_columns = ['SEQN', 'SMQ040']
_alcohol_columns = ['SEQN', 'ALQ130']
_diabetes_columns = ['SEQN', 'DIQ010']
_bmi_columns = ['SEQN', 'BMXBMI']

# 2015-2016 cycle
df_demo1 = _read_xpt_columns("2015-2016_DEMO_I.XPT", _demo_columns)
df_sleep1 = _read_xpt_columns("2015-2016_SLQ_I.XPT", _sleep_columns)
df_physical1 = _read_xpt_columns("2015-2016_PAQ_I.XPT", _physical_columns)
df_smoking1 = _read_xpt_columns("2015-2016_SMQ_I.XPT", _smoking_columns)
df_alcohol1 = _read_xpt_columns("2015-2016_ALQ_I.XPT", _alcohol_columns)
df_diabetes1 = _read_xpt_columns("2015-2016_DIQ_I.XPT", _diabetes_columns)
df_bmi1 = _read_xpt_columns("2015-2016_BMX_I.XPT", _bmi_columns)

# 2017-2018 cycle
df_demo2 = _read_xpt_columns("2017-2018_DEMO_J.XPT", _demo_columns)
df_sleep2 = _read_xpt_columns("2017-2018_SLQ_J.XPT", _sleep_columns)
df_physical2 = _read_xpt_columns("2017-2018_PAQ_J.XPT", _physical_columns)
df_smoking2 = _read_xpt_columns("2017-2018_SMQ_J.XPT", _smoking_columns)
df_alcohol2 = _read_xpt_columns("2017-2018_ALQ_J.XPT", _alcohol_columns)
df_diabetes2 = _read_xpt_columns("2017-2018_DIQ_J.XPT", _diabetes_columns)
df_bmi2 = _read_xpt_columns("2017-2018_BMX_J.XPT", _bmi_columns)

In [3]:
def _merge_on_seqn(frames):
    """Inner-join ``frames`` left to right on the respondent id column 'SEQN'.

    Because the join is an inner join, only respondents that appear in every
    frame of the list survive.
    """
    return reduce(lambda left, right: pd.merge(left, right, on='SEQN', how='inner'), frames)


# 2015-2016 merged by 'SEQN'
dfs1 = [df_demo1, df_sleep1, df_physical1, df_smoking1, df_alcohol1, df_diabetes1, df_bmi1]
df_2015 = _merge_on_seqn(dfs1)

# 2017-2018 merged by 'SEQN'
dfs2 = [df_demo2, df_sleep2, df_physical2, df_smoking2, df_alcohol2, df_diabetes2, df_bmi2]
df_2017 = _merge_on_seqn(dfs2)

# Stack the two cycles row-wise. Each merged frame carries its own 0..n-1 row
# labels, so ignore_index=True is needed to give the combined frame unique
# labels; without it every label from the shorter frame occurs twice, which
# makes label-based lookups and index alignment ambiguous later on.
df = pd.concat([df_2015, df_2017], axis=0, ignore_index=True)

df.shape

(11268, 12)

# Data Preprocessing

In [4]:
# Demographic filters

# Remove rows whose pregnancy code (RIDEXPRG) equals 1, then discard the
# column, which is not used afterwards.
not_pregnant = df['RIDEXPRG'].ne(1)
df = df[not_pregnant].drop(columns='RIDEXPRG')

# Keep respondents aged 18 or older (the comparison is inclusive).
is_adult = df['RIDAGEYR'].ge(18)
df = df[is_adult]

In [5]:
# Keep rows where at least one of the two sleep questions holds an accepted code.
# NOTE(review): the two checks are OR-ed, so a row passes even when one of the
# two answers is outside valid_values — confirm that is intended.
valid_values = [0, 1, 2, 3]
snort_answer_ok = df['SLQ040'].isin(valid_values)
trouble_answer_ok = df['SLQ050'].isin(valid_values)
df = df[snort_answer_ok | trouble_answer_ok]

In [6]:
# Keep rows with accepted codes for physical activity, smoking and diabetes:
# at least one of the two activity questions must be valid, and both the
# smoking and the diabetes answers must be valid.
valid_values = [1, 2, 3]
activity_ok = df['PAQ605'].isin(valid_values) | df['PAQ620'].isin(valid_values)
smoking_ok = df['SMQ040'].isin(valid_values)
diabetes_ok = df['DIQ010'].isin(valid_values)
df = df[activity_ok & smoking_ok & diabetes_ok]

In [7]:
# Keep rows whose average-drinks answer (ALQ130) is one of the whole numbers 1..15.
valid_values = list(range(1, 16))
alcohol_ok = df['ALQ130'].isin(valid_values)
df = df[alcohol_ok]

In [8]:
# Drop rows whose BMI cannot be interpreted as a number: values that fail
# numeric conversion are coerced to NaN and removed together with existing NaNs.
columns_to_check = ['BMXBMI']

for col in columns_to_check:
    as_number = pd.to_numeric(df[col], errors='coerce')
    df = df[as_number.notna()]

In [9]:
# Same BMI validity filter as the loop-based cell just before it: rows whose
# 'BMXBMI' is missing or non-numeric are removed. Once that filter has run,
# applying it again leaves the rows unchanged.
bmi_as_number = pd.to_numeric(df['BMXBMI'], errors='coerce')
df = df[bmi_as_number.notna()]

In [10]:
# Set every cell smaller than 1e-10 to exactly 0, across all columns.
# The threshold test is one-sided, so any negative value is zeroed as well.
# NOTE(review): presumably this targets near-zero float artefacts from the XPT
# reader rather than statistical outliers — confirm. It also runs after the
# earlier `isin` filters, so a zero code stored as a tiny float would not have
# matched 0 in those filters — confirm the ordering is intended.
below_threshold = df.lt(1e-10)
df[below_threshold] = 0
df.shape

(3054, 11)

In [11]:
# Tally respondents by gender code (RIAGENDR): code 1 is reported as male,
# code 2 as female; a code that never occurs counts as 0.
gender_counts = df['RIAGENDR'].value_counts()
count_male = gender_counts.get(1, 0)
count_female = gender_counts.get(2, 0)

print(f'male：{count_male}')
print(f'female：{count_female}')

male：1904
female：1150


# Set categorical features

In [12]:
# Sleep label: 0 = no sleep problems, 1 = sleep disorder
def apply_condition_sleep(df):
    """Return the sleep-disorder flag for a single row.

    ``df`` is one row (anything indexable by column name). The flag is 1 when
    'SLQ040' is 1, 2 or 3, or when 'SLQ050' equals 1; otherwise it is 0.
    'SLQ050' is only consulted when the 'SLQ040' check does not already match.
    """
    if df['SLQ040'] in (1, 2, 3):
        return 1
    return 1 if df['SLQ050'] == 1 else 0

# Label every row, store the flag as a new column, and report how many rows are flagged.
df['sleep_disorder'] = df.apply(apply_condition_sleep, axis='columns')
sleep_disorder = df['sleep_disorder'].sum()
print(f'sleep disorder：{sleep_disorder}')

sleep disorder：1474


In [13]:
# Physical activity label: 0 = no physical activity, 1 = has physical activity
def apply_condition_physical(df):
    """Return the physical-activity flag for a single row.

    ``df`` is one row (anything indexable by column name). The flag is 1 when
    'PAQ605' or 'PAQ620' equals 1, checked in that order; otherwise it is 0.
    """
    for activity_column in ('PAQ605', 'PAQ620'):
        if df[activity_column] == 1:
            return 1
    return 0

# Label every row, store the flag as a new column, and report how many rows are flagged.
df['physical_activity'] = df.apply(apply_condition_physical, axis='columns')
physical_activity = df['physical_activity'].sum()
print(f'physical_activity：{physical_activity}')

physical_activity：1666


In [14]:
# Smoking label: 0 = not smoking, 1 = smoking
def apply_condition_smoking(df):
    """Return the smoking flag for a single row.

    ``df`` is one row (anything indexable by column name). The flag is 0 only
    when 'SMQ040' equals 3; every other value yields 1.
    """
    return 0 if df['SMQ040'] == 3 else 1

# Label every row, store the flag as a new column, and report how many rows are flagged.
df['smoking'] = df.apply(apply_condition_smoking, axis='columns')
smoking = df['smoking'].sum()
print(f'smoking：{smoking}')

smoking：1465


In [15]:
# Alcohol label: 0 = no or less alcohol, 1 = alcohol
def apply_condition_alcohol(df):
    """Return the alcohol flag for a single row.

    ``df`` is one row (anything indexable by column name). The flag is 1 only
    when 'ALQ130' equals 15; every other value yields 0.

    NOTE(review): only the single value 15 is flagged — confirm this threshold
    matches the intended meaning of the "alcohol" category.
    """
    return 1 if df['ALQ130'] == 15 else 0

# Label every row, store the flag as a new column, and report how many rows are flagged.
df['alcohol'] = df.apply(apply_condition_alcohol, axis='columns')
alcohol = df['alcohol'].sum()
print(f'alcohol：{alcohol}')

alcohol：25


In [16]:
# Diabetes label: 0 = no diabetes, 1 = diabetes
def apply_condition_diabetes(df):
    """Return the diabetes flag for a single row.

    ``df`` is one row (anything indexable by column name). The flag is 1 only
    when 'DIQ010' equals 1; every other value yields 0.
    """
    return 1 if df['DIQ010'] == 1 else 0

# Label every row, store the flag as a new column, and report how many rows are flagged.
df['diabetes'] = df.apply(apply_condition_diabetes, axis='columns')
diabetes = df['diabetes'].sum()
print(f'diabetes：{diabetes}')

diabetes：438


from sklearn.preprocessing import StandardScaler

# Standardise BMI into a separate '<feature>_scaled' column instead of
# overwriting the source column. The cell that follows bins 'BMXBMI' with cut
# points on the original BMI scale (0 / 24.9 / 29.9); if the raw values were
# replaced by standardised ones first, a top-to-bottom run would bin the
# standardised values, leaving negative values unbinned and the rest in the
# lowest bins.
scaler = StandardScaler()
feature_to_scale = 'BMXBMI'
# fit_transform expects a 2-D input, hence the double brackets.
df[f'{feature_to_scale}_scaled'] = scaler.fit_transform(df[[feature_to_scale]])

In [17]:
# Bin BMI into three labelled categories. pd.cut uses right-closed intervals by
# default, so the bins are (0, 24.9], (24.9, 29.9] and (29.9, inf); values at or
# below 0 fall outside every bin and become NaN.
bmi_edges = [0, 24.9, 29.9, float('inf')]
bmi_labels = ['Normal Weight', 'Overweight', 'Obesity']
df['BMI_bins'] = pd.cut(df['BMXBMI'], bins=bmi_edges, labels=bmi_labels)

# Count each category once and look the three totals up from that single tally.
bmi_bin_counts = df['BMI_bins'].value_counts()
normal_count = bmi_bin_counts.get('Normal Weight', 0)
Overweight_count = bmi_bin_counts.get('Overweight', 0)
Obesity_count = bmi_bin_counts.get('Obesity', 0)

print(f'The count of "Normal Weight" in the column is: {normal_count}')
print(f'The count of "Overweight" in the column is: {Overweight_count}')
print(f'The count of "Obesity" in the column is: {Obesity_count}')

The count of "Normal Weight" in the column is: 791
The count of "Overweight" in the column is: 976
The count of "Obesity" in the column is: 1287


In [18]:
# Modelling table: the five binary flags plus the BMI category.
ml_columns = [
    'sleep_disorder',
    'physical_activity',
    'smoking',
    'diabetes',
    'alcohol',
    'BMI_bins',
]
df_ml = df[ml_columns]
df_ml.head()

Unnamed: 0,sleep_disorder,physical_activity,smoking,diabetes,alcohol,BMI_bins
0,1,1,0,1,0,Overweight
1,0,0,1,0,0,Obesity
6,0,0,1,0,0,Overweight
10,1,0,1,0,0,Overweight
11,0,0,0,0,0,Normal Weight
