In [1]:
#Initial imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, plot_confusion_matrix
from sklearn.metrics import confusion_matrix, plot_roc_curve, precision_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, StackingRegressor, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB


# Display option: None removes the cap on how many columns pandas shows for a DataFrame.
pd.set_option('display.max_columns', None)

`A1_RELATION` / `A2_RELATION` and `A1_MENTHEALTH` / `A2_MENTHEALTH` could be engineered into a single feature indicating whether the child has a biological parent with poor mental health.

In [2]:
# Recoding applied to K2Q35A: 1.0 stays 1.0 and 2.0 becomes 0.0.
# Any value that is not a key of this dict (including missing) maps to NaN.
mask = {1.0: 1.0, 2.0: 0.0}

# Read the SAS file, attach the recoded answers as the 'Target' column, and keep
# only the rows for which the recoding produced a value.
df = (
    pd.read_sas('../../data/nsch_2020_topical.sas7bdat')
    .assign(Target=lambda frame: frame['K2Q35A'].map(mask))
    .dropna(subset=['Target'])
)

In [None]:
# Hand-picked columns to leave out of the model; the loop below extends this list.
# NOTE(review): presumably passed to a DataFrame.drop / column selection in a later
# cell that is not visible here -- confirm.
drop_cols = ['Target', 'K2Q35A', 'K2Q35A_1_YEARS', 'K2Q35B', 'K2Q35C', 'K2Q35D', 'AUTISMMED', 'AUTISMTREAT',
             'HHLANGUAGE', 'K4Q32X01', 'K4Q32X02', 'K4Q32X03', 'K4Q32X04', 'K4Q32X05', 'DENTALSERV1',
             'DENTALSERV2', 'DENTALSERV3', 'DENTALSERV4', 'DENTALSERV5', 'DENTALSERV6', 'DENTALSERV7',
             'K11Q43R', 'A1_AGE', 'A2_AGE', 'FAMCOUNT', 'BIRTH_YR', 'BIRTH_MO', 'MEMORYCOND', 'WALKSTAIRS',
             'DRESSING', 'K12Q03', 'K12Q04', 'K12Q12', 'TRICARE', 'K11Q03R', 'HCCOVOTH', 'K3Q25', 'REPEATED',
             'K7Q30', 'K7Q31', 'K7Q37', 'BORNUSA', 'EMOSUPSPO', 'EMOSUPFAM', 'EMOSUPHCP', 'EMOSUPWOR',
             'EMOSUPADV', 'EMOSUPPEER', 'EMOSUPMHP', 'K8Q35', 'EMOSUPOTH', 'K9Q96', 'A1_SEX', 'A1_BORN',
             'A1_EMPLOYED', 'A1_MARITAL', 'A1_RELATION', 'A2_SEX', 'A2_BORN', 'A2_EMPLOYED', 'A2_GRADE',
             'A2_MARITAL', 'A2_RELATION', 'A1_ACTIVE', 'A2_ACTIVE', 'A1_PHYSHEALTH', 'A1_MENTHEALTH',
             'A2_PHYSHEALTH', 'A2_MENTHEALTH', 'K3Q21B', ]

# Fraction of missing values in every column, computed once for the whole frame
# (equivalent to isna().sum() / len(df) per column).
null_fracs = df.isna().mean()

# Additionally drop every column that is more than half missing or is stored with
# object dtype. `already_listed` guarantees each name ends up in drop_cols at most
# once, even when a column meets both conditions or was already listed by hand.
already_listed = set(drop_cols)
for col in df.columns:
    if col in already_listed:
        continue
    if null_fracs[col] > 0.5 or df[col].dtype == 'object':
        drop_cols.append(col)
        already_listed.add(col)

def _single_step_imputer(step_name, **imputer_kwargs):
    """Return a one-step Pipeline whose only step, named `step_name`, is a SimpleImputer.

    `imputer_kwargs` are forwarded unchanged to SimpleImputer.
    """
    return Pipeline(steps=[(step_name, SimpleImputer(**imputer_kwargs))])


def _constant_imputer(step_name, fill_value):
    """Return a one-step Pipeline that replaces missing entries with `fill_value`."""
    return _single_step_imputer(step_name, strategy='constant', fill_value=fill_value)


# Each section below pairs a list of column names with the imputer meant for them.
# NOTE(review): presumably these pairs are combined in a ColumnTransformer in a
# later cell that is not visible here -- confirm.

# --- most frequent value ---
mode_cols = [
    'BREATHING', 'SWALLOWING', 'STOMACH', 'PHYSICALPAIN', 'TOOTHACHES', 'GUMBLEED',
    'CAVITIES', 'K2Q43B', 'BLINDNESS', 'ALLERGIES', 'ARTHRITIS', 'K2Q40A',
    'K2Q61A', 'K2Q41A', 'K2Q42A', 'HEART', 'HEADACHE', 'K2Q38A',
    'K2Q33A', 'K2Q32A', 'DOWNSYN', 'BLOOD', 'CYSTFIB', 'GENETIC',
    'K2Q34A', 'K2Q36A', 'K2Q60A', 'K2Q37A', 'K2Q30A', 'K2Q31A',
    'CONCUSSION', 'K2Q05', 'K4Q23', 'ALTHEALTH', 'K4Q27', 'HOSPITALSTAY',
    'K6Q15', 'K4Q36', 'K5Q10', 'DECISIONS', 'STOPWORK', 'CUTHOURS',
    'AVOIDCHG', 'K9Q40', 'K11Q60', 'K11Q61', 'K11Q62', 'S9Q34',
    'K10Q13', 'K10Q20', 'K10Q22', 'K10Q23', 'ACE3', 'ACE4',
    'ACE5', 'ACE6', 'ACE7', 'S4Q01', 'OVERWEIGHT', 'K4Q01',
    'USUALGO', 'USUALSICK', 'K4Q31_R', 'CURRCOV', 'K10Q11', 'K10Q12',
    'K10Q14', 'ACE8', 'ACE9', 'ACE10', 'ACE12', 'HOWMUCH',
    'HOURSLEEP', 'K8Q11', 'FOODSIT',
]
mode_imp = _single_step_imputer('mode_imp', strategy='most_frequent')

# --- median ---
median_cols = [
    'MOMAGE', 'K2Q01', 'K2Q01_D', 'K6Q71_R', 'TALKABOUT', 'WKTOSOLVE',
    'STRENGTHS', 'HOPEFUL', 'K10Q30', 'K10Q31', 'K10Q40_R', 'GOFORHELP',
    'K10Q41_R', 'K8Q31', 'K8Q32', 'K8Q34', 'C4Q04',
]
median_imp = _single_step_imputer('median_imp', strategy='median')

# --- constant 4 ---
four_cols = [
    'K5Q40', 'K5Q41', 'K5Q42', 'K5Q43', 'K5Q44', 'K3Q20',
    'K3Q22', 'K7Q85_R', 'K7Q84_R', 'K7Q82_R', 'K7Q83_R', 'K7Q70_R',
]
four_imp = _constant_imputer('four_imp', 4)

# --- constant 1 ---
one_cols = [
    'BULLIED_R', 'BULLY', 'K7Q04R_R', 'PHYSACTIV', 'SCREENTIME', 'HCABILITY',
    'K4Q20R', 'DOCROOM', 'HOSPITALER',
]
one_imp = _constant_imputer('one_imp', 1)

# --- constant 6 ---
six_cols = ['K7Q02R_R']
six_imp = _constant_imputer('six_imp', 6)

# --- constant 3 ---
three_cols = ['WGTCONC', 'K4Q22_R', 'K4Q24_R', 'K4Q04_R']
three_imp = _constant_imputer('three_imp', 3)

# --- constant 7 ---
seven_cols = ['K4Q02_R']
seven_imp = _constant_imputer('seven_imp', 7)

# --- constant 2 ---
two_cols = ['DENTISTVISIT']
two_imp = _constant_imputer('two_imp', 2)

In [18]:
# Distribution of the non-missing answers in K4Q04_R.
# Read the column from `df`, not `X`: `X` is only created in a later cell (as `df`
# without 'Target'), so referencing it here raises NameError on Restart & Run All.
# The column itself is the same in both frames.
df['K4Q04_R'].value_counts()

1.0    24536
3.0    10289
2.0     7582
Name: K4Q04_R, dtype: int64

In [4]:
# Feature matrix: every column of df except the label. Label vector: the 'Target' column.
X, y = df.drop('Target', axis='columns'), df['Target']

In [22]:
# Count the missing entries of every column in one pass, then list, in column
# order, only the columns that have at least one missing entry.
missing_per_col = X.isna().sum()
for col, n_missing in missing_per_col.items():
    if n_missing > 0:
        print(f"{col}: {n_missing}")

HHLANGUAGE: 151
BIRTH_MO: 260
BIRTH_YR: 379
MOMAGE: 907
K4Q32X01: 15926
K4Q32X02: 15926
K4Q32X03: 15926
K4Q32X04: 15926
K4Q32X05: 15926
DENTALSERV1: 9544
DENTALSERV2: 9544
DENTALSERV3: 9544
DENTALSERV4: 9544
DENTALSERV5: 9544
DENTALSERV6: 9544
DENTALSERV7: 9544
K11Q43R: 1083
A1_AGE: 1282
A2_AGE: 8107
FAMCOUNT: 1318
BREATHING: 80
SWALLOWING: 150
STOMACH: 207
PHYSICALPAIN: 161
TOOTHACHES: 222
GUMBLEED: 257
CAVITIES: 123
MEMORYCOND: 12356
WALKSTAIRS: 12378
DRESSING: 12391
K2Q43B: 149
BLINDNESS: 99
ALLERGIES: 63
ARTHRITIS: 453
K2Q40A: 315
K2Q61A: 138
K2Q41A: 124
K2Q42A: 105
HEART: 65
HEADACHE: 92
K2Q38A: 145
K2Q33A: 140
K2Q32A: 112
DOWNSYN: 140
BLOOD: 53
CYSTFIB: 58
GENETIC: 118
K2Q34A: 95
K2Q36A: 142
K2Q60A: 95
K2Q37A: 107
K2Q30A: 115
K2Q31A: 156
CONCUSSION: 140
K2Q05: 535
S4Q01: 70
OVERWEIGHT: 170
K4Q01: 180
USUALGO: 243
USUALSICK: 3197
K4Q31_R: 109
K4Q23: 597
ALTHEALTH: 375
K4Q27: 125
HOSPITALSTAY: 131
K6Q15: 181
K4Q36: 493
K5Q10: 225
DECISIONS: 6940
CURRCOV: 164
K12Q03: 2155
K12Q04: 21