In [8]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression, mutual_info_regression
from label_parsing import label_parser
import matplotlib.pyplot as plt

# Read in Data Set + Info

In [4]:
df = pd.read_csv('adult21.csv')
df.head()

Unnamed: 0,URBRRL,RATCAT_A,IMPINCFLG_A,CVDVAC2YR_A,CVDVAC2MR_A,CVDVAC1YR_A,CVDVAC1MR_A,SHTCVD19AV_A,SHTCVD19NM_A,SHTCVD19_A,...,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_MON,RECTYPE,IMPNUM_A,WTFA_A,HHX,POVRATTC_A
0,4,7,0,,,,,,,,...,,,1,1,1,10,1,5423.324,H056808,1.93
1,4,12,0,,,,,,,,...,,,1,1,1,10,1,3832.196,H018779,4.45
2,4,14,0,,,,,,,,...,,,1,1,1,10,1,3422.661,H049265,5.94
3,3,11,0,,,,,,,,...,,,1,1,1,10,1,12960.165,H007699,3.7
4,1,6,1,,,,,,,,...,,,1,1,1,10,1,9284.618,H066034,1.66


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29482 entries, 0 to 29481
Columns: 622 entries, URBRRL to POVRATTC_A
dtypes: float64(437), int64(184), object(1)
memory usage: 139.9+ MB


# Cleaning

In [6]:
# Vars
countColNan = df.isnull().sum(axis=0).tolist() # Column NaN counts
numRows = len(df) # Number of rows
colNames = df.columns

percList = [((x / numRows)*100) for x in countColNan] # Percent Missing Data

addBool = [ind < 1  for ind in percList] 
addList = []

for i in range(len(addBool)):
    if addBool[i]:
        addList.append(colNames[i])



dfNew = df[addList].copy()
dfNew = dfNew.drop(['HHX'], axis=1)
dfNew.dropna()

print("Number of remaining Columns:", len(dfNew.columns))

goodCols = ['EDUCP_A', 'PHSTAT_A','LSATIS11R_A', 'LSATIS4R_A', 'ANXEV_A', 'DEPEV_A','WEIGHTLBTC_A','NOTCOV_A']#'WELLVIS_A']

dfPCA = dfNew[goodCols]

Number of remaining Columns: 189


In [10]:
df = pd.read_csv('adult21.csv')

# threshold for null values
null_thresh = 0.01

pruned_df = df[df.columns[df.isnull().mean() < null_thresh]]

print(len(pruned_df.columns))
# Set up the label parser
lp = label_parser()

for i in pruned_df.columns:
    print(lp.find_var_label(i))

190
2013 NCHS Urban-Rural Classification Scheme for Counties
Ratio of family income to poverty threshold for SA's family
Imputed SA family income imputation flag
Pseudo-PSU for public-use file variance estimation
Pseudo-stratum for public-use file variance estimation
Single and multiple race groups with Hispanic origin
Single and multiple race groups
The Washington Group Short Set Composite Disability Indicator
Experienced serious psychological distress - K6 scale
Citizenship status
Legal marital status of sample adult
Current marital status of sample adult
Number of adults in sample adult's family who are working full-time
Number of adults in sample adult's family who are working
Adult  4 category food security recode
Adult 3 category food security recode
Worked last week
Electronic cigarette use status
Cigarette smoking status
Categorical Body Mass Index, public use
Weight without shoes (pounds), public use
Height without shoes (inches), public use
Number of times visited urgent care

# Dimension Reduction

In [13]:
# print(set(dfNew.dtypes))
dfTest = dfNew.select_dtypes(['int64'])
# print(set(dfTest.dtypes))

print("Targeting by weight (chi2)")
target = 'WEIGHTLBTC_A'

features = list(dfTest.columns)
features.remove(target)

topList = []
y = dfTest[target]
X = dfTest[features]

bestfeatures = SelectKBest(score_func=chi2, k=15)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  
topList = featureScores.nlargest(15,'Score')
df1 = pd.DataFrame(topList)

nameList = []
for i in df1["Specs"]:
    nameList.append(lp.find_var_label(i))

df1["Description"]= nameList
df1

Targeting by weight (chi2)


Unnamed: 0,Specs,Score,Description
20,HEIGHTTC_A,24261.5874,"Height without shoes (inches), public use"
19,BMICAT_A,10956.425163,"Categorical Body Mass Index, public use"
3,PPSU,6217.966962,Pseudo-PSU for public-use file variance estima...
50,AGEP_A,5460.670727,Age of SA (top coded)
26,LSATIS11R_A,4357.351769,Life satisfaction - 11 category question
36,EDUCP_A,1779.558553,Educational level of sample adult
49,SEX_A,1000.759805,Sex of Sample Adult
22,EMERG12MTC_A,895.627899,Number of times visited hospital emergency roo...
1,RATCAT_A,845.284013,Ratio of family income to poverty threshold fo...
177,PHSTAT_A,735.434179,General health status


In [14]:
print("Targeting by life satisfaction (chi2)")
target = 'LSATIS11R_A'

features = list(dfTest.columns)
features.remove(target)

topList = []
y = dfTest[target]
X = dfTest[features]

bestfeatures = SelectKBest(score_func=chi2, k=15)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  
topList = featureScores.nlargest(15,'Score')
df2 = pd.DataFrame(topList)

nameList = []
for i in df2["Specs"]:
    nameList.append(lp.find_var_label(i))

df2["Description"]= nameList
df2

Targeting by life satisfaction (chi2)


Unnamed: 0,Specs,Score,Description
20,WEIGHTLBTC_A,29439.183665,"Weight without shoes (pounds), public use"
9,CITZNSTP_A,11008.884074,Citizenship status
61,NATUSBORN_A,10049.626626,Born in U.S. or U.S. territory
15,FDSCAT3_A,9879.960904,Adult 3 category food security recode
16,EMPWRKLSW1_A,9669.733605,Worked last week
14,FDSCAT4_A,9225.390292,Adult 4 category food security recode
59,INCWRKO_A,8025.874761,Income from wages
75,TASTEDF_A,7980.873946,Difficulty tasting
52,HOUTENURE_A,7693.939514,Residence owned/rented
78,SMELLDF_A,7472.813458,Difficulty smelling
