In [8]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression, mutual_info_regression
from label_parsing import label_parser
import matplotlib.pyplot as plt

# Read in Data Set + Info

In [4]:
# Load the raw survey extract from the notebook's working directory.
df = pd.read_csv('adult21.csv')

# Preview the first five rows to sanity-check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,URBRRL,RATCAT_A,IMPINCFLG_A,CVDVAC2YR_A,CVDVAC2MR_A,CVDVAC1YR_A,CVDVAC1MR_A,SHTCVD19AV_A,SHTCVD19NM_A,SHTCVD19_A,...,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_MON,RECTYPE,IMPNUM_A,WTFA_A,HHX,POVRATTC_A
0,4,7,0,,,,,,,,...,,,1,1,1,10,1,5423.324,H056808,1.93
1,4,12,0,,,,,,,,...,,,1,1,1,10,1,3832.196,H018779,4.45
2,4,14,0,,,,,,,,...,,,1,1,1,10,1,3422.661,H049265,5.94
3,3,11,0,,,,,,,,...,,,1,1,1,10,1,12960.165,H007699,3.7
4,1,6,1,,,,,,,,...,,,1,1,1,10,1,9284.618,H066034,1.66


In [5]:
# Print a structural summary of the raw frame: index range, column count,
# dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29482 entries, 0 to 29481
Columns: 622 entries, URBRRL to POVRATTC_A
dtypes: float64(437), int64(184), object(1)
memory usage: 139.9+ MB


# Cleaning

In [6]:
# Columns are kept only when less than this percentage of their values is missing.
MAX_MISSING_PCT = 1

# Per-column missing-value statistics
countColNan = df.isnull().sum(axis=0).tolist()  # NaN count for every column
numRows = len(df)  # Number of rows
colNames = df.columns

percList = [(x / numRows) * 100 for x in countColNan]  # Percent missing per column

# Names of the columns that pass the missing-data threshold
addBool = [perc < MAX_MISSING_PCT for perc in percList]
addList = [name for name, keep in zip(colNames, addBool) if keep]

# Work on a copy so the raw frame is left untouched, then drop HHX
# (string-valued in the preview, e.g. 'H056808').
dfNew = df[addList].copy()
dfNew = dfNew.drop(columns=['HHX'])

# dropna() returns a new frame rather than modifying in place, so the result
# must be assigned back; otherwise rows with missing values are silently kept.
dfNew = dfNew.dropna()

print("Number of remaining Columns:", len(dfNew.columns))

# Hand-picked subset of columns used to build dfPCA
# ('WELLVIS_A' is currently excluded from the list).
goodCols = ['EDUCP_A', 'PHSTAT_A', 'LSATIS11R_A', 'LSATIS4R_A', 'ANXEV_A', 'DEPEV_A', 'WEIGHTLBTC_A', 'NOTCOV_A']

dfPCA = dfNew[goodCols]

Number of remaining Columns: 189


In [15]:
# Reload the raw survey extract.
df = pd.read_csv('adult21.csv')

# Largest fraction of null values a column may contain and still be kept
null_thresh = 0.01

# Boolean-mask column selection: keep columns whose null fraction is below the threshold.
pruned_df = df.loc[:, df.isnull().mean() < null_thresh]

# Report how many columns survived the pruning.
print(pruned_df.shape[1])

# Label parser instance, used to look up variable descriptions by name
lp = label_parser()

190


# Dimension Reduction

In [35]:
# Rank features against each target column with a chi-squared score and keep
# the TOP_K highest-scoring ones, annotated with a description of each variable.
targetDesc = ["Targeting by life satisfaction (chi2)", "Targeting by weight (chi2)"]
targetList = ['LSATIS11R_A', 'WEIGHTLBTC_A']

# Number of top-scoring features reported per target
TOP_K = 15

# One ranked-feature DataFrame per target, in the same order as targetList
dfList = []

# NOTE(review): dfTest is not defined in any cell visible here. It presumably
# is a cleaned, all-numeric, NaN-free frame; define it before this cell so the
# notebook survives Restart & Run All. TODO confirm its origin.
# NOTE(review): chi2 requires non-negative features and treats y as class
# labels; confirm that is appropriate for a target such as WEIGHTLBTC_A.
for target, desc in zip(targetList, targetDesc):
    # Every column except the current target is a candidate feature.
    X = dfTest.drop(columns=[target])
    y = dfTest[target]

    # Only the per-feature scores are read from the fitted selector.
    selector = SelectKBest(score_func=chi2, k=TOP_K).fit(X, y)

    featureScores = pd.DataFrame({'Specs': X.columns, 'Score': selector.scores_})

    # nlargest returns a new frame, so adding columns does not touch featureScores.
    topFeatures = featureScores.nlargest(TOP_K, 'Score')
    topFeatures["Description"] = [lp.find_var_label(name) for name in topFeatures["Specs"]]
    topFeatures["Target"] = desc

    dfList.append(topFeatures)

In [26]:
dfList[0]

Unnamed: 0,Specs,Score,Description,Target
20,WEIGHTLBTC_A,29439.183665,"Weight without shoes (pounds), public use",Targeting by life satisfaction (chi2)
9,CITZNSTP_A,11008.884074,Citizenship status,Targeting by life satisfaction (chi2)
61,NATUSBORN_A,10049.626626,Born in U.S. or U.S. territory,Targeting by life satisfaction (chi2)
15,FDSCAT3_A,9879.960904,Adult 3 category food security recode,Targeting by life satisfaction (chi2)
16,EMPWRKLSW1_A,9669.733605,Worked last week,Targeting by life satisfaction (chi2)
14,FDSCAT4_A,9225.390292,Adult 4 category food security recode,Targeting by life satisfaction (chi2)
59,INCWRKO_A,8025.874761,Income from wages,Targeting by life satisfaction (chi2)
75,TASTEDF_A,7980.873946,Difficulty tasting,Targeting by life satisfaction (chi2)
52,HOUTENURE_A,7693.939514,Residence owned/rented,Targeting by life satisfaction (chi2)
78,SMELLDF_A,7472.813458,Difficulty smelling,Targeting by life satisfaction (chi2)


In [27]:
dfList[1]

Unnamed: 0,Specs,Score,Description,Target
20,HEIGHTTC_A,24261.5874,"Height without shoes (inches), public use",Targeting by weight (chi2)
19,BMICAT_A,10956.425163,"Categorical Body Mass Index, public use",Targeting by weight (chi2)
3,PPSU,6217.966962,Pseudo-PSU for public-use file variance estima...,Targeting by weight (chi2)
50,AGEP_A,5460.670727,Age of SA (top coded),Targeting by weight (chi2)
26,LSATIS11R_A,4357.351769,Life satisfaction - 11 category question,Targeting by weight (chi2)
36,EDUCP_A,1779.558553,Educational level of sample adult,Targeting by weight (chi2)
49,SEX_A,1000.759805,Sex of Sample Adult,Targeting by weight (chi2)
22,EMERG12MTC_A,895.627899,Number of times visited hospital emergency roo...,Targeting by weight (chi2)
1,RATCAT_A,845.284013,Ratio of family income to poverty threshold fo...,Targeting by weight (chi2)
177,PHSTAT_A,735.434179,General health status,Targeting by weight (chi2)
