# Feature Data

### This notebook serves as a demonstration of how our data is created and provides an overview of what kind of information is available in each feature in the data set.

In [1]:
import train
import build_features

import pickle
import numpy as np
import pandas as pd
import importlib as imp
from scipy.stats import norm

# Raise pandas' display limits so wide/long summary tables are shown in full,
# and render floats with thousands separators and four decimals.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 20000)
pd.set_option('display.width', 20000)
pd.set_option('display.float_format', '{:,.4f}'.format)

## Load the data dictionaries

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in binary mode inside a context manager so the handle
    is closed as soon as loading finishes (or fails).
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Data dictionaries consumed by build_features.call_build_function below.
d1 = _load_pickle('../python objects/patientdata_20170823.pkl')
d1mom = _load_pickle('../python objects/patient_mother_data_20170724.pkl')
lat_lon_dic = _load_pickle('../python objects/lat_lon_data_20180329.pkl')
env_dic = _load_pickle('../python objects/census_data_20170920.pkl')
d1mom_hist = _load_pickle('../python objects/full_lutheran_mother_data.pkl')

## Set the data creation parameters

In [3]:
# Age window of the outcome measurement; the cohort prints below describe it as "age 5 (4.5-5.5)".
agex_low, agex_high = 4.5, 5.5
# NOTE(review): presumably the window (in months) of history used to build features —
# confirm against build_features.
months_from, months_to = 0, 24

# Column position of each weight-status label in the multi-label outcome matrix.
label_ix = {
    name: position
    for position, name in enumerate([
        'underweight',
        'normal',
        'overweight',
        'obese',
        'class I severe obesity',
        'class II severe obesity',
    ])
}

## Convert the data from a nested dictionary format to a user-friendly matrix format

In [None]:
# Turn the nested data dictionaries into matrix form: a feature matrix, outcome values,
# outcome labels, the feature names and the patient identifiers.
(x1, y1, y1label, feature_headers, mrns) = build_features.call_build_function(
    d1, d1mom, d1mom_hist, lat_lon_dic, env_dic,
    agex_low, agex_high, months_from, months_to,
    False,  # NOTE(review): positional flag defined by build_features — confirm its meaning there
    prediction='multi'
)

Processing 52,945 patients: 33268it [01:33, 357.03it/s]   

In [None]:
# Rows of the feature matrix are patients, columns are features.
patient_count = int(x1.shape[0])
feature_count = int(x1.shape[1])
print('Number of patients: {0:,d}'.format(patient_count))
print('Number of features: {0:,d}'.format(feature_count))

In [None]:
# Eligible children are those carrying any outcome label; the rest of the cohort is ineligible.
cohort_size = int(y1.shape[0])
eligible_count = int(y1label.sum())
ineligible_count = int(y1.shape[0] - y1label.sum())
print('Total number of children in cohort: {0:,d}'.format(cohort_size))
print('Total number of eligible children at age 5 (4.5-5.5): {0:,d}'.format(eligible_count))
print('Total number of ineligible children at age 5 (4.5-5.5): {0:,d}'.format(ineligible_count))

## Summarize the number of features by category

In [None]:
# Summarize features per category, where a feature's category is the text before the
# first ':' in its header. The table is written to CSV and displayed.
ft_info = []
ft_cats = {}

# Category of every feature, computed once and reused for both counting and masking.
ft_categories = [f.split(':')[0] for f in feature_headers]
for cat in ft_categories:
    # dict.get replaces the former bare try/except counter, which would have hidden
    # unrelated errors (e.g. a non-string header).
    ft_cats[cat] = ft_cats.get(cat, 0) + 1

# NOTE(review): xsum is the column-wise SUM of values, not a count of non-zero entries,
# so for non-binary features the ">0" / ">=5" thresholds below compare value totals.
# Confirm whether an occurrence count such as (x1 != 0).sum(axis=0) is intended.
xsum = x1.sum(axis=0)
for k in ft_cats:
    # Match the category exactly. A prefix test (startswith) would also select features
    # of any other category whose name merely begins with `k`, disagreeing with ft_cats[k].
    cols = np.array([cat == k for cat in ft_categories], dtype=bool)
    ft_info.append([k, ft_cats[k], (xsum[cols] > 0).sum(), (xsum[cols] >= 5).sum()])

ft_info = pd.DataFrame(ft_info, columns=['Feature Category', 'Number of Features', 'Number of Features with >0 Occurrences', 'Number of Features with >=5 Occurrences'])
ft_info.to_csv('../summary_statistics/feature_categories.csv', index=False)
ft_info

## Summarize the amount of information available in each feature

In [None]:
# Totals across all feature categories for each of the three count columns.
count_columns = [
    'Number of Features',
    'Number of Features with >0 Occurrences',
    'Number of Features with >=5 Occurrences',
]
ft_info[count_columns].sum(axis=0)

In [None]:
# Number of patients with a positive value for each feature, printed one feature per line.
xsum = (x1 > 0).sum(axis=0)
for header, occurrences in zip(feature_headers, xsum):
    line = '{0:s} has {1:,d} occurrences'.format(header, occurrences)
    print(line)

In [None]:
# Mean and standard deviation of the per-feature occurrence counts held in xsum.
occurrence_mean = np.mean(xsum)
occurrence_std = np.std(xsum)
print('Average number of occurrences per feature {0:4.2f} with standard deviation: {1:4.2f}'.format(occurrence_mean, occurrence_std))

## Create the valid study cohort and output the relevant information for each feature if there are at least 5 occurrences in the data

**NOTE: in this step there will be fewer features with at least 5 occurrences because we are also filtering out rows that do not have maternal data or do not have a valid BMI reading (10 < BMI < 40) in the data.**

In [None]:
# Build the study cohort for both sexes, using the 'obese' column of the label matrix
# as the outcome and keeping only features that meet the minimum-occurrence threshold.
(
    x2, y2, y2label, mrns2, ix_filter, feature_headers2,
    corr_headers_filtered, corrs_matrix_filtered, ix_corr_headers,
) = train.prepare_data_for_analysis(
    {}, {}, {}, {}, {},
    x1, y1, y1label[:, label_ix['obese']], feature_headers, mrns,
    agex_low, agex_high, months_from, months_to,
    filterSTR=[],           # use both boys and girls
    variablesubset=[],      # do not remove any features
    do_impute=False,        # do not impute values
    do_normalize=False,     # do not normalize the values
    min_occur=5,            # use only features with meaningful information
    delay_print=False,      # print out all information as it's available
    lasso_selection=False   # do not use LASSO feature selection
)

In [None]:
# Per-feature characteristics table for the full cohort.
# For every feature: N / mean / std of its non-zero values overall and by outcome group,
# plus (binary features only) the unadjusted odds ratio with 95% CI and the relative risk.
# The p-value is a Wald test on log(OR) for binary features and a two-sided z-test on the
# difference of non-zero means between outcome groups otherwise.
char_table = []
cols = ['Variable', 'Total N', 'Total Average', 'Total Std Dev', 'Obese N', 'Obese Average', 'Obese Std Dev', 'Not Obese N', 'Not Obese Average', 'Not Obese Std Dev', 'Unadjusted Odds Ratio', 'Unadjusted OR Low', 'Unadjusted OR High', 'Relative Risk', 'p-value for OR']

# Outcome masks do not depend on the feature, so build them once outside the loop.
y2pos_ix = (y2label > 0)
y2neg_ix = (y2label == 0)

# Empty cells of the 2x2 table produce inf/nan ratios; silence those floating-point
# warnings and let the inf/nan values flow into the table.
with np.errstate(divide='ignore', invalid='ignore'):
    for ix, h in enumerate(feature_headers2):
        col = x2[:, ix]
        # A feature is treated as binary when its values span exactly 0..1.
        bin_indicator = col.max() == 1 and col.min() == 0

        # "Exposed" rows are those with a non-zero value for this feature.
        ix_total = (col != 0)
        ix_total_pos = y2pos_ix & ix_total
        ix_total_neg = y2neg_ix & ix_total

        # 2x2 table counts. Kept as np.float64 so a zero cell yields inf/nan under
        # errstate instead of raising ZeroDivisionError.
        De = np.float64(ix_total_pos.sum())            # obese, exposed
        He = np.float64(ix_total_neg.sum())            # not obese, exposed
        Dn = np.float64((y2pos_ix & ~ix_total).sum())  # obese, not exposed
        Hn = np.float64((y2neg_ix & ~ix_total).sum())  # not obese, not exposed

        OR = (De/He)/(Dn/Hn)
        OR_sterror = np.sqrt(1/De + 1/He + 1/Dn + 1/Hn)
        OR_low, OR_high = np.exp(np.log(OR) - 1.96*OR_sterror), np.exp(np.log(OR) + 1.96*OR_sterror)

        RR = (De/(De+He))/(Dn/(Dn+Hn))

        # Non-zero values of this feature, overall and per outcome group.
        vals_total = col[ix_total]
        vals_pos = col[ix_total_pos]
        vals_neg = col[ix_total_neg]

        # z statistic for the difference in group means of the non-zero values.
        md = vals_pos.mean() - vals_neg.mean()
        se = np.sqrt(np.var(vals_pos) / len(vals_pos) + np.var(vals_neg) / len(vals_neg))
        z = md/se

        pvalue = 2 * norm.cdf(-1*(np.abs(np.log(OR))/OR_sterror)) if bin_indicator else 2 * norm.cdf(-np.abs(z))
        char_table.append([
                h, ix_total.sum(), vals_total.mean(), vals_total.std(),
                ix_total_pos.sum(), vals_pos.mean() if not(bin_indicator) else 0, vals_pos.std() if not(bin_indicator) else 0,
                ix_total_neg.sum(), vals_neg.mean() if not(bin_indicator) else 0, vals_neg.std() if not(bin_indicator) else 0,
                OR if bin_indicator else 0, OR_low if bin_indicator else 0, OR_high if bin_indicator else 0, RR if bin_indicator else 0, pvalue
        ])

ft_info = pd.DataFrame(char_table, columns=cols)
ft_info.to_csv('../summary_statistics/cohort_feature_summary_table.csv', index=False)

In [None]:
# Display the cohort summary table ordered alphabetically by feature name.
ft_info.sort_values(by=['Variable'])

### Create the boys subset of the data and output relevant information for each feature that has at least 5 occurrences in the data

In [None]:
# Build the boys-only study cohort, using the 'obese' column of the label matrix
# as the outcome and keeping only features that meet the minimum-occurrence threshold.
(
    x2_boys, y2_boys, y2label_boys, mrns2_boys, ix_filter_boys, feature_headers2_boys,
    corr_headers_filtered_boys, corrs_matrix_filtered_boys, ix_corr_headers_boys,
) = train.prepare_data_for_analysis(
    {}, {}, {}, {}, {},
    x1, y1, y1label[:, label_ix['obese']], feature_headers, mrns,
    agex_low, agex_high, months_from, months_to,
    filterSTR=['Gender:0'], # only the boys data
    variablesubset=[],      # do not remove any features
    do_impute=False,        # do not impute values
    do_normalize=False,     # do not normalize the values
    min_occur=5,            # use only features with meaningful information
    delay_print=False,      # print out all information as it's available
    lasso_selection=False   # do not use LASSO feature selection
)

In [None]:
# Per-feature characteristics table for the boys subset.
# For every feature: N / mean / std of its non-zero values overall and by outcome group,
# plus (binary features only) the unadjusted odds ratio with 95% CI and the relative risk.
# The p-value is a Wald test on log(OR) for binary features and a two-sided z-test on the
# difference of non-zero means between outcome groups otherwise.
char_table = []
cols = ['Variable', 'Total N', 'Total Average', 'Total Std Dev', 'Obese N', 'Obese Average', 'Obese Std Dev', 'Not Obese N', 'Not Obese Average', 'Not Obese Std Dev', 'Unadjusted Odds Ratio', 'Unadjusted OR Low', 'Unadjusted OR High', 'Relative Risk', 'p-value for OR']

# Outcome masks do not depend on the feature, so build them once outside the loop.
y2pos_ix = (y2label_boys > 0)
y2neg_ix = (y2label_boys == 0)

# Empty cells of the 2x2 table produce inf/nan ratios; silence those floating-point
# warnings and let the inf/nan values flow into the table.
with np.errstate(divide='ignore', invalid='ignore'):
    for ix, h in enumerate(feature_headers2_boys):
        col = x2_boys[:, ix]
        # A feature is treated as binary when its values span exactly 0..1.
        bin_indicator = col.max() == 1 and col.min() == 0

        # "Exposed" rows are those with a non-zero value for this feature.
        ix_total = (col != 0)
        ix_total_pos = y2pos_ix & ix_total
        ix_total_neg = y2neg_ix & ix_total

        # 2x2 table counts. Kept as np.float64 so a zero cell yields inf/nan under
        # errstate instead of raising ZeroDivisionError.
        De = np.float64(ix_total_pos.sum())            # obese, exposed
        He = np.float64(ix_total_neg.sum())            # not obese, exposed
        Dn = np.float64((y2pos_ix & ~ix_total).sum())  # obese, not exposed
        Hn = np.float64((y2neg_ix & ~ix_total).sum())  # not obese, not exposed

        OR = (De/He)/(Dn/Hn)
        OR_sterror = np.sqrt(1/De + 1/He + 1/Dn + 1/Hn)
        OR_low, OR_high = np.exp(np.log(OR) - 1.96*OR_sterror), np.exp(np.log(OR) + 1.96*OR_sterror)

        RR = (De/(De+He))/(Dn/(Dn+Hn))

        # Non-zero values of this feature, overall and per outcome group.
        vals_total = col[ix_total]
        vals_pos = col[ix_total_pos]
        vals_neg = col[ix_total_neg]

        # z statistic for the difference in group means of the non-zero values.
        md = vals_pos.mean() - vals_neg.mean()
        se = np.sqrt(np.var(vals_pos) / len(vals_pos) + np.var(vals_neg) / len(vals_neg))
        z = md/se

        pvalue = 2 * norm.cdf(-1*(np.abs(np.log(OR))/OR_sterror)) if bin_indicator else 2 * norm.cdf(-np.abs(z))
        char_table.append([
                h, ix_total.sum(), vals_total.mean(), vals_total.std(),
                ix_total_pos.sum(), vals_pos.mean() if not(bin_indicator) else 0, vals_pos.std() if not(bin_indicator) else 0,
                ix_total_neg.sum(), vals_neg.mean() if not(bin_indicator) else 0, vals_neg.std() if not(bin_indicator) else 0,
                OR if bin_indicator else 0, OR_low if bin_indicator else 0, OR_high if bin_indicator else 0,
                RR if bin_indicator else 0, pvalue
        ])

ft_info = pd.DataFrame(char_table, columns=cols)
ft_info.to_csv('../summary_statistics/cohort_boys_feature_summary_table.csv', index=False)

In [None]:
# Display the summary table most recently assigned to ft_info.
ft_info

### Create the girls subset of the data and output relevant information for each feature that has at least 5 occurrences in the data

In [None]:
# Build the girls-only study cohort, using the 'obese' column of the label matrix
# as the outcome and keeping only features that meet the minimum-occurrence threshold.
(
    x2_girls, y2_girls, y2label_girls, mrns2_girls, ix_filter_girls, feature_headers2_girls,
    corr_headers_filtered_girls, corrs_matrix_filtered_girls, ix_corr_headers_girls,
) = train.prepare_data_for_analysis(
    {}, {}, {}, {}, {},
    x1, y1, y1label[:, label_ix['obese']], feature_headers, mrns,
    agex_low, agex_high, months_from, months_to,
    filterSTR=['Gender:1'], # only the girls data
    variablesubset=[],      # do not remove any features
    do_impute=False,        # do not impute values
    do_normalize=False,     # do not normalize the values
    min_occur=5,            # use only features with meaningful information
    delay_print=False,      # print out all information as it's available
    lasso_selection=False   # do not use LASSO feature selection
)

In [None]:
# Per-feature characteristics table for the girls subset.
# For every feature: N / mean / std of its non-zero values overall and by outcome group,
# plus (binary features only) the unadjusted odds ratio with 95% CI and the relative risk.
# The p-value is a Wald test on log(OR) for binary features and a two-sided z-test on the
# difference of non-zero means between outcome groups otherwise.
char_table = []
cols = ['Variable', 'Total N', 'Total Average', 'Total Std Dev', 'Obese N', 'Obese Average', 'Obese Std Dev', 'Not Obese N', 'Not Obese Average', 'Not Obese Std Dev', 'Unadjusted Odds Ratio', 'Unadjusted OR Low', 'Unadjusted OR High', 'Relative Risk', 'p-value for OR']

# Outcome masks do not depend on the feature, so build them once outside the loop.
y2pos_ix = (y2label_girls > 0)
y2neg_ix = (y2label_girls == 0)

# Empty cells of the 2x2 table produce inf/nan ratios; silence those floating-point
# warnings and let the inf/nan values flow into the table.
with np.errstate(divide='ignore', invalid='ignore'):
    for ix, h in enumerate(feature_headers2_girls):
        col = x2_girls[:, ix]
        # A feature is treated as binary when its values span exactly 0..1.
        bin_indicator = col.max() == 1 and col.min() == 0

        # "Exposed" rows are those with a non-zero value for this feature.
        ix_total = (col != 0)
        ix_total_pos = y2pos_ix & ix_total
        ix_total_neg = y2neg_ix & ix_total

        # 2x2 table counts. Kept as np.float64 so a zero cell yields inf/nan under
        # errstate instead of raising ZeroDivisionError.
        De = np.float64(ix_total_pos.sum())            # obese, exposed
        He = np.float64(ix_total_neg.sum())            # not obese, exposed
        Dn = np.float64((y2pos_ix & ~ix_total).sum())  # obese, not exposed
        Hn = np.float64((y2neg_ix & ~ix_total).sum())  # not obese, not exposed

        OR = (De/He)/(Dn/Hn)
        OR_sterror = np.sqrt(1/De + 1/He + 1/Dn + 1/Hn)
        OR_low, OR_high = np.exp(np.log(OR) - 1.96*OR_sterror), np.exp(np.log(OR) + 1.96*OR_sterror)

        RR = (De/(De+He))/(Dn/(Dn+Hn))

        # Non-zero values of this feature, overall and per outcome group.
        vals_total = col[ix_total]
        vals_pos = col[ix_total_pos]
        vals_neg = col[ix_total_neg]

        # z statistic for the difference in group means of the non-zero values.
        md = vals_pos.mean() - vals_neg.mean()
        se = np.sqrt(np.var(vals_pos) / len(vals_pos) + np.var(vals_neg) / len(vals_neg))
        z = md/se

        pvalue = 2 * norm.cdf(-1*(np.abs(np.log(OR))/OR_sterror)) if bin_indicator else 2 * norm.cdf(-np.abs(z))
        char_table.append([
                h, ix_total.sum(), vals_total.mean(), vals_total.std(),
                ix_total_pos.sum(), vals_pos.mean() if not(bin_indicator) else 0, vals_pos.std() if not(bin_indicator) else 0,
                ix_total_neg.sum(), vals_neg.mean() if not(bin_indicator) else 0, vals_neg.std() if not(bin_indicator) else 0,
                OR if bin_indicator else 0, OR_low if bin_indicator else 0, OR_high if bin_indicator else 0,
                RR if bin_indicator else 0, pvalue
        ])

ft_info = pd.DataFrame(char_table, columns=cols)
ft_info.to_csv('../summary_statistics/cohort_girls_feature_summary_table.csv', index=False)

In [None]:
# Display the summary table most recently assigned to ft_info.
ft_info