In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def visualize(data, x, y):
    """Show how ``y`` is distributed across the groups of ``x`` in a 2x2 figure.

    Panels, in row-major order: jittered strip plot, bar plot, box plot and
    violin plot. All four are drawn on the same figure, which is then shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    x : str
        Name of the grouping column (drawn on the categorical axis).
    y : str
        Name of the column whose values are plotted for each group.

    Raises
    ------
    ValueError
        If ``x`` or ``y`` is not a column of ``data``. Checked before any
        figure is created so a typo does not leave an empty figure behind.
    """
    missing = [col for col in (x, y) if col not in data.columns]
    if missing:
        raise ValueError(
            'Could not interpret input(s) {}: not columns of data'.format(missing)
        )

    _, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

    # x/y are passed by keyword: recent seaborn releases no longer accept them
    # positionally.
    sns.stripplot(x=x, y=y, data=data, jitter=True, size=4, ax=axes[0][0])

    # Axes-level barplot replaces factorplot(kind='bar'). factorplot is
    # figure-level (it opened a second, empty figure that had to be closed by
    # number) and has been removed from seaborn; barplot draws the same bars
    # directly on the requested axes, so no clean-up call is needed.
    sns.barplot(x=x, y=y, data=data, ax=axes[0][1])

    data.boxplot(column=y, by=x, ax=axes[1][0])
    sns.violinplot(x=x, y=y, data=data, ax=axes[1][1])

    plt.show()

In [3]:
# Read the raw tables, keeping only the listed columns from each file.
# Every column list includes SEQN.
current_health_cols = ['SEQN', 'HSD010', 'HSQ500']
curr_health = pd.read_csv(
    '../2015-2016_nhanes_raw/questionnaire/current_health.csv',
    usecols=current_health_cols,
)

sex_hormone_cols = ['SEQN', 'LBXTST', 'LBXEST', 'LBXSHBG']
sex_hormones = pd.read_csv(
    '../2015-2016_nhanes_raw/laboratory/sex_steroid_hormone.csv',
    usecols=sex_hormone_cols,
)

biochem_cols = [
    'SEQN', 'LBXSBU', 'LBXSC3SI', 'LBXSCA',
    'LBXSCH', 'LBXSCLSI', 'LBXSGL',
    'LBXSIR', 'LBXSKSI', 'LBXSNASI',
    'LBXSTP', 'LBXSTR', 'LBXSUA',
]
biochem_profile = pd.read_csv(
    '../2015-2016_nhanes_raw/laboratory/standard_biochem_profile.csv',
    usecols=biochem_cols,
)

# No usecols filter here: every column present in the file is loaded.
glycohemoglobin = pd.read_csv(
    '../2015-2016_nhanes_raw/laboratory/glycohemoglobin.csv'
)

In [4]:
# Frequency of each distinct HSD010 value (value_counts sorts by count, descending).
display(curr_health['HSD010'].value_counts())

3.0    2493
2.0    1652
4.0    1212
1.0     606
5.0     203
Name: HSD010, dtype: int64

In [6]:
# Join the four tables on SEQN one at a time. No `how` is given, so each step
# uses pandas' default inner join: a row survives only if its SEQN is present
# in every table.
joined_a = curr_health
joined_b = joined_a.merge(sex_hormones, on='SEQN')
joined_c = joined_b.merge(glycohemoglobin, on='SEQN')
joined = joined_c.merge(biochem_profile, on='SEQN')

In [7]:
# Print the merged frame's row count, per-column non-null counts and dtypes.
joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6744 entries, 0 to 6743
Data columns (total 19 columns):
SEQN        6744 non-null float64
HSD010      6166 non-null float64
HSQ500      6166 non-null float64
LBXTST      6235 non-null float64
LBXEST      6186 non-null float64
LBXSHBG     6039 non-null float64
LBXGH       6326 non-null float64
LBXSBU      6255 non-null float64
LBXSC3SI    6257 non-null float64
LBXSCA      6257 non-null float64
LBXSCH      6254 non-null float64
LBXSCLSI    6257 non-null float64
LBXSGL      6257 non-null float64
LBXSIR      6253 non-null float64
LBXSKSI     6257 non-null float64
LBXSNASI    6257 non-null float64
LBXSTP      6253 non-null float64
LBXSTR      6254 non-null float64
LBXSUA      6254 non-null float64
dtypes: float64(19)
memory usage: 1.0 MB


In [8]:
# Write the merged frame to CSV in the current directory; index=False omits the row index.
joined.to_csv('./alijah_compiled_raw.csv', index=False)

In [None]:
# Per-column mean, then standard deviation, within each HSD010 group.
# The final expression (std) is rendered as the cell's output.
by_hsd010 = joined.groupby('HSD010')
display(by_hsd010.mean())
print('\n')
by_hsd010.std()

In [None]:
# Per-column mean, then standard deviation, within each happiness_3 group.
# NOTE(review): 'happiness_3' is not among the 19 columns reported by
# joined.info(), and no visible cell creates it — this presumably raises
# KeyError on a fresh kernel unless the column is added elsewhere; confirm.
by_happiness = joined.groupby('happiness_3')
display(by_happiness.mean())
print('\n')
by_happiness.std()

In [None]:
# Correlation of every column with HSD010 (DataFrame.corr with its default
# settings). The matrix is computed once and reused for both the table and the
# bar chart instead of being recomputed for each.
# .iloc[2:] drops the first two entries by position, making explicit what the
# bare [2:] slice did on this label-indexed Series.
hsd010_corr = joined.corr()['HSD010'].iloc[2:]
display(hsd010_corr)
hsd010_corr.plot.bar()

In [None]:
# Correlation of every column with happiness_3 (DataFrame.corr with its default
# settings). The matrix is computed once and reused for both the table and the
# bar chart instead of being recomputed for each.
# .iloc[4:] drops the first four entries by position, making explicit what the
# bare [4:] slice did on this label-indexed Series.
# NOTE(review): 'happiness_3' is not among the 19 columns reported by
# joined.info(), and no visible cell creates it — this presumably raises
# KeyError on a fresh kernel unless the column is added elsewhere; confirm.
happiness_corr = joined.corr()['happiness_3'].iloc[4:]
display(happiness_corr)
happiness_corr.plot.bar()

In [None]:
# Four-panel view (strip, bar, box, violin) of LBXGH grouped by happiness_3.
# NOTE(review): 'happiness_3' is not a column reported by joined.info() and is
# not created in any visible cell — confirm it is defined before this runs.
visualize(joined, 'happiness_3', 'LBXGH')

In [None]:
# Four-panel view (strip, bar, box, violin) of LBXSIR grouped by happiness_3.
# NOTE(review): 'happiness_3' is not a column reported by joined.info() and is
# not created in any visible cell — confirm it is defined before this runs.
visualize(joined, 'happiness_3', 'LBXSIR')

In [None]:
# Four-panel view (strip, bar, box, violin) of LBXSBU grouped by HSD010.
visualize(joined, 'HSD010', 'LBXSBU')

In [None]:
# Four-panel view (strip, bar, box, violin) of LBXEST grouped by happiness_3.
# NOTE(review): 'happiness_3' is not a column reported by joined.info() and is
# not created in any visible cell — confirm it is defined before this runs.
visualize(joined, 'happiness_3', 'LBXEST')

In [None]:
# Four-panel view (strip, bar, box, violin) of LBXSTR grouped by HSD010.
visualize(joined, 'HSD010', 'LBXSTR')

In [None]:
# Four-panel view (strip, bar, box, violin) of LBXSTR grouped by HSD010.
# NOTE(review): this call is identical to the one in the preceding cell —
# presumably a different column was intended; confirm or remove.
visualize(joined, 'HSD010', 'LBXSTR')

In [None]:
# Four-panel view (strip, bar, box, violin) of BMXBMI grouped by HSD010.
# NOTE(review): 'BMXBMI' is not among the columns reported by joined.info(),
# and none of the visible read_csv calls load it — expect an error unless it
# is merged into `joined` elsewhere; confirm.
visualize(joined, 'HSD010', 'BMXBMI')