In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned dataset. The CSV carries an 'Unnamed: 0' column (visible in the
# column listing below), which a later cell removes.
df = pd.read_csv('../Data/df_cleaned2.csv')

In [3]:
# Inspect the column names of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'LBXTC', 'RIAGENDR', 'RIDRETH3', 'RIDAGEYR', 'BMXWT',
       'BMXHT', 'BMXBMI', 'BPXPLS', 'BPXPULS', 'DR1TKCAL', 'DR1TPROT',
       'DR1TCARB', 'DR1TSUGR', 'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT',
       'DR1TPFAT', 'DR1TCHOL', 'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON',
       'DR1TPOTA', 'DR1_300', 'ALQ120Q', 'ALQ120U', 'ALQ130', 'BPQ020',
       'BPQ050A', 'BPQ080', 'BPQ100D', 'DIQ010', 'MCQ170M', 'SMQ681', 'PAQ610',
       'PAD615', 'PAQ625', 'PAD630', 'PAQ655', 'PAD660', 'PAQ670', 'PAD675',
       'Systolic', 'Diastolic', 'outlier'],
      dtype='object')

In [4]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError once the
# column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.columns

Index(['LBXTC', 'RIAGENDR', 'RIDRETH3', 'RIDAGEYR', 'BMXWT', 'BMXHT', 'BMXBMI',
       'BPXPLS', 'BPXPULS', 'DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR',
       'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
       'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA', 'DR1_300',
       'ALQ120Q', 'ALQ120U', 'ALQ130', 'BPQ020', 'BPQ050A', 'BPQ080',
       'BPQ100D', 'DIQ010', 'MCQ170M', 'SMQ681', 'PAQ610', 'PAD615', 'PAQ625',
       'PAD630', 'PAQ655', 'PAD660', 'PAQ670', 'PAD675', 'Systolic',
       'Diastolic', 'outlier'],
      dtype='object')

# Engineering physical activity data for work and recreation

According to the codebook, physical activity is weighted by a MET score: vigorous activity gets a score of 8, while moderate activity gets a score of 4.  
I plan to calculate the average daily MET-minutes of physical activity for both work and recreational activity by first multiplying the vigorous time by 8 and the moderate time by 4, then adding them together. Then I divide by 7 to get a daily average.

In [5]:
# MET weights for vigorous / moderate activity, and the divisor that turns the
# weighted total into a daily average.
VIGOROUS_MET, MODERATE_MET, DAYS_PER_WEEK = 8, 4, 7

# Work activity: weighted product of each PAQ*/PAD* column pair, averaged per day.
work_vigorous = VIGOROUS_MET * df['PAQ610'] * df['PAD615'] / DAYS_PER_WEEK
work_moderate = MODERATE_MET * df['PAQ625'] * df['PAD630'] / DAYS_PER_WEEK
df['MET_work'] = work_vigorous + work_moderate

# Recreational activity: same calculation on the recreation column pairs.
rec_vigorous = VIGOROUS_MET * df['PAQ655'] * df['PAD660'] / DAYS_PER_WEEK
rec_moderate = MODERATE_MET * df['PAQ670'] * df['PAD675'] / DAYS_PER_WEEK
df['MET_rec'] = rec_vigorous + rec_moderate

# Weighted nutrition values



In [6]:
# List the distinct DR1_300 codes present in the data.
df.DR1_300.unique()

array([2., 3., 1.])

In [7]:
# Count how many rows fall under each DR1_300 code.
df.DR1_300.value_counts()

2.0    2530
3.0     698
1.0     310
Name: DR1_300, dtype: int64

For DR1_300, 1 means more than usual was consumed, 2 means the usual amount was consumed, and 3 means much less than usual was consumed.  
For respondents who consumed more than usual, I plan to create transformed nutrient values by multiplying the values by 0.5. When less than usual was consumed, I plan to multiply the values by 2.

In [8]:
# Exponent for the intake correction: DR1_300 codes 1 / 2 / 3 become -1 / 0 / +1,
# so 2**powers yields a factor of 0.5, 1 or 2.
powers = df['DR1_300'] - 2

# Nutrient columns that will be rescaled by that factor.
nutrtition_variables = [
    'DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR', 'DR1TFIBE',
    'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
    'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA',
]

In [9]:
# Names of the rescaled columns: the original nutrient name plus a "_t" suffix.
transformed_variables = [column + "_t" for column in nutrtition_variables]

# Row-wise multiplier (0.5, 1 or 2 for DR1_300 codes 1, 2, 3), computed once.
scale = 2**powers

# Add one rescaled column per nutrient, leaving the original columns untouched.
for source, target in zip(nutrtition_variables, transformed_variables):
    df[target] = df[source] * scale

In [10]:
# Sanity check: show the first three nutrients next to DR1_300 and their
# rescaled counterparts to confirm the factor was applied as intended.
preview_columns = nutrtition_variables[:3] + ['DR1_300'] + transformed_variables[:3]
df[preview_columns].head()

Unnamed: 0,DR1TKCAL,DR1TPROT,DR1TCARB,DR1_300,DR1TKCAL_t,DR1TPROT_t,DR1TCARB_t
0,2964.0,62.36,356.85,2.0,2964.0,62.36,356.85
1,604.0,19.3,90.3,3.0,1208.0,38.6,180.6
2,1304.0,79.02,153.43,2.0,1304.0,79.02,153.43
3,2338.0,102.1,282.58,2.0,2338.0,102.1,282.58
4,3983.0,125.18,492.2,2.0,3983.0,125.18,492.2


In [11]:
# Confirm the MET_* and *_t columns were appended to the frame.
df.columns

Index(['LBXTC', 'RIAGENDR', 'RIDRETH3', 'RIDAGEYR', 'BMXWT', 'BMXHT', 'BMXBMI',
       'BPXPLS', 'BPXPULS', 'DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR',
       'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
       'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA', 'DR1_300',
       'ALQ120Q', 'ALQ120U', 'ALQ130', 'BPQ020', 'BPQ050A', 'BPQ080',
       'BPQ100D', 'DIQ010', 'MCQ170M', 'SMQ681', 'PAQ610', 'PAD615', 'PAQ625',
       'PAD630', 'PAQ655', 'PAD660', 'PAQ670', 'PAD675', 'Systolic',
       'Diastolic', 'outlier', 'MET_work', 'MET_rec', 'DR1TKCAL_t',
       'DR1TPROT_t', 'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t', 'DR1TTFAT_t',
       'DR1TSFAT_t', 'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t', 'DR1TSODI_t',
       'DR1TVD_t', 'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t'],
      dtype='object')

# Average daily alcohol consumption
ALQ120Q asks how often you drank alcohol in the past year.
ALQ120U gives the unit for that frequency, where
1 means week
2 means month
3 means year

I am going to replace week with 52, month with 12, and year with 1, then multiply these values by ALQ120Q and then by ALQ130 (how many drinks on days you drink), then divide by 365 to get the daily average.

In [12]:
# Translate the unit code into periods per year: 1 (week) -> 52, 2 (month) -> 12,
# 3 (year) -> 1. Any other value is left unchanged by replace.
periods_per_year = df['ALQ120U'].replace({1:52,2:12,3:1})

# frequency * periods per year * drinks per drinking day, spread over 365 days.
df['avgALC'] = df['ALQ120Q'] * periods_per_year * df['ALQ130'] / 365

## Log transform the dependent variable

As I said in my EDA, I may try modeling with the log of LBXTC.

In [13]:
# Natural log of LBXTC, stored as a new column next to the untransformed value.
log_target = np.log(df['LBXTC'])
df['log_LBXTC'] = log_target

In [14]:
# Write the engineered DataFrame to disk.
# NOTE(review): to_csv writes the index by default, so reading this file back with
# a plain pd.read_csv yields an extra 'Unnamed: 0' column (the same artifact this
# notebook drops after loading). Consider index=False once any downstream reader
# that drops that column has been updated.
df.to_csv('../Data/df_transformed.csv')