# Header Block

In [16]:
# This header will be the same no matter what code you are using
# import modules that we will use multiple functions from and give them short names. 

import pandas as pd;
import numpy as np;
import seaborn as sb;
import matplotlib.pyplot as plt;


# import single functions

from scipy.stats.contingency import chi2_contingency;
from itertools import combinations;
from statsmodels.graphics.mosaicplot import mosaic;
from scipy.stats.contingency import chi2_contingency;
from scipy.stats import pearsonr;


# Data Management

We begin our data management by renaming the variables to something that is easier to remember and work with. 

In [17]:
# Import Variables
addhealth_url = 'https://drive.google.com/uc?export=download&id=1LOoZl4utpqTfKjj6nu70RH16frFLyPfm'

myData = pd.read_csv(addhealth_url, usecols=['H4WP13','H4BMI'],low_memory=False)

# Rename Variables

myData.rename(columns={
    'H4WP13':'female_parent',
    'H4BMI':'bmi'
},inplace=True)

# Deal with non-responses and legitimate skips

myData['female_parent'].replace({
    96:np.nan,
    98:np.nan,
},inplace=True)

myData['bmi'].replace({
    888:np.nan,
    889:np.nan,
    996:np.nan,
    998:np.nan,
    997:np.nan,
},inplace=True)

I like to look at some examples of the responses to make sure that my renaming worked and that the data makes sense. 

In [18]:
myData.head(15)

Unnamed: 0,female_parent,bmi
0,1.0,35.2
1,1.0,26.4
2,1.0,26.2
3,1.0,28.5
4,1.0,43.8
5,1.0,28.0
6,1.0,18.4
7,1.0,24.8
8,1.0,30.1
9,1.0,30.8


Below is the function which will recode the variable according to our re-grouping. The if, elif, else code is what you will modify to fit your data. This will often be placed into your data management block. 

In [19]:
def collapse_var(row):
  if (row['female_parent'] in [3,4]):
    return 1
  elif (row['female_parent'] in [1,2,5,6,7,8,9,10,11]):
    return 0

myData['step_mother'] = myData.apply(lambda row: collapse_var(row), axis=1)

Below I am going to check and see if the data is being grouped correctly. THIS IS A SUPER IMPORTANT STEP!!!

In [20]:
myData.head(40)

Unnamed: 0,female_parent,bmi,step_mother
0,1.0,35.2,0.0
1,1.0,26.4,0.0
2,1.0,26.2,0.0
3,1.0,28.5,0.0
4,1.0,43.8,0.0
5,1.0,28.0,0.0
6,1.0,18.4,0.0
7,1.0,24.8,0.0
8,1.0,30.1,0.0
9,1.0,30.8,0.0


ALSO CHECK THE FREQUENCY TABLES!! Make sure that the values are correctly re-organized. 

In [21]:
myData['step_mother'].value_counts()

0.0    5045
1.0      67
Name: step_mother, dtype: int64

In [22]:
myData['female_parent'].value_counts()

1.0     4671
6.0      203
2.0       67
4.0       61
7.0       43
11.0      25
8.0       13
10.0       9
5.0        8
9.0        6
3.0        6
Name: female_parent, dtype: int64

Next we are going to handle a quantative variable. Fortunately pandas has the .cut command which does most of the work. The code below re-labels the data by bins for the bmi variable. 

In [23]:
myData['bmi2'] = pd.cut(myData['bmi'], bins=[0,18.5,25,30,1000], labels=[1,2,3,4])

In [24]:
myData['bmi2'].value_counts()

4    1879
2    1598
3    1494
1      86
Name: bmi2, dtype: int64

In [25]:
myData.head(30)

Unnamed: 0,female_parent,bmi,step_mother,bmi2
0,1.0,35.2,0.0,4
1,1.0,26.4,0.0,3
2,1.0,26.2,0.0,3
3,1.0,28.5,0.0,3
4,1.0,43.8,0.0,4
5,1.0,28.0,0.0,3
6,1.0,18.4,0.0,1
7,1.0,24.8,0.0,2
8,1.0,30.1,0.0,4
9,1.0,30.8,0.0,4
