# Splitting into train and test.
I am feature engineering then splitting into train and test sets here.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

In [2]:
# Load the raw survey extract.
contra = pd.read_csv('../../1-data/contraceptive_for_students.csv')

# Recode the target column: 1 -> 0, 3 -> 1, 2 -> 2.
contra['contraceptive'] = contra['contraceptive'].map({1: 0, 3: 1, 2: 2})

In [3]:
# Preview the first five rows of the loaded data.
contra.head()

Unnamed: 0,wife_age,wife_education,husband_education,num_child,wife_religion,wife_work,husband_occupation,standard_living,media_exposure,contraceptive
0,24,2,3,3,1,1,2,3,0,0
1,45,1,3,10,1,1,3,4,0,0
2,43,2,3,7,1,1,3,4,0,0
3,42,3,2,9,1,1,3,3,0,0
4,36,3,3,8,1,1,3,2,0,0


# Feature Engineering.

## 1. Kids Per Married Year (KPMY)

First, we remove entries where the wife's age is 20 or younger (the filter below keeps only `wife_age > 20`), exactly **[34]** records. This was done to incorporate information from the original survey pamphlet, which listed the median marriage ages for age groups. The median age of our dataset was raised by 1 year due to the loss of 2% of our dataset. This loss was made reasonable by the fact that number of children and age are highly associated, and we wanted to use the information on birth rates while married ("kids per married year").

In [4]:
# Keep only wives older than 20 (age 20 itself is dropped as well),
# then renumber the remaining rows 0..n-1.
contra = contra.loc[contra['wife_age'] > 20].reset_index(drop=True)

In [5]:
# (lower, upper) bounds of each age group; from_tuples builds right-closed intervals by default.
age_bounds = [(20, 24), (24, 29), (29, 34), (34, 39), (39, 44), (44, 49)]
age_intervals = pd.IntervalIndex.from_tuples(age_bounds)

# One-column lookup table (column label 0) holding the intervals.
age_interval_df = pd.DataFrame(age_intervals)

In [6]:
# Median marriage age for each age group, in the same order as the intervals above
# (values taken from the survey pamphlet, per the notes in this section).
age_interval_df = age_interval_df.assign(
    median_marriage_age=[19.6, 18.1, 17.6, 16.8, 16.4, 16.5]
)

In [7]:
# Name the interval column 'age_bin' and store it as text (e.g. '(20, 24]')
# so it can be parsed and matched as a string later on.
age_interval_df = (
    age_interval_df
    .rename(columns={0: 'age_bin'})
    .astype({'age_bin': str})
)

The following function cleans the lower bound and upper bound of the intervals into their own columns.

In [8]:
def cleanIntervals(age_intervals):
    '''
    Split interval strings such as '(20, 24]' into their lower and upper bounds.

    Input:
        age_intervals: Iterable (list, array, Series) of right-closed interval
                       strings of the form '(low, high]'. Elements are passed
                       through str() before parsing.
    Output:
        lows: An array of lower bounds, as strings (e.g. '20')
        highs: An array of upper bounds, as strings (e.g. '24')

    Raises:
        IndexError: if an element has no ',' separator (for example 'None').
    '''

    lows  = []
    highs = []

    # Iterate over the values themselves rather than indexing by position, so a
    # Series whose index is not 0..n-1 is handled correctly.
    for interval_text in age_intervals:
        # Raw-string pattern avoids invalid-escape warnings; strip() removes the
        # space that follows the comma so the upper bound is '24', not ' 24'.
        bounds = [re.sub(r'[(\]]', '', part).strip()
                  for part in str(interval_text).split(',')]
        lows.append(bounds[0])
        highs.append(bounds[1])

    return(np.array(lows),
           np.array(highs))

In [9]:
# Parse the textual bins once, then store the lower and upper bounds as separate columns.
lookup_lows, lookup_highs = cleanIntervals(age_interval_df['age_bin'])
age_interval_df['age_bin_low'] = lookup_lows
age_interval_df['age_bin_high'] = lookup_highs

We are using `pd.cut` to create the age ranges within the initial dataset.

In [10]:
# Assign each wife's age to one of the right-closed intervals in `age_intervals`.
age_binned = pd.cut(contra.wife_age, bins=age_intervals)

# Ages that fall outside every interval come back as NaN. A placeholder label
# cannot be parsed into bounds by cleanIntervals, so stop here with a clear
# message instead of failing later on an opaque IndexError.
unbinned = age_binned.isna()
if unbinned.any():
    raise ValueError(
        'wife_age values outside the age intervals: '
        + str(sorted(contra.loc[unbinned, 'wife_age'].unique()))
    )

# Store the bin as text (e.g. '(20, 24]') to match age_interval_df.age_bin.
contra['age_bin'] = age_binned.astype(str)

In [11]:
# Split each row's textual age bin into its lower and upper bound columns.
row_lows, row_highs = cleanIntervals(contra['age_bin'])
contra['age_bin_low'] = row_lows
contra['age_bin_high'] = row_highs

In [12]:
# Attach median_marriage_age to every row via its age bin.
# The keys are spelled out (they are exactly the columns the two frames share)
# and validate= raises if the lookup table ever has duplicate bins, which
# would otherwise silently duplicate survey rows.
contra = contra.merge(
    age_interval_df,
    on=['age_bin', 'age_bin_low', 'age_bin_high'],
    how='inner',
    validate='many_to_one',
)

We are creating a column called `est_years_married` that is the wife's age minus the median marriage age for her age group.

In [13]:
# Estimated years married = wife's age minus the median marriage age of her age group.
contra['est_years_married'] = contra['wife_age'].sub(contra['median_marriage_age'])

We are creating `kids_per_year`, which is the number of children divided by the estimated number of years married.

In [14]:
# Children per estimated year of marriage.
contra['kids_per_year'] = contra['num_child'].div(contra['est_years_married'])

Finally, I'm just going to drop the unnecessary `age_bin` feature since we already have the lower and upper bounds in the dataset.

In [15]:
# The textual bin is redundant now that its lower/upper bounds are separate columns.
contra = contra.drop(columns=['age_bin'])

In [16]:
# Show the first two rows with the newly derived columns.
contra.iloc[:2]

Unnamed: 0,wife_age,wife_education,husband_education,num_child,wife_religion,wife_work,husband_occupation,standard_living,media_exposure,contraceptive,age_bin_low,age_bin_high,median_marriage_age,est_years_married,kids_per_year
0,24,2,3,3,1,1,2,3,0,0,20,24,19.6,4.4,0.681818
1,21,3,3,1,1,0,3,2,0,0,20,24,19.6,1.4,0.714286


## 2. Education gap.
We also are creating `education_gap` which is the level of husband's education minus the level of the wife's.

In [17]:
# Positive values mean the husband's education level is higher than the wife's.
contra['education_gap'] = contra['husband_education'].sub(contra['wife_education'])

Here is a little information on `education_gap`.

In [18]:
import seaborn as sns
# Pass the series explicitly as `x=`: a positional argument is interpreted
# differently across seaborn versions (as `x` with a warning in older releases,
# as `data` in newer ones), so the keyword keeps the plot stable.
sns.boxplot(x=contra['education_gap']);

In [19]:
# Frequency of each education gap value, ordered by the gap itself.
contra['education_gap'].value_counts().sort_index().to_frame()

Unnamed: 0,education_gap
-3,1
-2,9
-1,63
0,779
1,391
2,144
3,22


I am creating a categorical version of the above variable `education_gap_categorical` because while there are gaps between these ordinal levels of education, the interval widths are subject to educational standards and are not numerical. However, the magnitude embedded into the variable `education_gap` could still be useful in predictions, therefore we will have both available for use in the dataset.

In [20]:
# Left-closed bins [-3, 0), [0, 1), [1, 4) are labelled -1, 0 and 1 respectively,
# i.e. negative gap, no gap, positive gap.
gap_intervals = [-3, 0, 1, 4]
gap_labels = [-1, 0, 1]
contra['education_gap_categorical'] = pd.cut(
    contra['education_gap'],
    bins=gap_intervals,
    labels=gap_labels,
    right=False,
)

In [21]:
# Frequency of each gap category, ordered by category.
contra['education_gap_categorical'].value_counts().sort_index().to_frame()

Unnamed: 0,education_gap_categorical
-1,73
0,779
1,557


## 3. Contraceptive Use.
We are also adding a binary `contraceptive_use` covariate that is `0` if a woman does not use contraception and `1` if she does.

In [22]:
# Collapse the three-level target into a binary flag: 0 stays 0, both 1 and 2 become 1.
use_recode = {0: 0, 1: 1, 2: 1}
contra['contraceptive_use'] = contra['contraceptive'].map(use_recode)

## 4. Adjusted standard of living.
The below code combines middle-low and middle-high standards of living into a single category. We now have 3 total levels for standard of living.

In [23]:
# Merge the two middle levels (2 and 3) into one, leaving levels 1, 2 and 3.
living_recode = {1: 1, 2: 2, 3: 2, 4: 3}
contra['standard_living'] = contra['standard_living'].map(living_recode)

## 5. Adjusted education level.
The below code separates wives' education level into `0` for not having completed primary school and `1` for completing primary school. This was informed by the survey pamphlet.

In [24]:
# Binarise the wife's education: levels 1-2 -> 0, levels 3-4 -> 1.
# (education_gap was computed earlier from the original four-level scale.)
education_recode = {1: 0, 2: 0, 3: 1, 4: 1}
contra['wife_education'] = contra['wife_education'].map(education_recode)

In [25]:
# Snapshot the engineered (but not yet encoded or scaled) data.
# An explicit copy, rather than a second name for the same object, guarantees
# that later in-place edits to `contra` cannot leak into this snapshot.
contra_untransf = contra.copy()

## 6. One-hot encoding categorical variables.
We are one-hot encoding the categorical variables, which are:

- wife_education
- husband_education
- wife_religion
- wife_work
- husband_occupation
- standard_living
- media_exposure
- education_gap_categorical

In [26]:
# Categorical columns to expand into indicator columns.
categorical_vars = ['wife_education', 'husband_education', 'wife_religion',
                    'wife_work', 'husband_occupation', 'standard_living',
                    'media_exposure', 'education_gap_categorical']

# drop_first=True omits the first level of each variable (the reference level).
# dtype is pinned so the indicators are 0/1 integers on every pandas version;
# newer pandas would otherwise emit booleans and write True/False to the CSVs.
contra = pd.get_dummies(contra,
                        columns=categorical_vars,
                        drop_first=True,
                        dtype='uint8')

In [27]:
# Preview the first five rows after one-hot encoding.
contra.head(5)

Unnamed: 0,wife_age,num_child,contraceptive,age_bin_low,age_bin_high,median_marriage_age,est_years_married,kids_per_year,education_gap,contraceptive_use,...,wife_religion_1,wife_work_1,husband_occupation_2,husband_occupation_3,husband_occupation_4,standard_living_2,standard_living_3,media_exposure_1,education_gap_categorical_0,education_gap_categorical_1
0,24,3,0,20,24,19.6,4.4,0.681818,1,0,...,1,1,1,0,0,1,0,0,0,1
1,21,1,0,20,24,19.6,1.4,0.714286,0,0,...,1,0,0,1,0,1,0,0,1,0
2,24,0,0,20,24,19.6,4.4,0.0,1,0,...,1,0,0,1,0,0,0,0,0,1
3,24,0,0,20,24,19.6,4.4,0.0,0,0,...,1,0,1,0,0,1,0,0,1,0
4,24,1,0,20,24,19.6,4.4,0.227273,1,0,...,1,1,0,1,0,1,0,0,0,1


In [28]:
# Full list of column names after encoding.
contra.columns.tolist()

['wife_age',
 'num_child',
 'contraceptive',
 'age_bin_low',
 'age_bin_high',
 'median_marriage_age',
 'est_years_married',
 'kids_per_year',
 'education_gap',
 'contraceptive_use',
 'wife_education_1',
 'husband_education_2',
 'husband_education_3',
 'husband_education_4',
 'wife_religion_1',
 'wife_work_1',
 'husband_occupation_2',
 'husband_occupation_3',
 'husband_occupation_4',
 'standard_living_2',
 'standard_living_3',
 'media_exposure_1',
 'education_gap_categorical_0',
 'education_gap_categorical_1']

## 7. Scaling continuous variables.
The remaining variables (aside from the response variables, `contraceptive` and `contraceptive_use`, and the helper variables, `median_marriage_age`, `age_bin_low`, and `age_bin_high`) will be standardized.

- wife_age
- num_child
- kids_per_year
- est_years_married

In [29]:
# Columns to standardize; the order here is the column order of the scaled frame.
continuous_vars = [
    'wife_age',
    'num_child',
    'kids_per_year',
    'est_years_married',
]

In [30]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous columns (zero mean, unit variance).
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test rows influence the scaling statistics - confirm this is intended.
scaled_values = StandardScaler().fit_transform(contra[continuous_vars])

# Reuse contra's index so the later index-based join lines rows up explicitly
# instead of relying on contra happening to have a 0..n-1 index.
standardized_vars = pd.DataFrame(scaled_values,
                                 columns=continuous_vars,
                                 index=contra.index)

In [31]:
# Remove the unscaled versions of the continuous columns.
contra = contra.drop(columns=continuous_vars)

In [32]:
# Add the standardized columns back, matching rows by index (left join).
contra = contra.join(standardized_vars, how='left')

## Split Train and Test

In [33]:
# Split the transformed and untransformed frames in a single call so both
# receive the same row partition (25% held out, fixed seed).
(contra_train, contra_test,
 contra_train_untransf, contra_test_untransf) = train_test_split(
    contra, contra_untransf, test_size=0.25, random_state=100
)

## Save

In [34]:
# Shapes of the full, training and test frames.
[frame.shape for frame in (contra, contra_train, contra_test)]

[(1409, 24), (1056, 24), (353, 24)]

In [35]:
# Transformed data (one-hot encoded and standardized): full set, train, test.
contra.to_csv('../../1-data/contra.csv', index=False)
contra_train.to_csv('../../1-data/train.csv', index=False)
contra_test.to_csv('../../1-data/test.csv', index=False)

# Untransformed counterparts (engineered features, no encoding or scaling).
contra_untransf.to_csv('../../1-data/contra-untransf.csv', index=False)
contra_train_untransf.to_csv('../../1-data/train-untransf.csv', index=False)
# File name fixed from the misspelled 'test-untrasf.csv' to match the other
# '-untransf' outputs; update any reader that still loads the old name.
contra_test_untransf.to_csv('../../1-data/test-untransf.csv', index=False)