# Splitting into train and test.
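This notebook loads the contraceptive survey data, engineers a few additional features, splits the result into train and test sets, and saves all three tables to CSV.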

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

In [2]:
# Recode the raw target so the classes are numbered 0, 1, 2
# (raw 1 -> 0, raw 3 -> 1, raw 2 -> 2). Any other raw value becomes NaN.
this_dic = {1: 0, 3: 1, 2: 2}

# Read the survey data and apply the recoding to the target column.
contra = pd.read_csv('../../1-data/contraceptive_for_students.csv')
contra = contra.assign(contraceptive=contra['contraceptive'].map(this_dic))

In [4]:
# Preview the first five rows (the default for `head`) after recoding the target.
contra.head()

Unnamed: 0,wife_age,wife_education,husband_education,num_child,wife_religion,wife_work,husband_occupation,standard_living,media_exposure,contraceptive
0,24,2,3,3,1,1,2,3,0,0
1,45,1,3,10,1,1,3,4,0,0
2,43,2,3,7,1,1,3,4,0,0
3,42,3,2,9,1,1,3,3,0,0
4,36,3,3,8,1,1,3,2,0,0


## Feature Engineering

In [5]:
# Right-closed age brackets (20, 24], (24, 29], ..., (44, 49], built from
# their shared edges. An age of exactly 20 or below, or above 49, falls
# outside every bracket.
age_intervals = pd.IntervalIndex.from_breaks([20, 24, 29, 34, 39, 44, 49])

# One-column frame (column label 0) holding one bracket per row.
age_interval_df = pd.DataFrame(age_intervals)

In [6]:
# One median marriage age per age bracket, listed in the same order as the
# rows of age_interval_df.
# NOTE(review): the source of these figures is not recorded in this notebook.
age_interval_df = age_interval_df.assign(
    median_marriage_age=[19.6, 18.1, 17.6, 16.8, 16.4, 16.5]
)

In [7]:
# Give the bracket column a readable name and store the brackets as plain
# strings such as "(20, 24]" so they can be parsed and joined on later.
age_interval_df = (
    age_interval_df
    .rename(columns={0: 'age_bin'})
    .assign(age_bin=lambda frame: frame['age_bin'].astype(str))
)

In [8]:
# Display the lookup table built so far: age bracket label and its median marriage age.
age_interval_df

Unnamed: 0,age_bin,median_marriage_age
0,"(20, 24]",19.6
1,"(24, 29]",18.1
2,"(29, 34]",17.6
3,"(34, 39]",16.8
4,"(39, 44]",16.4
5,"(44, 49]",16.5


In [9]:
# Split each bracket label such as "(20, 24]" into its lower and upper bound
# as strings.
#
# The pattern is a raw string: in a normal string literal `\(` and `\]` are
# invalid escape sequences, which Python reports with a warning.
#
# The label is split on ',' only, so the upper bound keeps the space that
# follows the comma (e.g. " 24"). That is intentional here: the per-row
# bounds computed for `contra` are produced the same way, and a default
# merge joins on every shared column, so both sides must stay in sync.
bracket_chars = r'\(|\]'

bin_parts = [
    [re.sub(bracket_chars, '', piece) for piece in label.split(',')]
    for label in age_interval_df['age_bin']
]

lows = [parts[0] for parts in bin_parts]
highs = [parts[1] for parts in bin_parts]

In [10]:
# Store the parsed bounds next to each bracket label.
age_interval_df = age_interval_df.assign(
    age_bin_low=lows,
    age_bin_high=highs,
)

We are using `pd.cut` to create the age ranges within the initial dataset.

In [12]:
# Assign every wife to one of the age brackets; ages that fall outside all
# brackets come back as NaN.
binned_age = pd.cut(contra['wife_age'], bins=age_intervals)

# Replace the NaNs with the literal string 'None' (it must first be registered
# as a category), then store the bracket labels as plain strings.
contra['age_bin'] = (
    binned_age
    .cat.add_categories('None')
    .fillna('None')
    .astype(str)
)

In [15]:
# For every row of `contra`, split its bracket label into lower and upper
# bound strings; rows without a bracket get 'None' for both.
#
# The pattern is a raw string: in a normal string literal `\(`, `\]` and `\.`
# are invalid escape sequences, which Python reports with a warning. Besides
# the brackets it also removes any ".0" substring from the label.
#
# The label is split on ',' only, so the upper bound keeps the space that
# follows the comma (e.g. " 24"). The bounds stored in age_interval_df are
# parsed the same way, and a default merge joins on every shared column, so
# the two must stay in sync.
#
# Iterating over the column values (rather than looking rows up by the labels
# 0..n-1) keeps the lists in row order whatever the frame's index is.
strip_pattern = r'\(|\]|\.0'

lows = []
highs = []

for label in contra['age_bin']:
    if label == 'None':
        lows.append('None')
        highs.append('None')
        continue

    pieces = [re.sub(strip_pattern, '', piece) for piece in label.split(',')]
    lows.append(pieces[0])
    highs.append(pieces[1])

In [16]:
# Append the per-row bracket bounds as two new columns.
contra = contra.assign(
    age_bin_low=lows,
    age_bin_high=highs,
)

In [18]:
# Attach median_marriage_age to every row by joining on all columns the two
# frames share (no `on` is given, so pandas uses the common column names).
# NOTE(review): this is an inner join, so rows whose age_bin is 'None' have
# no match in age_interval_df and are dropped here - confirm that is intended.
contra = pd.merge(contra, age_interval_df, how='inner')

We are creating a column called `est_years_married` that is the wife's age minus the median marriage age for her age group.

In [19]:
# Estimated years married = wife's age minus the median marriage age of her bracket.
contra['est_years_married'] = contra['wife_age'].sub(contra['median_marriage_age'])

We are creating `kids_per_year`, which is the number of children divided by the estimated number of years married.

In [20]:
# Children per estimated year of marriage.
contra['kids_per_year'] = contra['num_child'].div(contra['est_years_married'])

We also are creating `education_gap` which is the level of husband's education minus the level of the wife's.

In [21]:
# Positive when the husband's education level is higher than the wife's.
contra['education_gap'] = contra['husband_education'].sub(contra['wife_education'])

We are also adding a binary `contraceptive_use` covariate that is `0` if a woman does not use contraception and `1` if she does.

In [22]:
# Collapse the three-class target into a binary flag:
# class 0 stays 0, classes 1 and 2 both become 1.
this_dic = {0: 0, 1: 1, 2: 1}
contra = contra.assign(contraceptive_use=contra['contraceptive'].map(this_dic))

The interaction effect between the husband's and the wife's education levels (the product of the two).

In [24]:
# Interaction term: product of the wife's and the husband's education levels.
contra['edu_interaction'] = contra['wife_education'].mul(contra['husband_education'])

## Split Train and Test

In [21]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
contra_train, contra_test = train_test_split(
    contra,
    test_size=0.25,
    random_state=100,
)

## Save

In [22]:
# Write the full engineered table and both splits to CSV, without the index.
outputs = {
    '../../1-data/contra.csv': contra,
    '../../1-data/train.csv': contra_train,
    '../../1-data/test.csv': contra_test,
}

for out_path, frame in outputs.items():
    frame.to_csv(out_path, index=False)