# Data cleaning and dataset creation

## Imports

In [117]:
# Internal functions
import clean
from split import cr_pmt_split

# Enable IPython's autoreload extension so edits to the local modules imported
# above (clean, split) are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1 | Data imports and cleaning

First, load the data and remove extraneous variables. Then, collapse categories at the respondent level. Finally, collapse the household-level response categories.

In [118]:
# Import data: read the raw training file through the project's clean module.
# NOTE(review): "train.csv" is passed as-is; presumably it is resolved relative
# to the notebook's working directory — confirm in clean.load_data.
cr_df = clean.load_data("train.csv")

In [119]:
# Cleaning pipeline: every step takes the current frame and hands back the
# updated one, so the steps are listed once and applied strictly in order.
cleaning_steps = [
    # Adjust missing values
    clean.handle_missing,
    # Collapse individual-level categories
    clean.clean_educ_cats,
    clean.clean_marital_cats,
    clean.clean_hhh_rel_cats,
    # Collapse household-level response categories
    clean.clean_pared_material_cats,
    clean.clean_piso_material_cats,
    clean.clean_sanitario_cats,
    clean.clean_tipovivi_cats,
]

# Rebind cr_df after each step, exactly as a sequence of assignments would
for apply_step in cleaning_steps:
    cr_df = apply_step(cr_df)

## 2 | Feature engineering

This section creates features used in the analysis.

In [120]:
# TODO: the feature-engineering steps for this section still need to be written.
# Until then, preview the first rows of the cleaned data as a sanity check.
cr_df.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q1,r4t3,escolari,...,"piso_material_mosaic, ceramic, terrazo",piso_material_other,piso_material_wood,rubbish_disposal_1 tanker truck,rubbish_disposal_3 burning,rubbish_disposal_6 other,tipovivi_fully paid,tipovivi_other,tipovivi_own,tipovivi_rented
0,ID_279628684,190000.0,0,3,0,1,1,0.0,1,10,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1.0,1,12,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,ID_68de51c94,0.0,0,8,0,1,1,0.0,1,11,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,ID_d671db89c,180000.0,0,5,0,1,1,1.0,4,9,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1.0,4,11,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## 3 | Train-test split

Collapse the dataset to the household level and create training, validation, and test sets for model training.

In [121]:
# Collapse the data to the household level, then pass the collapsed frame
# straight to drop_indiv_vars (which, by its name, removes the
# individual-level variables) and keep the result as cr_df
cr_df = clean.drop_indiv_vars(clean.collapse_df(cr_df))

In [122]:
# Conduct split: cr_pmt_split returns six objects, unpacked here as
# (presumably) feature/label pairs for the training, validation, and test sets.
# The helper also prints the set sizes before and after SMOTE resampling.
# NOTE(review): the printed sizes suggest the validation and test sets are
# resampled with SMOTE as well as the training set — confirm in split.py that
# this is intended, since held-out sets are normally left unresampled.
X_train, y_train, X_val, y_val, X_test, y_test = cr_pmt_split(cr_df)

Training set prior to SMOTE: 2390
Training set size after SMOTE: 5463
Validation set size after SMOTE: 781
Test set size prior to SMOTE 598
Test set size after SMOTE: 1576


In [123]:
# Conduct split for CV: with cv = True the helper returns four objects
# (training and test only, no separate validation set), per the unpacking below.
# NOTE(review): this rebinds X_train, y_train, X_test and y_test from the
# previous cell, while X_val / y_val keep their earlier values — later cells
# will see a mix of both splits unless that is intended.
X_train, y_train, X_test, y_test = cr_pmt_split(cr_df, cv = True)

Training set size prior to SMOTE (prior to CV): 2390
Training set size after SMOTE (prior to CV): 6244
Test set size: 1576


## 4 | Model tuning

We tune 3 sets of models and store the various results in a list.

In [None]:
# Tune 3 sets of models