# Data cleaning and dataset creation

## Imports

In [176]:
# Internal functions
import clean
import tuning
from split import cr_pmt_split

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1 | Data imports and cleaning

First, load the data and remove extraneous variables. Then, collapse categories at the respondent level. Finally, collapse the household-level response categories.

In [177]:
# Import data: read "train.csv" through the project's clean.load_data helper.
# The path is relative, so it is resolved against the kernel's working directory.
cr_df = clean.load_data("train.csv")

In [178]:
# Apply the cleaning helpers in a fixed order. Each helper receives the
# current frame and its return value is passed on to the next helper.
cleaning_steps = (
    # Adjust missing values
    clean.handle_missing,
    # Collapse individual-level categories
    clean.clean_educ_cats,
    clean.clean_marital_cats,
    clean.clean_hhh_rel_cats,
    # Collapse household-level response categories
    clean.clean_pared_material_cats,
    clean.clean_piso_material_cats,
    clean.clean_sanitario_cats,
    clean.clean_tipovivi_cats,
)

for apply_step in cleaning_steps:
    cr_df = apply_step(cr_df)

## 2 | Feature engineering

This section creates features used in the analysis.

In [179]:
# TODO: the feature-engineering steps for this section still need to be written.
# Until then, preview the first rows of the cleaned data.
cr_df.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q1,r4t3,escolari,...,"piso_material_mosaic, ceramic, terrazo",piso_material_other,piso_material_wood,rubbish_disposal_1 tanker truck,rubbish_disposal_3 burning,rubbish_disposal_6 other,tipovivi_fully paid,tipovivi_other,tipovivi_own,tipovivi_rented
0,ID_279628684,190000.0,0,3,0,1,1,0.0,1,10,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1.0,1,12,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,ID_68de51c94,0.0,0,8,0,1,1,0.0,1,11,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,ID_d671db89c,180000.0,0,5,0,1,1,1.0,4,9,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1.0,4,11,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## 3 | Train-test split

Collapse the dataset to the household level, then create training, validation, and test sets for model training.

In [180]:
# Collapse the data to the household level, then remove the
# individual-level variables (as the helper names indicate).
household_df = clean.collapse_df(cr_df)
cr_df = clean.drop_indiv_vars(household_df)

In [181]:
# Conduct the train / validation / test split with a fixed seed so the
# partition is reproducible across runs.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = cr_pmt_split(cr_df, seed=42)

Training set prior to SMOTE: 2390
Training set size after SMOTE: 5463
Validation set size after SMOTE: 781
Test set size: 598


In [182]:
# Conduct the split for cross-validation (cv=True returns only train and
# test pieces).
# NOTE: this rebinds X_train, y_train, X_test and y_test from the previous
# split; X_val and y_val keep their earlier values and no longer correspond
# to this training set.
(
    X_train, y_train,
    X_test, y_test,
) = cr_pmt_split(cr_df, cv=True, seed=42)

Training set size prior to SMOTE (prior to CV): 2390
Training set size after SMOTE (prior to CV): 6244
Test set size: 598


## 4 | Model tuning

We tune three sets of models (random forest, logistic regression, and k-nearest neighbors) and store each model's tuning results in its own variable.

In [183]:
# Tune 3 sets of models (1 of 3): random forest.
# X_train / y_train here are the ones produced by the most recent split cell.
rf = tuning.tune_sklearn_models(X_train, y_train, 'RandomForestClassifier')

In [184]:
# Tune logistic regression on the same training data as the other models.
lm = tuning.tune_sklearn_models(X_train, y_train, 'LogisticRegression')



In [185]:
# Tune k-nearest neighbors on the same training data as the other models.
knn = tuning.tune_sklearn_models(X_train, y_train, 'KNeighborsClassifier')

In [186]:
# Print each tuning result in full, followed by its best parameter set
# (element 0 of the result, indexed by element 2).
#
# Best parameters recorded from MR's trained CV run, kept here as comments:
#   rf:  {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.75, 'min_samples_leaf': 1, 'n_estimators': 100}
#   lm:  {'C': 10, 'penalty': 'l1'}
#   knn: {'leaf_size': 3}  (MR is skeptical this isn't overfit)
for tuned in (rf, lm, knn):
    print(tuned)
    print(tuned[0][tuned[2]])

([{'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 1, 'n_estimators': 25}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 1, 'n_estimators': 50}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 1, 'n_estimators': 100}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 5, 'n_estimators': 25}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 5, 'n_estimators': 50}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 5, 'n_estimators': 100}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 10, 'n_estimators': 25}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 10, 'n_estimators': 50}, {'criterion': 'gini', 'max_depth': None, 'max_samples': 0.25, 'min_samples_leaf': 10, 'n_estimators': 100}, {'criterion': 'gini', 'max_depth': Non