# Costa Rican Household Poverty Level Prediction
*From Kaggle ([competition link](https://www.kaggle.com/c/costa-rican-household-poverty-prediction))*
  
**By Nema Sobhani & David LaCharite**

## Summary

Income qualification for poor families in Costa Rica to determing need for aid. Data gathered from the *Inter-American Development Bank.*




## Imports




In [15]:
# General tools
import pandas as pd
import numpy as np

# Functions
from functions import *

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display
pd.options.display.max_columns = None
from pprint import pprint
import io

# Classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel

# Hyperparameter Tuning (Random Forest)

In [16]:
#Google COLAB file import (TRAIN)
# from google.colab import files
# trainUpload = files.upload()

In [17]:
#Google COLAB file import (TEST)
# from google.colab import files
# testUpload = files.upload()

# Classification

In [18]:
# Load dataframe 
# df = dataframe_generator("train.csv")
# df = dataframe_generator_rent("train.csv") # RENT PREDICTIONS
df = dataframe_generator_trans("train.csv") # RENT PREDICTIONS / TRANSFORMATIONS

# Partition explanatory and response variables
X = df.drop(columns=['Target', 'Id', 'idhogar'])
print(X.shape)

y = df['Target']
print(y.shape)



(9557, 141)
(9557,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_rent_nan['v2a1'] = pd.DataFrame(rent_pred).values


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7645, 141), (1912, 141), (7645,), (1912,))

## Random Forest Classifier

In [20]:
clf_RF = RandomForestClassifier(n_estimators=10)

In [21]:
clf_RF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
y_pred = clf_RF.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Precision
print("Precision:", precision_score(y_test, y_pred, average='macro'))

# Recall
print("Recall:", recall_score(y_test, y_pred, average='macro'))

# f1 Scores are intended for multiclassification models, not regression, but we were curious to see what would happen
f1_RF = f1_score(y_test, y_pred, average='macro')
print("f1 Score:", f1_RF)

Accuracy: 0.9063807531380753
Precision: 0.8889597821074937
Recall: 0.8301335441755888
f1 Score: 0.8564441758438537


## Random Search Training

In [23]:
pprint(clf_RF.get_params())

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [24]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [25]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [26]:
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [27]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = clf_RF, 
                               param_distributions = random_grid, n_iter = 100, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [28]:
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  9.0min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [29]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 40,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1400}

In [30]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 0.1281 degrees.
Accuracy = 92.75%.


In [31]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Average Error: 0.0722 degrees.
Accuracy = 95.95%.


In [33]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 3.44%.


In [35]:
# f1 Score
y_pred = best_random.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Precision
print("Precision:", precision_score(y_test, y_pred, average='macro'))

# Recall
print("Recall:", recall_score(y_test, y_pred, average='macro'))

# f1 Scores are intended for multiclassification models, not regression, but we were curious to see what would happen
f1_RF = f1_score(y_test, y_pred, average='macro')
print("f1 Score:", f1_RF)

Accuracy: 0.9476987447698745
Precision: 0.9323417270485088
Recall: 0.9008427281283008
f1 Score: 0.9156526743614666


## Summary

Rent Predicted? | Tranformed? | Classifier | Base Accuracy (MAPE) |  Best Accuracy (MAPE) | f1 Score
--- | --- | --- | --- | --- | ---
NO | NO | RF | 91.00% | 94.24% | 0.908
YES | NO | RF | 89.34% | 94.87% | 0.921
YES | YES | RF | 92.66% | 95.89% | 0.920

# Grid Search Training

In [36]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

grid_search = GridSearchCV(estimator = clf_RF, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [37]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  3.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110], 'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [100, 200, 300, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [38]:
best_grid_search = grid_search.best_estimator_
best_grid_search

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [39]:
# f1 Score
y_pred = best_grid_search.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Precision
print("Precision:", precision_score(y_test, y_pred, average='macro'))

# Recall
print("Recall:", recall_score(y_test, y_pred, average='macro'))

# f1 Scores are intended for multiclassification models, not regression, but we were curious to see what would happen
f1_RF = f1_score(y_test, y_pred, average='macro')
print("f1 Score:", f1_RF)

Accuracy: 0.7960251046025104
Precision: 0.9050425130926429
Recall: 0.5806230533343146
f1 Score: 0.6616467412507062


In [40]:
feature_importances = pd.DataFrame(grid_search.best_estimator_.feature_importances_,
                                   index = X_test.columns,
                                   columns=['importance']).sort_values('importance', 
                                   ascending=False)
feature_importances.head(10)

Unnamed: 0,importance
v2a1,0.038572
SQ_v2a1,0.035548
meaneduc,0.033986
LOG_meaneduc,0.033359
SQBmeaned,0.03302
SQBdependency,0.027008
LOG_SQBdependency,0.024716
hogar_nin,0.022378
SQBhogar_nin,0.021752
SQBovercrowding,0.021521


# Test on Kaggle Sample Test Data

In [41]:
# Google Colab Approach
# kaggle_test_df = pd.read_csv(io.BytesIO(testUpload['test.csv']))

# kaggle_test_df = dataframe_generator("test.csv")
# kaggle_test_df = dataframe_generator_rent("test.csv") # RENT PREDICTIONS
kaggle_test_df = dataframe_generator_trans("test.csv") # RENT PREDICTIONS / TRANSFORMATIONS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_rent_nan['v2a1'] = pd.DataFrame(rent_pred).values


In [42]:
kaggle_test_df.shape

(23856, 143)

In [43]:
kaggle_test_df.head()

Unnamed: 0,Id,hacdor,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,r4h3,r4m1,r4m2,r4m3,r4t1,r4t2,r4t3,tamhog,tamviv,escolari,hhsize,paredblolad,paredzocalo,paredpreb,pareddes,paredmad,paredzinc,paredfibras,paredother,pisomoscer,pisocemento,pisoother,pisonatur,pisonotiene,pisomadera,techozinc,techoentrepiso,techocane,techootro,cielorazo,abastaguadentro,abastaguafuera,abastaguano,public,planpri,noelec,coopele,sanitario1,sanitario2,sanitario3,sanitario5,sanitario6,energcocinar1,energcocinar2,energcocinar3,energcocinar4,elimbasu1,elimbasu2,elimbasu3,elimbasu4,elimbasu5,elimbasu6,epared1,epared2,epared3,etecho1,etecho2,etecho3,eviv1,eviv2,eviv3,dis,male,female,estadocivil1,estadocivil2,estadocivil3,estadocivil4,estadocivil5,estadocivil6,estadocivil7,parentesco1,parentesco2,parentesco3,parentesco4,parentesco5,parentesco6,parentesco7,parentesco8,parentesco9,parentesco10,parentesco11,parentesco12,idhogar,hogar_nin,hogar_adul,hogar_mayor,hogar_total,instlevel1,instlevel2,instlevel3,instlevel4,instlevel5,instlevel6,instlevel7,instlevel8,instlevel9,bedrooms,tipovivi1,tipovivi2,tipovivi3,tipovivi4,tipovivi5,computer,television,mobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,lugar6,area1,area2,SQBescolari,SQBage,SQBhogar_total,SQBhogar_nin,SQBovercrowding,SQBmeaned,agesq,SQ_SQBedjefe,LOG_qmobilephone,SQ_v2a1,SQBdependency,SQBedjefe,meaneduc,qmobilephone,rooms,LOG_meaneduc,SQ_qmobilephone,v2a1,SQ_overcrowding,LOG_SQBdependency
0,ID_2f6873615,0,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,0,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,1,0,0,0,0,0,0,0,0,2,1,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,16,9,1,2.25,272.25,16,0,0.693147,55533180000.0,0.25,0,16.5,2,5,2.80336,4,235654.8,2.25,-1.386294
1,ID_1c78846d2,0,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,16,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,0,0,0,0,0,0,0,1,0,2,1,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,256,1681,9,1,2.25,272.25,1681,0,0.693147,45199100000.0,0.25,0,16.5,2,5,2.80336,4,212600.8,2.25,-1.386294
2,ID_e5442cf6a,0,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,17,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,289,1681,9,1,2.25,272.25,1681,0,0.693147,45625300000.0,0.25,0,16.5,2,5,2.80336,4,213600.8,2.25,-1.386294
3,ID_a8db26a79,0,0,1,1,1,1.0,0,1,1,0,0,0,0,1,1,1,1,16,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5b598fbc9,0,1,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,256,3481,1,0,1.0,256.0,3481,65536,0.693147,2591476000000.0,0.0,256,16.0,2,14,2.772589,4,1609806.3,1.0,0.0
4,ID_a62966799,0,0,1,1,1,1.0,0,0,0,0,1,1,0,1,1,1,1,11,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1e2fc704e,1,0,0,1,0,0,0,0,1,0,0,0,0,2,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,121,324,1,1,0.25,121.0,324,0,0.0,30625000000.0,64.0,0,11.0,1,4,2.397895,1,175000.0,0.25,4.158883


In [44]:
# Partition explanatory and response variables
Id = kaggle_test_df.Id
X_kaggle_test = kaggle_test_df.drop(columns=['Id','idhogar'])

In [45]:
Target = grid_search.predict(X_kaggle_test)

In [46]:
Id = pd.Series(Id) 

In [47]:
Target = pd.Series(Target)
Target.head(10)

0    4
1    4
2    4
3    4
4    4
5    4
6    4
7    4
8    4
9    4
dtype: int64

In [48]:
kaggle_submit = pd.concat([Id, Target], axis=1)

In [49]:
kaggle_submit.columns = ['Id', 'Target']

In [59]:
kaggle_submit.head(10)

Unnamed: 0,Id,Target
0,ID_2f6873615,4
1,ID_1c78846d2,4
2,ID_e5442cf6a,4
3,ID_a8db26a79,4
4,ID_a62966799,4
5,ID_e77d38d45,4
6,ID_3c5f4bd51,4
7,ID_a849c29bd,4
8,ID_472fa82da,4
9,ID_24864adcc,4
