# Costa Rican Household Poverty Level Prediction
*From Kaggle ([competition link](https://www.kaggle.com/c/costa-rican-household-poverty-prediction))*
  
**By Nema Sobhani & David LaCharite**

## Summary

Income qualification for poor families in Costa Rica to determing need for aid. Data gathered from the *Inter-American Development Bank.*

## Imports

In [45]:
# General tools
import pandas as pd
import numpy as np

# Functions
from functions import *

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display
pd.options.display.max_columns = None
from pprint import pprint

# Classification
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel

# Feature Engineering

## Rent Prediction

We decided to use regression models to predict **rent** as an approach to filling missing values to increase our power in predicting **poverty level**.  


After testing with tree-style classifiers (Random Forrest, XGBoost) and linear models (Linear Regression, RidgeCV, LassoCV, ElasticNetCV), we found that **Random Forest Regression** gave the highest scores in predicting rent.

### DataFrame Setup

In [46]:
# Setting up new dataframe (including rent data)
df_rent = dataframe_generator('train.csv', rent=True)

In [47]:
print("Missing values of explanatory variables:", df_rent.drop(columns='v2a1').isna().sum().sum())
print("Missing values of target variable (rent):", df_rent.v2a1.isna().sum())

Missing values of explanatory variables: 0
Missing values of target variable (rent): 6860


### Classification Setup

In [48]:
# Rent Prediction Function
def dataframe_generator_rent(data):
    
    #_______________________________
    # DATAFRAME SETUP
    #_______________________________
    
    # Setting up new dataframe (including rent data)
    df_rent = dataframe_generator(data, rent=True)
    
    # Remove missing values for target (rent)
    df_rent_predict = df_rent.dropna()

    
    #_______________________________
    # CLASSIFICATION SETUP
    #_______________________________
    
    # Partition explanatory and response variables
    X = df_rent_predict.drop(columns=['v2a1', 'Id', 'idhogar'])
    y = df_rent_predict['v2a1']

    # Split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=12345)
    
    
    #_______________________________
    # CLASSIFICATION 
    # (using random forest because it consistently gave highest score)
    #_______________________________
    
    # XGB
    # clf = xgb.XGBClassifier(max_depth=6,n_estimators=100, n_jobs=-1, subsample=.7)
    # clf.fit(X_train, y_train)
    # print(clf.score(X_test, y_test))
    
    # Random Forest
    clf = RandomForestRegressor()
    clf.fit(X_train, y_train)
    # print(clf.score(X_test, y_test))
    
    
    #_______________________________
    # FILL NAN USING PREDICTED VALUES FROM MODEL
    #_______________________________
    
    # Prepare data to fill in predicted values for rent
    df_rent_nan = df_rent[df_rent.v2a1.isna()]
    
    # Predict using model
    rent_pred = clf.predict(df_rent_nan.drop(columns=['v2a1', 'Id', 'idhogar']))
    
    # Fill NaN
    df_rent_nan['v2a1'] = pd.DataFrame(rent_pred).values
    
    # Update full dataframe
    df_rent[df_rent.v2a1.isna()] = df_rent_nan
    
    
    return df_rent

In [49]:
df_rent = dataframe_generator_rent('train.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [50]:
df_rent.to_pickle("df_rent.pkl")

# Transformations

We will take random subsets of the top 8 variables (determined from classification on full model without rent), along with their square and log transformations and see if any result in better scoring. 

In [51]:
# Setting up list of variable names including untransformed, squares, and logs
top_features = ['v2a1', 'meaneduc', 'SQBedjefe', 'overcrowding', 'SQBdependency', 'age', 'rooms', 'qmobilephone']
top_features_SQ = ["SQ_" + i for i in top_features]
top_features_LOG = ["LOG_" + i for i in top_features]

top_features_master = top_features + top_features_SQ + top_features_LOG

In [52]:
# Taking random subsets of varying sizes and running classifier to determine best subset
import random
random.seed(12345)

best_subset = None

# Iterate to find best feature subset and subset size
print("Currently running best_subset simulations of sample size:")
for size in range(8, 25):
    print(size)
    for run in range(10): # Scale up to 100, 1000, 10000, etc depending on computational power
        
        # Make copy of dataframe and remove original top 8 features
        df_test = df_rent.copy(deep=True)
        df_test.drop(columns=top_features, inplace=True)
        
        # Randomly sample subset without replacement
        subset = random.sample(top_features_master, size)
        
        # Add columns to dataframe
        for feature in subset:
            if "SQ_" in feature:
                col = feature.split("SQ_")[1]
                df_test[col] = df_rent[col] ** 2
                
            elif "LOG_" in feature:
                col = feature.split("LOG_")[1]
                df_test[col] = df_rent[col].apply(lambda x: np.log(x) if x!=0 else x)
                
            else:
                col = feature
                df_test[col] = df_rent[col]
            
        # Run model in Random Forest
        X = df_test.drop(columns=['Target', 'Id', 'idhogar'])
        y = df_test['Target']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=12345)
        
        clf_RF = RandomForestClassifier(n_estimators=10)
        clf_RF.fit(X_train, y_train)
        
        score = clf_RF.score(X_test, y_test)
        y_pred = clf_RF.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='macro')
        
        if best_subset:
            if f1 > best_subset[3]:
                best_subset = [subset, size, score, f1]
        else:
            best_subset = [subset, size, score, f1]
        
best_subset

Currently running best_subset simulations of sample size:
8
9
10
11
12
13
14
15
16
17
18
19
20


[['SQBedjefe',
  'SQ_SQBedjefe',
  'LOG_meaneduc',
  'v2a1',
  'LOG_overcrowding',
  'LOG_v2a1',
  'LOG_rooms',
  'LOG_SQBdependency',
  'rooms',
  'SQ_qmobilephone',
  'SQ_v2a1',
  'LOG_age',
  'meaneduc',
  'age',
  'SQ_age',
  'SQBdependency',
  'SQ_SQBdependency'],
 17,
 0.9100418410041841,
 0.8665894529811344]

In [53]:
# # TOO COMPUTATIONALLY EXPENSIVE
# # Trying all combinations of varying sizes and running classifier to determine best subset
# from itertools import combinations

# best_subset = None

# # Iterate to find best feature subset and subset size
# print("Currently running best_subset simulations of sample size:")

# for size in range(8, 21):
    
# #     print(size)
    
#     # Combinations to try
#     comb = list(combinations(top_features_master, size))
    
#     for run in range(len(comb)):
    
#         # Status Update
#         if len(comb) % (run+1) == 0:
#             print(f'Size {size}: {(run+1) / len(comb) * 100:.5f}%')
            
#         # Subset
#         subset = comb[run]
        
#         # Make copy of dataframe and remove original top 8 features
#         df_test = df_rent.copy(deep=True)
#         df_test.drop(columns=top_features, inplace=True)
        
#         # Add columns to dataframe
#         for feature in subset:
#             if "SQ_" in feature:
#                 col = feature.split("SQ_")[1]
#                 df_test[col] = df_rent[col] ** 2
                
#             elif "LOG_" in feature:
#                 col = feature.split("LOG_")[1]
#                 df_test[col] = df_rent[col].apply(lambda x: np.log(x) if x!=0 else x)
                
#             else:
#                 col = feature
#                 df_test[col] = df_rent[col]
            
#         # Run model in Random Forest
#         X = df_test.drop(columns='Target')
#         y = df_test['Target']
        
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=12345)
        
#         clf_RF = RandomForestClassifier(n_estimators=10)
#         clf_RF.fit(X_train, y_train)
        
#         score = clf_RF.score(X_test, y_test)
#         y_pred = clf_RF.predict(X_test)
#         f1 = f1_score(y_test, y_pred, average='macro')
        
#         if best_subset:
#             if f1 > best_subset[3]:
#                 best_subset = [subset, size, score, f1]
#         else:
#             best_subset = [subset, size, score, f1]
        
# best_subset

## Create New DataFrame for Classification

In [54]:
# Transformed DataFrame Generator
def dataframe_generator_trans(data):
    
    # Top Features
    top_features = ['v2a1', 'meaneduc', 'SQBedjefe', 'overcrowding', 'SQBdependency', 'age', 'rooms', 'qmobilephone']

    # Best subset
    winner = \
            [['SQ_SQBedjefe',
              'LOG_qmobilephone',
              'SQ_v2a1',
              'SQBdependency',
              'SQBedjefe',
              'meaneduc',
              'qmobilephone',
              'rooms',
              'LOG_meaneduc',
              'SQ_qmobilephone',
              'v2a1',
              'SQ_overcrowding',
              'LOG_SQBdependency'],
             13,
             0.9257322175732218,
             0.8887133182436542]
            
    # Create rent-inclusive dataframe
    df_rent = dataframe_generator_rent(data)
    
    # Create transformed dataframe
    df_trans = df_rent.copy(deep=True)
    df_trans.drop(columns=top_features, inplace=True)

    for feature in winner[0]:
        if "SQ_" in feature:
            col = feature.split("SQ_")[1]
            df_trans[feature] = df_rent[col] ** 2

        elif "LOG_" in feature:
            col = feature.split("LOG_")[1]
            df_trans[feature] = df_rent[col].apply(lambda x: np.log(x) if x!=0 else x)

        else:
            col = feature
            df_trans[feature] = df_rent[col]
            
    return df_trans

In [55]:
df_trans = dataframe_generator_trans("train.csv")
df_trans.to_pickle("df_trans.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
