## Import

In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV


In [7]:
# data directory
if os.path.exists("C:\\Users\\elieg\\Dropbox\\World Bank DS competition") == True:
    #Elliot
    computer_dir = "C:\\Users\\elieg\\Dropbox\\World Bank DS competition"
elif os.path.exists("C:\\Users\\elieg\\Dropbox\\World Bank DS competition") == True:
    #Elie
    computer_dir = "C:\\Users\\elieg\\Dropbox\\World Bank DS competition"
elif os.path.exists("/Users/manueltonneau/Dropbox/World_Bank_DS_competition/") == True:
    #Manuel
    computer_dir = "/Users/manueltonneau/Dropbox/World_Bank_DS_competition/"

DATA_DIR = os.path.join(computer_dir,"data")
data_paths_indiv = {'A': {'train': os.path.join(DATA_DIR, 'A', 'A_indiv_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'A', 'A_indiv_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR, 'B', 'B_indiv_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'B', 'B_indiv_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR, 'C', 'C_indiv_train.csv'), 
                    'test':  os.path.join(DATA_DIR, 'C', 'C_indiv_test.csv')}}

In [8]:
# load individual-level data
indiv_train = {}
indiv_test = {}
for letter in ["A", "B", "C"]:
    indiv_train[letter] = pd.read_csv(data_paths_indiv[letter]['train'], index_col='id')
    indiv_test[letter] = pd.read_csv(data_paths_indiv[letter]['test'], index_col='id')

In [9]:
indiv_train["A"].head()

Unnamed: 0_level_0,iid,HeUgMnzF,CaukPfUC,MzEtIdUF,gtnNTNam,SWoXNmPc,eXbOkwhI,OdXpbPGJ,XONDGWjH,KsFoQcUV,...,ukWqmeSS,qqVibbSA,MgCoFhXK,rFpoTXAq,RXcLsVAQ,rQWIpTiG,XizJGmbu,xqUooaNJ,poor,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80389,1,XJsPz,mOlYV,UFoKR,SSvEP,onRNG,YXCNt,4.0,oArAw,kpkiH,...,181,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,dSJoN,True,A
80389,2,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,4.0,ccbZA,HgfUG,...,141,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,True,A
80389,3,TRFeI,mOlYV,axSTs,CXizI,NDnCs,YXCNt,4.0,fOUHD,HgfUG,...,41,QQdHS,gCSRj,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,True,A
80389,4,XJsPz,yAyAe,FRcdT,CXizI,onRNG,YXCNt,,fOUHD,HgfUG,...,16,QQdHS,uEstx,Hikoa,zQvdC,jVHyH,GtHel,JTCKs,True,A
39883,1,XJsPz,mOlYV,UFoKR,HIvIU,onRNG,YXCNt,4.0,oArAw,kpkiH,...,381,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,UaIsy,False,A


## Target analysis

In [31]:
#A
target_A = indiv_train["A"]["poor"]
a = pd.get_dummies(target_A)
a.reset_index(level=0, inplace=True)
b = a.groupby(["id"])[True].mean()
df = pd.DataFrame(data=b)
df.loc[(df[True] >0 ) & df[True]<0]


Unnamed: 0_level_0,True
id,Unnamed: 1_level_1


In [32]:
#B
target_B = indiv_train["B"]["poor"]
a = pd.get_dummies(target_B)
a.reset_index(level=0, inplace=True)
b = a.groupby(["id"])[True].mean()
df = pd.DataFrame(data=b)
df.loc[(df[True] >0 ) & df[True]<0]

Unnamed: 0_level_0,True
id,Unnamed: 1_level_1


In [33]:
#C
target_C = indiv_train["C"]["poor"]
a = pd.get_dummies(target_C)
a.reset_index(level=0, inplace=True)
b = a.groupby(["id"])[True].mean()
df = pd.DataFrame(data=b)
df.loc[(df[True] >0 ) & df[True]<0]

Unnamed: 0_level_0,True
id,Unnamed: 1_level_1


The conclusion is that, if a household is poor, then all of its members are labelled as "poor".

## Pre-processing

In [43]:
# Standardize features
def standardize(df, letter, train_or_test, numeric_only=True):
    numeric = df.select_dtypes(include=['int64', 'float64'])
    numeric_cols = df[numeric.columns]
    
    # replace NANs by training sample mean values and store them for the test
    if train_or_test == "train":
        meanvalues = pd.DataFrame(numeric_cols.mean()).transpose()
        meanvalues.to_csv(DATA_DIR+"\\"+"meanvalues_"+str(letter))
    meanvalues = pd.read_csv(DATA_DIR+"\\"+"meanvalues_"+str(letter), index_col = "Unnamed: 0")
    for col in numeric_cols.columns:
        numeric_cols[col] = numeric_cols[col].fillna(float(meanvalues[col].values))
    
    # use standard scaling with training sample parameters
    scaler = StandardScaler()
    if train_or_test == "train":
        scaler.fit(numeric_cols)
        joblib.dump(scaler, DATA_DIR+"\\"+"scaler_"+str(letter))
    scaler = joblib.load(DATA_DIR+"\\"+"scaler_"+str(letter))
    numeric_cols = pd.DataFrame(scaler.transform(numeric_cols))
   
    # Former version from competition website: subtracy mean and divide by std
    #df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()

    return df, numeric.columns
    

def pre_process_data(df, letter, train_or_test, enforce_cols=None):
    print(train_or_test)
    print("Input shape:\t{}".format(df.shape))
        

    df, num_cols = standardize(df, letter, train_or_test)
    print("After standardization {}".format(df.shape))
        
    # create dummy variables for categoricals
    df = pd.get_dummies(df)
    print("----------> After converting categoricals:\t{}".format(df.shape))
    
    # match test set and training set columns
    # here a column is a couple variable-mode
    # if a mode is not present in the test set, a column will be missing -> it needs to be added (and filled with 0)
    if enforce_cols is not None:
        enforce_cols.remove("id")
        enforce_cols.remove("iid")
        to_drop = np.setdiff1d(df.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, df.columns)
        #print("To drop : ", to_drop)
        #print("To add : ", to_add)

        df = df.drop(to_drop, axis=1)
        df = df.assign(**{c: 0 for c in to_add})
        print("----------> After adapting columns number:\t{}".format(df.shape))
    
    df.fillna(0, inplace=True)
    
    if train_or_test == "train":
        return df, num_cols
    else:
        return df

In [54]:
indiv_train_proc = {}
numeric_cols_train = {}
indiv_test_proc = {}
for letter in ["A", "B", "C"]:
    #pre-process everything but "iid" and "id" column, then add them again
    print("Train")
    indiv_train_proc[letter], numeric_cols_train[letter] = pre_process_data\
    (indiv_train[letter].loc[:, indiv_train[letter].columns != "iid"].drop('poor', axis=1), letter, "train")
    indiv_train_proc[letter]["iid"] = indiv_train[letter]["iid"]
    indiv_train_proc[letter]["id"] = indiv_train[letter].index
    indiv_train_proc[letter].index = indiv_train_proc[letter].index.rename("index")
    print("Test")
    indiv_test_proc[letter] = pre_process_data\
    (indiv_test[letter].loc[:, indiv_test[letter].columns != "iid"], letter, "test",\
     enforce_cols = list(indiv_train_proc[letter].columns)) # enforce_cols is needed to adapt columns to the training set
    indiv_test_proc[letter]["iid"] = indiv_test[letter]["iid"]
    indiv_test_proc[letter]["id"] = indiv_test[letter].index
    indiv_test_proc[letter].index = indiv_test_proc[letter].index.rename("index")

Train
A
Input shape:	(37560, 41)
After standardization (37560, 41)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


----------> After converting categoricals:	(37560, 274)
Test
A
Input shape:	(18535, 41)
After standardization (18535, 41)
----------> After converting categoricals:	(18535, 273)
----------> After adapting columns number:	(18535, 274)
Train
B
Input shape:	(20252, 224)
After standardization (20252, 224)
----------> After converting categoricals:	(20252, 1641)
Test
B
Input shape:	(10066, 224)
After standardization (10066, 224)
----------> After converting categoricals:	(10066, 1534)
----------> After adapting columns number:	(10066, 1641)
Train
C
Input shape:	(29913, 41)
After standardization (29913, 41)
----------> After converting categoricals:	(29913, 300)
Test
C
Input shape:	(14701, 41)
After standardization (14701, 41)
----------> After converting categoricals:	(14701, 301)
----------> After adapting columns number:	(14701, 300)


### Feature Engineering

Number of household members variable

In [55]:
hh_train_new = {}
hh_test_new = {}
for letter in ["A", "B", "C"]:
    hh_train_new[letter] = pd.DataFrame(data=indiv_train_proc[letter].groupby(['id'], sort=False)['iid'].max())
    hh_train_new[letter].columns = ["nb_members_hh"]
    hh_test_new[letter] = pd.DataFrame(data=indiv_test_proc[letter].groupby(['id'], sort=False)['iid'].max())
    hh_test_new[letter].columns = ["nb_members_hh"]

In [46]:
print(numeric_cols_train["A"])
indiv_train_proc["C"].head()

Index(['OdXpbPGJ', 'ukWqmeSS'], dtype='object')


Unnamed: 0_level_0,XKQWlRjk,vWNISgEA,bsMfXBld,XKyOwsRR,CgAkQtOd,OoqEwyJF_RuCZA,OoqEwyJF_cEcbt,cJPCnaAs_BZKME,cJPCnaAs_DMxNA,cJPCnaAs_Eadzw,...,rVneGwzn_UXHpZ,rVneGwzn_ldKFc,rVneGwzn_xgpHA,uVFOfrpa_DnIbO,uVFOfrpa_kXobL,uVFOfrpa_oacjJ,uVFOfrpa_xRxWC,country_C,iid,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30639,122,9,9.0,-3,-7.0,0,1,0,0,0,...,0,0,1,1,0,0,0,1,1,30639
30639,68,9,9.0,-3,-7.0,0,1,0,0,0,...,0,0,1,1,0,0,0,1,2,30639
30639,1,9,9.0,-3,-7.0,0,1,0,0,0,...,0,1,0,0,1,0,0,1,3,30639
30639,1,104,-19.2,249,-53.8699,0,1,0,0,0,...,0,0,0,0,0,0,1,1,9,30639
30639,1,113,-27.0,237,-56.926,0,1,0,0,0,...,0,0,0,0,0,0,1,1,10,30639


In [56]:
#creation of sum of individual-level variable per household
# relevant for dummies and for numeric variables
for letter in ["A", "B", "C"]:
    for i in indiv_train_proc[letter].columns.tolist()[:-3]:
        hh_train_new[letter][i+"_sum"] = indiv_train_proc[letter].groupby(["id"],sort=False)[i].sum()
    for i in indiv_test_proc[letter].columns.tolist()[:-3]:
        hh_test_new[letter][i+"_sum"] = indiv_test_proc[letter].groupby(["id"],sort=False)[i].sum()
        

In [58]:
# quantitative variables
# for them we can try other ways of summarizing the information: minimum, maximum, mean, median
for letter in ["A", "B", "C"]:
    print(len(numeric_cols_train[letter]))
    for i in numeric_cols_train[letter].tolist():
        hh_train_new[letter][i+"_mean"] = indiv_train_proc[letter].groupby(["id"],sort=False)[i].mean()
        hh_train_new[letter][i+"_min"] = indiv_train_proc[letter].groupby(["id"],sort=False)[i].min()
        hh_train_new[letter][i+"_max"] = indiv_train_proc[letter].groupby(["id"],sort=False)[i].max()
        hh_train_new[letter][i+"_median"] = indiv_train_proc[letter].groupby(["id"],sort=False)[i].quantile(q = 0.5)
        hh_train_new[letter][i+"_std"] = indiv_train_proc[letter].groupby(["id"],sort=False)[i].std()
    for i in numeric_cols_train[letter].tolist():
        hh_test_new[letter][i+"_mean"] = indiv_test_proc[letter].groupby(["id"],sort=False)[i].mean()
        hh_test_new[letter][i+"_min"] = indiv_test_proc[letter].groupby(["id"],sort=False)[i].min()
        hh_test_new[letter][i+"_max"] = indiv_test_proc[letter].groupby(["id"],sort=False)[i].max()
        hh_test_new[letter][i+"_median"] = indiv_test_proc[letter].groupby(["id"],sort=False)[i].quantile(q = 0.5)
        hh_test_new[letter][i+"_std"] = indiv_test_proc[letter].groupby(["id"],sort=False)[i].std()


2
32
5


In [62]:
hh_test_new["A"].head()

Unnamed: 0_level_0,nb_members_hh,OdXpbPGJ_sum,ukWqmeSS_sum,HeUgMnzF_BNCcM_sum,HeUgMnzF_HUpWg_sum,HeUgMnzF_PAVsH_sum,HeUgMnzF_SJPkb_sum,HeUgMnzF_SlRmt_sum,HeUgMnzF_TRFeI_sum,HeUgMnzF_XJgvq_sum,...,OdXpbPGJ_mean,OdXpbPGJ_min,OdXpbPGJ_max,OdXpbPGJ_median,ukWqmeSS_mean,ukWqmeSS_min,ukWqmeSS_max,ukWqmeSS_median,OdXpbPGJ_std,ukWqmeSS_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9370,5,16.0,310,0,0,0,0,1,0,0,...,3.2,0.0,4.0,4.0,62.0,11,141,46.0,1.788854,49.924944
88630,1,4.0,141,0,0,0,0,0,0,0,...,4.0,4.0,4.0,4.0,141.0,141,141,141.0,,
42512,6,24.0,931,0,0,0,0,0,0,0,...,4.0,4.0,4.0,4.0,155.166667,26,356,93.5,0.0,151.37426
62674,4,12.0,334,0,0,0,0,1,0,0,...,3.0,0.0,4.0,4.0,83.5,6,171,78.5,2.0,77.942286
89299,4,16.0,559,0,0,0,0,0,0,0,...,4.0,4.0,4.0,4.0,139.75,61,241,128.5,0.0,92.769877


In [35]:
indiv_train["A"].head()

Unnamed: 0_level_0,iid,HeUgMnzF,CaukPfUC,MzEtIdUF,gtnNTNam,SWoXNmPc,eXbOkwhI,OdXpbPGJ,XONDGWjH,KsFoQcUV,...,ukWqmeSS,qqVibbSA,MgCoFhXK,rFpoTXAq,RXcLsVAQ,rQWIpTiG,XizJGmbu,xqUooaNJ,poor,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80389,1,XJsPz,mOlYV,UFoKR,SSvEP,onRNG,YXCNt,4.0,oArAw,kpkiH,...,181,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,dSJoN,True,A
80389,2,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,4.0,ccbZA,HgfUG,...,141,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,True,A
80389,3,TRFeI,mOlYV,axSTs,CXizI,NDnCs,YXCNt,4.0,fOUHD,HgfUG,...,41,QQdHS,gCSRj,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,True,A
80389,4,XJsPz,yAyAe,FRcdT,CXizI,onRNG,YXCNt,,fOUHD,HgfUG,...,16,QQdHS,uEstx,Hikoa,zQvdC,jVHyH,GtHel,JTCKs,True,A
39883,1,XJsPz,mOlYV,UFoKR,HIvIU,onRNG,YXCNt,4.0,oArAw,kpkiH,...,381,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,UaIsy,False,A


In [18]:
a_hh_train_new.head()

Unnamed: 0_level_0,nb_members_hh,OdXpbPGJ_sum,ukWqmeSS_sum,HeUgMnzF_BNCcM_sum,HeUgMnzF_HUpWg_sum,HeUgMnzF_JMXQx_sum,HeUgMnzF_PAVsH_sum,HeUgMnzF_SJPkb_sum,HeUgMnzF_SlRmt_sum,HeUgMnzF_TRFeI_sum,...,rQWIpTiG_rkLqZ_sum,rQWIpTiG_xUYIC_sum,XizJGmbu_FUUXv_sum,XizJGmbu_GtHel_sum,XizJGmbu_juMSt_sum,xqUooaNJ_ALcKg_sum,xqUooaNJ_JTCKs_sum,xqUooaNJ_UaIsy_sum,xqUooaNJ_dSJoN_sum,xqUooaNJ_vhhVz_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80389,4,-0.671286,-0.534789,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,2.0,0.0,1.0,3.0,0.0,3.0,0.0,1.0,0.0
39883,3,-0.671286,1.927463,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,2.0,0.0,0.0,3.0,0.0,2.0,1.0,0.0,0.0
18327,8,-1.790096,0.019804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,2.0,0.0,6.0,0.0,3.0,0.0,5.0,0.0
88416,5,-0.895048,-0.273586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,1.0,1.0,3.0,0.0,4.0,1.0,0.0,0.0
74477,2,-0.447524,0.250062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0


### Data export

In [None]:
#/Users/manueltonneau/Dropbox/World_Bank_DS_competition/data

In [17]:
c_hh_train_new.head()

Unnamed: 0_level_0,nb_members_hh,XKQWlRjk_sum,vWNISgEA_sum,bsMfXBld_sum,XKyOwsRR_sum,CgAkQtOd_sum,OoqEwyJF_RuCZA_sum,OoqEwyJF_cEcbt_sum,cJPCnaAs_BZKME_sum,cJPCnaAs_DMxNA_sum,...,sCTSWhXf_dwXxj_sum,sCTSWhXf_yQhuJ_sum,rVneGwzn_QGHnL_sum,rVneGwzn_UXHpZ_sum,rVneGwzn_ldKFc_sum,rVneGwzn_xgpHA_sum,uVFOfrpa_DnIbO_sum,uVFOfrpa_kXobL_sum,uVFOfrpa_oacjJ_sum,uVFOfrpa_xRxWC_sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30639,12,3.768511,7.620859,-9.043022,-2.327783,-12.697007,1.0,6.0,0.0,0.0,...,0.0,7.0,4.0,0.0,1.0,2.0,2.0,1.0,0.0,4.0
45912,1,2.276249,-0.420516,0.40212,-0.710936,0.390934,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
42605,1,-0.009835,-0.420516,0.40212,-0.710936,0.390934,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
57416,8,-0.059009,0.639918,-0.622719,5.987685,0.294842,0.0,6.0,0.0,0.0,...,0.0,6.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,6.0
98400,2,1.531602,-0.841032,0.80424,-1.421872,0.781867,0.0,2.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0


In [59]:
for letter in ["A", "B", "C"]:
    hh_train_new[letter].to_csv(DATA_DIR+"\\"+str(letter)+"\\"+str(letter)+"_hhold_train_new2.csv")
    hh_test_new[letter].to_csv(DATA_DIR+"\\"+str(letter)+"\\"+str(letter)+"_hhold_test_new2.csv")