In [0]:
# General tools
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display
pd.options.display.max_columns = None

# Classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, Lasso, LassoCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.cluster import KMeans
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
import io
from sklearn.metrics import f1_score
from pprint import pprint


In [0]:
#Google COLAB file import
from google.colab import files
trainUpload = files.upload()

Saving train.csv to train (3).csv


In [0]:
#Google COLAB file import
from google.colab import files
testUpload = files.upload()

Saving test.csv to test (4).csv


In [0]:
# Setting up new dataframe
df_rent = pd.read_csv(io.BytesIO(trainUpload['train.csv']))

# Clean Training Data

In [0]:
educ_by_household = df_rent[df_rent.age >= 18].groupby('idhogar')['escolari'].mean()

def missing_educ(x, SQ=False):
    
    if SQ == False:
        col = 'meaneduc'
    else:
        col = 'SQBmeaned'
        
    # Find missing value ids
    missing_ids = x[x[col].isna()][col].keys()
    print(missing_ids)
    
    # Iterate over missing values and set them to correct value
    
    for i in missing_ids:
        if SQ == False:
            household = x.loc[i, 'idhogar']
            try:
              x.loc[i, col] = educ_by_household[household]
            except:
              x.loc[i, col] = 0
              
        else:
            avg = x.loc[i, 'meaneduc']
            try:
              x.loc[i, col] = avg ** 2
            except:
              x.loc[i, col] = 0
    
    return x
        
df_rent = missing_educ(df_rent)
df_rent.meaneduc.isna().sum()

Int64Index([1291, 1840, 1841, 2049, 2050], dtype='int64')


0

In [0]:


# Handling missing values
df_rent.v18q1.fillna(0, inplace=True)
df_rent.drop("rez_esc", axis=1, inplace=True)
df_rent = missing_educ(df_rent)
df_rent = missing_educ(df_rent, SQ=True)



Int64Index([], dtype='int64')
Int64Index([1291, 1840, 1841, 2049, 2050], dtype='int64')


In [0]:
# Handling ambiguous columns
df_rent = df_rent.drop(columns=['edjefe', 'edjefa', 'dependency'])

print("Missing values of explanatory variables:", df_rent.drop(columns='v2a1').isna().sum().sum())
print("Missing values of target variable (rent):", df_rent.v2a1.isna().sum())

# Remove missing values for target (rent)
df_rent_predict = df_rent.dropna()
print(df_rent_predict.shape)

### Classification Setup

# Partition explanatory and response variables
X = df_rent_predict.drop(columns=['v2a1', 'Id', 'idhogar']) # Dropping identifiers and target
print(X.shape)

y = df_rent_predict['v2a1']
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=12345)

### Classifiers

#### XGBoost

clf_xgb = xgb.XGBClassifier(max_depth=6,n_estimators=100, n_jobs=-1, subsample=.7)

clf_xgb.fit(X_train, y_train)

clf_xgb.score(X_test, y_test)

### Rent Prediction Results

#XGBoost gave the greatest accuracy in predicting rent values, so we will use it to fill missing rent values and apply our classifications again.

# Prepare data to fill in predicted values for rent
df_rent_nan = df_rent[df_rent.v2a1.isna()]

rent_pred = clf_xgb.predict(df_rent_nan.drop(columns=['v2a1', 'Id', 'idhogar']))

df_rent_nan['v2a1'] = pd.DataFrame(rent_pred).values

df_rent[df_rent.v2a1.isna()] = df_rent_nan

df_rent.isna().sum().sum()

Missing values of explanatory variables: 0
Missing values of target variable (rent): 6860
(2697, 139)
(2697, 136)
(2697,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0

In [0]:
df_rent

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,r4h3,r4m1,r4m2,r4m3,r4t1,r4t2,r4t3,tamhog,tamviv,escolari,hhsize,paredblolad,paredzocalo,paredpreb,pareddes,paredmad,paredzinc,paredfibras,paredother,pisomoscer,pisocemento,pisoother,pisonatur,pisonotiene,pisomadera,techozinc,techoentrepiso,techocane,techootro,cielorazo,abastaguadentro,abastaguafuera,abastaguano,public,planpri,noelec,coopele,sanitario1,sanitario2,sanitario3,sanitario5,sanitario6,energcocinar1,energcocinar2,energcocinar3,energcocinar4,elimbasu1,elimbasu2,elimbasu3,elimbasu4,elimbasu5,elimbasu6,epared1,epared2,epared3,etecho1,etecho2,etecho3,eviv1,eviv2,eviv3,dis,male,female,estadocivil1,estadocivil2,estadocivil3,estadocivil4,estadocivil5,estadocivil6,estadocivil7,parentesco1,parentesco2,parentesco3,parentesco4,parentesco5,parentesco6,parentesco7,parentesco8,parentesco9,parentesco10,parentesco11,parentesco12,idhogar,hogar_nin,hogar_adul,hogar_mayor,hogar_total,meaneduc,instlevel1,instlevel2,instlevel3,instlevel4,instlevel5,instlevel6,instlevel7,instlevel8,instlevel9,bedrooms,overcrowding,tipovivi1,tipovivi2,tipovivi3,tipovivi4,tipovivi5,computer,television,mobilephone,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,lugar6,area1,area2,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,0.0,0,1,1,0,0,0,0,1,1,1,1,10,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,21eb7fcc1,0,1,0,1,10.000000,0,0,0,1,0,0,0,0,0,1,1.000000,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,43,100,1849,1,100,0,1.000000,0.000000,100.000000,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,1,1,0,0,0,0,1,1,1,1,12,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0e5d7a658,0,1,1,1,12.000000,0,0,0,0,0,0,0,1,0,1,1.000000,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,67,144,4489,1,144,0,1.000000,64.000000,144.000000,4489,4
2,ID_68de51c94,100000.0,0,8,0,1,1,0,0.0,0,0,0,0,1,1,0,1,1,1,1,11,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,2c7317ea8,0,1,1,1,11.000000,0,0,0,0,1,0,0,0,0,2,0.500000,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,92,121,8464,1,0,0,0.250000,64.000000,121.000000,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,2,2,1,1,2,1,3,4,4,4,9,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,2b58d945f,2,2,0,4,11.000000,0,0,0,1,0,0,0,0,0,3,1.333333,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,1,0,17,81,289,16,121,4,1.777778,1.000000,121.000000,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,2,2,1,1,2,1,3,4,4,4,11,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2b58d945f,2,2,0,4,11.000000,0,0,0,0,1,0,0,0,0,3,1.333333,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,1,0,37,121,1369,16,121,4,1.777778,1.000000,121.000000,1369,4
5,ID_ec05b1a7b,180000.0,0,5,0,1,1,1,1.0,0,2,2,1,1,2,1,3,4,4,4,11,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2b58d945f,2,2,0,4,11.000000,0,0,0,0,1,0,0,0,0,3,1.333333,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,1,0,38,121,1444,16,121,4,1.777778,1.000000,121.000000,1444,4
6,ID_e9e0c1100,180000.0,0,5,0,1,1,1,1.0,0,2,2,1,1,2,1,3,4,4,4,2,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2b58d945f,2,2,0,4,11.000000,0,1,0,0,0,0,0,0,0,3,1.333333,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,1,0,8,4,64,16,121,4,1.777778,1.000000,121.000000,64,4
7,ID_3e04e571e,130000.0,1,2,0,1,1,0,0.0,0,1,1,2,1,3,2,2,4,4,4,0,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,d6dae86b7,2,2,0,4,10.000000,1,0,0,0,0,0,0,0,0,1,4.000000,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,7,0,49,16,81,4,16.000000,1.000000,100.000000,49,4
8,ID_1284f8aad,130000.0,1,2,0,1,1,0,0.0,0,1,1,2,1,3,2,2,4,4,4,9,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,d6dae86b7,2,2,0,4,10.000000,0,0,0,1,0,0,0,0,0,1,4.000000,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,30,81,900,16,81,4,16.000000,1.000000,100.000000,900,4
9,ID_51f52fdd2,130000.0,1,2,0,1,1,0,0.0,0,1,1,2,1,3,2,2,4,4,4,11,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,d6dae86b7,2,2,0,4,10.000000,0,0,0,0,1,0,0,0,0,1,4.000000,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,28,121,784,16,81,4,16.000000,1.000000,100.000000,784,4


In [0]:
# Partition explanatory and response variables
y = df_rent['Target']
print(y.shape)
X = df_rent.drop(columns=['Target', 'Id', 'idhogar']) # Dropping identifiers and target
# X = pd.DataFrame(df.meaneduc)
print(X.shape)
# X_trim_train = X[['meaneduc', 
#             'SQBmeaned', 
#             'SQBdependency', 
#             'SQBedjefe', 
#             'overcrowding', 
#             'SQBovercrowding', 
#             'qmobilephone', 
#             'SQBhogar_nin', 
#             'hogar_nin', 
#             'rooms']]


(9557,)
(9557, 136)


In [0]:
#X_trim_train.head()

In [0]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
#X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Train Best Random Model

In [0]:
#best_random = RandomForestClassifier(bootstrap=False, max_depth=50, max_features='auto', min_samples_leaf=1, min_samples_split=2, n_estimators=1000)

# best_random = RandomForestClassifier({'bootstrap': False,
#  'max_depth': 50,
#  'max_features': 'auto',
#  'min_samples_leaf': 1,
#  'min_samples_split': 2,
#  'n_estimators': 1000})

In [0]:
#best_random.fit(X_trim_train,y)

# Train Best Gridsearch Model

In [0]:
best_grid_search = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
best_grid_search.fit(X, y)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Clean Test Data

In [0]:
# Setting up new dataframe
df_test = pd.read_csv(io.BytesIO(testUpload['test.csv']))

In [0]:
educ_by_household = df_test[df_test.age >= 18].groupby('idhogar')['escolari'].mean()

def missing_educ(x, SQ=False):
    
    if SQ == False:
        col = 'meaneduc'
    else:
        col = 'SQBmeaned'
        
    # Find missing value ids
    missing_ids = x[x[col].isna()][col].keys()
    print(missing_ids)
    
    # Iterate over missing values and set them to correct value
    
    for i in missing_ids:
        if SQ == False:
            household = x.loc[i, 'idhogar']
            try:
              x.loc[i, col] = educ_by_household[household]
            except:
              x.loc[i, col] = 0
              
        else:
            avg = x.loc[i, 'meaneduc']
            try:
              x.loc[i, col] = avg ** 2
            except:
              x.loc[i, col] = 0
 

    return x
        
df_test = missing_educ(df_test)
df_test.meaneduc.isna().sum()

Int64Index([    4,   535,   536,   537,  2612,  2613,  6809,  6810,  7266,
             7267, 15808, 15809, 15810, 16096, 16097, 16291, 19421, 19985,
            21136, 21137, 21644, 21645, 21824, 21825, 21826, 21827, 22125,
            22126, 22336, 22337, 22338],
           dtype='int64')


0

In [0]:
# Handling missing values
df_test.v18q1.fillna(0, inplace=True)
df_test.drop("rez_esc", axis=1, inplace=True)
df_test = missing_educ(df_test)
df_test = missing_educ(df_test, SQ=True)

Int64Index([], dtype='int64')
Int64Index([    4,   535,   536,   537,  2612,  2613,  6809,  6810,  7266,
             7267, 15808, 15809, 15810, 16096, 16097, 16291, 19421, 19985,
            21136, 21137, 21644, 21645, 21824, 21825, 21826, 21827, 22125,
            22126, 22336, 22337, 22338],
           dtype='int64')


In [0]:

# Handling ambiguous columns
df_test = df_test.drop(columns=['edjefe', 'edjefa', 'dependency'])

print("Missing values of explanatory variables:", df_test.drop(columns='v2a1').isna().sum().sum())
print("Missing values of target variable (rent):", df_test.v2a1.isna().sum())

# Remove missing values for target (rent)
df_test_predict = df_test.dropna()
print(df_test_predict.shape)

### Classification Setup

# Partition explanatory and response variables
X = df_test_predict.drop(columns=['v2a1', 'Id', 'idhogar']) # Dropping identifiers and target
print(X.shape)

y = df_test_predict['v2a1']
print(y.shape)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=12345)

### Classifiers

#### XGBoost

#clf_xgb = xgb.XGBClassifier(max_depth=6,n_estimators=100, n_jobs=-1, subsample=.7)

#clf_xgb.fit(X_train, y_train)

#clf_xgb.score(X_test, y_test)

### Rent Prediction Results

#XGBoost gave the greatest accuracy in predicting rent values, so we will use it to fill missing rent values and apply our classifications again.

# Prepare data to fill in predicted values for rent
df_test_nan = df_test[df_test.v2a1.isna()]

test_pred = clf_xgb.predict(df_test_nan.drop(columns=['v2a1', 'Id', 'idhogar']))

df_test_nan['v2a1'] = pd.DataFrame(test_pred).values

df_test[df_test.v2a1.isna()] = df_test_nan

df_test.isna().sum().sum()

Missing values of explanatory variables: 0
Missing values of target variable (rent): 17403
(6453, 138)
(6453, 135)
(6453,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0

In [0]:
df_test = missing_educ(df_test, SQ=True)
df_test.SQBmeaned.isna().sum()

Int64Index([], dtype='int64')


0

In [0]:
print("Columns before:", df_test.shape[1])

print("Columns after:", df_test.shape[1])

Columns before: 138
Columns after: 138


In [0]:
df_test.shape

(23856, 138)

In [0]:
df_test.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,r4h3,r4m1,r4m2,r4m3,r4t1,r4t2,r4t3,tamhog,tamviv,escolari,hhsize,paredblolad,paredzocalo,paredpreb,pareddes,paredmad,paredzinc,paredfibras,paredother,pisomoscer,pisocemento,pisoother,pisonatur,pisonotiene,pisomadera,techozinc,techoentrepiso,techocane,techootro,cielorazo,abastaguadentro,abastaguafuera,abastaguano,public,planpri,noelec,coopele,sanitario1,sanitario2,sanitario3,sanitario5,sanitario6,energcocinar1,energcocinar2,energcocinar3,energcocinar4,elimbasu1,elimbasu2,elimbasu3,elimbasu4,elimbasu5,elimbasu6,epared1,epared2,epared3,etecho1,etecho2,etecho3,eviv1,eviv2,eviv3,dis,male,female,estadocivil1,estadocivil2,estadocivil3,estadocivil4,estadocivil5,estadocivil6,estadocivil7,parentesco1,parentesco2,parentesco3,parentesco4,parentesco5,parentesco6,parentesco7,parentesco8,parentesco9,parentesco10,parentesco11,parentesco12,idhogar,hogar_nin,hogar_adul,hogar_mayor,hogar_total,meaneduc,instlevel1,instlevel2,instlevel3,instlevel4,instlevel5,instlevel6,instlevel7,instlevel8,instlevel9,bedrooms,overcrowding,tipovivi1,tipovivi2,tipovivi3,tipovivi4,tipovivi5,computer,television,mobilephone,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,lugar6,area1,area2,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,ID_2f6873615,140000.0,0,5,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,0,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,16.5,1,0,0,0,0,0,0,0,0,2,1.5,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,4,0,16,9,0,1,2.25,0.25,272.25,16
1,ID_1c78846d2,300000.0,0,5,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,16,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,16.5,0,0,0,0,0,0,0,1,0,2,1.5,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,41,256,1681,9,0,1,2.25,0.25,272.25,1681
2,ID_e5442cf6a,300000.0,0,5,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,17,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,16.5,0,0,0,0,0,0,0,0,1,2,1.5,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,41,289,1681,9,0,1,2.25,0.25,272.25,1681
3,ID_a8db26a79,513485.0,0,14,0,1,1,1,1.0,0,1,1,0,0,0,0,1,1,1,1,16,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5b598fbc9,0,1,0,1,16.0,0,0,0,0,0,0,0,1,0,1,1.0,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,59,256,3481,1,256,0,1.0,0.0,256.0,3481
4,ID_a62966799,175000.0,0,4,0,1,1,1,1.0,0,0,0,0,1,1,0,1,1,1,1,11,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1e2fc704e,1,0,0,1,11.0,0,0,0,0,1,0,0,0,0,2,0.5,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,18,121,324,1,0,1,0.25,64.0,121.0,324


In [0]:
# Partition explanatory and response variables
Id = df_test.Id
X_test = df_test.drop(columns=['Id','idhogar']) # Dropping identifiers and target
# X_trim_test = X[['meaneduc', 
#             'SQBmeaned', 
#             'SQBdependency', 
#             'SQBedjefe', 
#             'overcrowding', 
#             'SQBovercrowding', 
#             'qmobilephone', 
#             'SQBhogar_nin', 
#             'hogar_nin', 
#             'rooms']] 
# X = pd.DataFrame(df.meaneduc)
#print(X_kaggle_test.shape)

#y_kaggle_test = kaggle_test_df['Target']
#print(y.shape)
#X_kaggle_test

In [0]:
#random
#Target = best_random.predict(X_trim_test)

#grid_search
Target = best_grid_search.predict(X_test)


In [0]:
Id = pd.Series(Id) 

In [0]:
Target = pd.Series(Target)
Target

0        4
1        4
2        4
3        4
4        4
5        4
6        4
7        4
8        4
9        4
10       4
11       4
12       4
13       4
14       4
15       4
16       4
17       4
18       4
19       4
20       4
21       4
22       4
23       4
24       4
25       4
26       4
27       4
28       4
29       4
        ..
23826    2
23827    2
23828    2
23829    2
23830    4
23831    4
23832    4
23833    4
23834    2
23835    2
23836    2
23837    2
23838    2
23839    2
23840    2
23841    2
23842    2
23843    4
23844    4
23845    4
23846    2
23847    2
23848    2
23849    2
23850    2
23851    2
23852    2
23853    2
23854    2
23855    2
Length: 23856, dtype: int64

In [0]:
kaggle_submit = pd.concat([Id, Target], axis=1)

In [0]:
kaggle_submit.columns = ['Id', 'Target']

In [0]:
kaggle_submit

Unnamed: 0,Id,Target
0,ID_2f6873615,4
1,ID_1c78846d2,4
2,ID_e5442cf6a,4
3,ID_a8db26a79,4
4,ID_a62966799,4
5,ID_e77d38d45,4
6,ID_3c5f4bd51,4
7,ID_a849c29bd,4
8,ID_472fa82da,4
9,ID_24864adcc,4
