# Random Forest Tuning




## Imports




In [0]:
# General tools
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import display
pd.options.display.max_columns = None

# Classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, Lasso, LassoCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.cluster import KMeans
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
import io
from sklearn.metrics import f1_score
from pprint import pprint



In [0]:
#Google COLAB file import
from google.colab import files
trainUpload = files.upload()

Saving train.csv to train.csv


In [0]:
#Google COLAB file import
from google.colab import files
testUpload = files.upload()

Saving test.csv to test.csv


# Cleaning

In [0]:
df = pd.read_csv(io.BytesIO(trainUpload['train.csv']))


In [0]:
# 5 columns with NaN values
cols_nan = list(df.columns[df.isna().any()])
cols_nan

['v2a1', 'v18q1', 'rez_esc', 'meaneduc', 'SQBmeaned']

In [0]:
# Total number of missing values
df[cols_nan].isna().sum()

v2a1         6860
v18q1        7342
rez_esc      7928
meaneduc        5
SQBmeaned       5
dtype: int64

In [0]:
# Proportion of missing values
df[cols_nan].isna().sum() / df.shape[0]

v2a1         0.717798
v18q1        0.768233
rez_esc      0.829549
meaneduc     0.000523
SQBmeaned    0.000523
dtype: float64

In [0]:
df.drop('v2a1', axis=1, inplace=True)

In [0]:
df.v18q1.fillna(0, inplace=True)


In [0]:
df.v18q1.fillna(0, inplace=True)

In [0]:
df.drop("rez_esc", axis=1, inplace=True)

In [0]:
educ_by_household = df[df.age >= 18].groupby('idhogar')['escolari'].mean()

def missing_educ(x, SQ=False):
    
    if SQ == False:
        col = 'meaneduc'
    else:
        col = 'SQBmeaned'
        
    # Find missing value ids
    missing_ids = x[x[col].isna()][col].keys()
    
    # Iterate over missing values and set them to correct value
    for i in missing_ids:
        if SQ == False:
            household = x.loc[i, 'idhogar']
            x.loc[i, col] = educ_by_household[household]
        else:
            avg = x.loc[i, 'meaneduc']
            x.loc[i, col] = avg ** 2
            
    return x
        
df = missing_educ(df)
df.meaneduc.isna().sum()

0

In [0]:
df = missing_educ(df, SQ=True)
df.SQBmeaned.isna().sum()

0

In [0]:
df.isna().sum().sum()

0

In [0]:
print("Columns before:", df.shape[1])
df = df.drop(columns=['edjefe', 'edjefa', 'dependency'])
print("Columns after:", df.shape[1])

Columns before: 141
Columns after: 138


In [0]:
col_names = list(df.columns.values)
squared_cols = []

# Get only columns starting with "SQB"
for col in col_names:
    if "SQB" in col:
        squared_cols.append(col)
squared_cols

['SQBescolari',
 'SQBage',
 'SQBhogar_total',
 'SQBedjefe',
 'SQBhogar_nin',
 'SQBovercrowding',
 'SQBdependency',
 'SQBmeaned']

In [0]:
non_squared_cols = []

# Get col names of SQB non-squared counterparts
for col in squared_cols:
    if col == "SQBmeaned":
        non_squared_cols.append(col[3:] + "uc")
    else:
        non_squared_cols.append(col[3:])
non_squared_cols

['escolari',
 'age',
 'hogar_total',
 'edjefe',
 'hogar_nin',
 'overcrowding',
 'dependency',
 'meaneduc']

In [0]:
df.shape

(9557, 138)

In [0]:
# Partition explanatory and response variables
X = df.drop(columns=['Target', 'Id', 'idhogar']) # Dropping identifiers and target
# X = pd.DataFrame(df.meaneduc)
print(X.shape)

y = df['Target']
print(y.shape)

(9557, 135)
(9557,)


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7645, 135), (1912, 135), (7645,), (1912,))

# Random Forest Classifier (First Run Full Model)

In [0]:
clf_RF = RandomForestClassifier()

In [0]:
clf_RF.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
y_pred_RF = clf_RF.predict(X_test)
clf_RF.score(X_test, y_test)

0.9079497907949791

In [0]:
# test on our test data
f1_score(y_test, y_pred_RF, average='macro')

0.8620346260585411

# Random Search Training

In [0]:

pprint(clf_RF.get_params())

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [0]:
from sklearn.model_selection import RandomizedSearchCV

In [0]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [0]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [0]:
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [0]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = clf_RF, 
                               param_distributions = random_grid, n_iter = 100, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [0]:
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 42.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=10,
                                                    n_jobs=None,
  

In [0]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [0]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)


Model Performance
Average Error: 0.1512 degrees.
Accuracy = 92.29%.


In [0]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Average Error: 0.0753 degrees.
Accuracy = 95.76%.


In [0]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 3.75%.


In [0]:
y_pred_best_RF = best_random.predict(X_test)
clf_RF.score(X_test, y_test)

0.9079497907949791

In [0]:
# test on our test data
f1_score(y_test, y_pred_best_RF, average='macro')

0.9217463207378762

# Grid Search Training

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
grid_search = GridSearchCV(estimator = clf_RF, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [0]:
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 39.3min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 44.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='wa

In [0]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
feature_importances = pd.DataFrame(grid_search.best_estimator_.feature_importances_,
                                   index = X_test.columns,
                                   columns=['importance']).sort_values('importance', 
                                   ascending=False)
feature_importances

Unnamed: 0,importance
meaneduc,0.054239
SQBmeaned,0.053740
SQBdependency,0.035577
SQBedjefe,0.028174
overcrowding,0.027343
SQBovercrowding,0.025996
qmobilephone,0.025854
SQBhogar_nin,0.023966
hogar_nin,0.022902
rooms,0.021951


# Test on Kaggle Sample Test Data

In [0]:
kaggle_test_df = pd.read_csv(io.BytesIO(testUpload['test.csv']))

In [0]:
# 5 columns with NaN values
cols_nan = list(kaggle_test_df.columns[kaggle_test_df.isna().any()])
cols_nan

['v2a1', 'v18q1', 'rez_esc', 'meaneduc', 'SQBmeaned']

In [0]:
# Total number of missing values
kaggle_test_df[cols_nan].isna().sum()

v2a1         17403
v18q1        18126
rez_esc      19653
meaneduc        31
SQBmeaned       31
dtype: int64

In [0]:
kaggle_test_df.drop('v2a1', axis=1, inplace=True)

In [0]:
kaggle_test_df.v18q1.fillna(0, inplace=True)

In [0]:
kaggle_test_df.drop("rez_esc", axis=1, inplace=True)

In [0]:
educ_by_household = kaggle_test_df[kaggle_test_df.age >= 18].groupby('idhogar')['escolari'].mean()

def missing_educ(x, SQ=False):
    
    if SQ == False:
        col = 'meaneduc'
    else:
        col = 'SQBmeaned'
        
    # Find missing value ids
    missing_ids = x[x[col].isna()][col].keys()
    print(missing_ids)
    
    # Iterate over missing values and set them to correct value
    
    for i in missing_ids:
        if SQ == False:
            household = x.loc[i, 'idhogar']
            try:
              x.loc[i, col] = educ_by_household[household]
            except:
              x.loc[i, col] = 0
              
        else:
            avg = x.loc[i, 'meaneduc']
            try:
              x.loc[i, col] = avg ** 2
            except:
              x.loc[i, col] = 0
    
    return x
        
kaggle_test_df = missing_educ(kaggle_test_df)
kaggle_test_df.meaneduc.isna().sum()

Int64Index([    4,   535,   536,   537,  2612,  2613,  6809,  6810,  7266,
             7267, 15808, 15809, 15810, 16096, 16097, 16291, 19421, 19985,
            21136, 21137, 21644, 21645, 21824, 21825, 21826, 21827, 22125,
            22126, 22336, 22337, 22338],
           dtype='int64')


0

In [0]:
kaggle_test_df = missing_educ(kaggle_test_df, SQ=True)
kaggle_test_df.SQBmeaned.isna().sum()

Int64Index([    4,   535,   536,   537,  2612,  2613,  6809,  6810,  7266,
             7267, 15808, 15809, 15810, 16096, 16097, 16291, 19421, 19985,
            21136, 21137, 21644, 21645, 21824, 21825, 21826, 21827, 22125,
            22126, 22336, 22337, 22338],
           dtype='int64')


0

In [0]:
kaggle_test_df.isna().sum()

Id                 0
hacdor             0
rooms              0
hacapo             0
v14a               0
refrig             0
v18q               0
v18q1              0
r4h1               0
r4h2               0
r4h3               0
r4m1               0
r4m2               0
r4m3               0
r4t1               0
r4t2               0
r4t3               0
tamhog             0
tamviv             0
escolari           0
hhsize             0
paredblolad        0
paredzocalo        0
paredpreb          0
pareddes           0
paredmad           0
paredzinc          0
paredfibras        0
paredother         0
pisomoscer         0
                  ..
instlevel9         0
bedrooms           0
overcrowding       0
tipovivi1          0
tipovivi2          0
tipovivi3          0
tipovivi4          0
tipovivi5          0
computer           0
television         0
mobilephone        0
qmobilephone       0
lugar1             0
lugar2             0
lugar3             0
lugar4             0
lugar5       

In [0]:
print("Columns before:", kaggle_test_df.shape[1])
kaggle_test_df = kaggle_test_df.drop(columns=['edjefe', 'edjefa', 'dependency'])
print("Columns after:", kaggle_test_df.shape[1])

Columns before: 140
Columns after: 137


In [0]:

kaggle_test_df.loc[kaggle_test_df['idhogar'] == 'c31f9f3a0']

Unnamed: 0,Id,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,r4h3,r4m1,r4m2,r4m3,r4t1,r4t2,r4t3,tamhog,tamviv,escolari,hhsize,paredblolad,paredzocalo,paredpreb,pareddes,paredmad,paredzinc,paredfibras,paredother,pisomoscer,pisocemento,pisoother,pisonatur,pisonotiene,pisomadera,techozinc,techoentrepiso,techocane,techootro,cielorazo,abastaguadentro,abastaguafuera,abastaguano,public,planpri,noelec,coopele,sanitario1,sanitario2,sanitario3,sanitario5,sanitario6,energcocinar1,energcocinar2,energcocinar3,energcocinar4,elimbasu1,elimbasu2,elimbasu3,elimbasu4,elimbasu5,elimbasu6,epared1,epared2,epared3,etecho1,etecho2,etecho3,eviv1,eviv2,eviv3,dis,male,female,estadocivil1,estadocivil2,estadocivil3,estadocivil4,estadocivil5,estadocivil6,estadocivil7,parentesco1,parentesco2,parentesco3,parentesco4,parentesco5,parentesco6,parentesco7,parentesco8,parentesco9,parentesco10,parentesco11,parentesco12,idhogar,hogar_nin,hogar_adul,hogar_mayor,hogar_total,meaneduc,instlevel1,instlevel2,instlevel3,instlevel4,instlevel5,instlevel6,instlevel7,instlevel8,instlevel9,bedrooms,overcrowding,tipovivi1,tipovivi2,tipovivi3,tipovivi4,tipovivi5,computer,television,mobilephone,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,lugar6,area1,area2,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
16291,ID_b3909c11b,0,4,0,1,1,0,0.0,0,1,1,0,0,0,0,1,1,1,1,9,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,c31f9f3a0,1,0,0,1,0.0,0,0,0,1,0,0,0,0,0,2,0.5,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,17,81,289,1,81,1,0.25,64.0,0.0,289


In [0]:
kaggle_test_df.shape

(23856, 137)

In [0]:
kaggle_test_df.head()

Unnamed: 0,Id,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,r4h3,r4m1,r4m2,r4m3,r4t1,r4t2,r4t3,tamhog,tamviv,escolari,hhsize,paredblolad,paredzocalo,paredpreb,pareddes,paredmad,paredzinc,paredfibras,paredother,pisomoscer,pisocemento,pisoother,pisonatur,pisonotiene,pisomadera,techozinc,techoentrepiso,techocane,techootro,cielorazo,abastaguadentro,abastaguafuera,abastaguano,public,planpri,noelec,coopele,sanitario1,sanitario2,sanitario3,sanitario5,sanitario6,energcocinar1,energcocinar2,energcocinar3,energcocinar4,elimbasu1,elimbasu2,elimbasu3,elimbasu4,elimbasu5,elimbasu6,epared1,epared2,epared3,etecho1,etecho2,etecho3,eviv1,eviv2,eviv3,dis,male,female,estadocivil1,estadocivil2,estadocivil3,estadocivil4,estadocivil5,estadocivil6,estadocivil7,parentesco1,parentesco2,parentesco3,parentesco4,parentesco5,parentesco6,parentesco7,parentesco8,parentesco9,parentesco10,parentesco11,parentesco12,idhogar,hogar_nin,hogar_adul,hogar_mayor,hogar_total,meaneduc,instlevel1,instlevel2,instlevel3,instlevel4,instlevel5,instlevel6,instlevel7,instlevel8,instlevel9,bedrooms,overcrowding,tipovivi1,tipovivi2,tipovivi3,tipovivi4,tipovivi5,computer,television,mobilephone,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,lugar6,area1,area2,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,ID_2f6873615,0,5,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,0,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,16.5,1,0,0,0,0,0,0,0,0,2,1.5,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,4,0,16,9,0,1,2.25,0.25,272.25,16
1,ID_1c78846d2,0,5,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,16,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,16.5,0,0,0,0,0,0,0,1,0,2,1.5,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,41,256,1681,9,0,1,2.25,0.25,272.25,1681
2,ID_e5442cf6a,0,5,0,1,1,0,0.0,1,1,2,0,1,1,1,2,3,3,3,17,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,72958b30c,1,2,0,3,16.5,0,0,0,0,0,0,0,0,1,2,1.5,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,41,289,1681,9,0,1,2.25,0.25,272.25,1681
3,ID_a8db26a79,0,14,0,1,1,1,1.0,0,1,1,0,0,0,0,1,1,1,1,16,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5b598fbc9,0,1,0,1,16.0,0,0,0,0,0,0,0,1,0,1,1.0,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,1,0,59,256,3481,1,256,0,1.0,0.0,256.0,3481
4,ID_a62966799,0,4,0,1,1,1,1.0,0,0,0,0,1,1,0,1,1,1,1,11,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1e2fc704e,1,0,0,1,11.0,0,0,0,0,1,0,0,0,0,2,0.5,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,18,121,324,1,0,1,0.25,64.0,121.0,324


In [0]:
# Partition explanatory and response variables
Id = kaggle_test_df.Id
X_kaggle_test = kaggle_test_df.drop(columns=['Id','idhogar']) # Dropping identifiers and target


In [0]:
Target = grid_search.predict(X_kaggle_test)

In [0]:
Id = pd.Series(Id) 

In [0]:
Target = pd.Series(Target)
Target

0        4
1        4
2        4
3        4
4        4
5        4
6        4
7        4
8        4
9        4
10       4
11       4
12       4
13       4
14       4
15       4
16       4
17       4
18       4
19       4
20       4
21       4
22       4
23       4
24       4
25       4
26       4
27       4
28       4
29       4
        ..
23826    2
23827    4
23828    4
23829    4
23830    4
23831    4
23832    2
23833    2
23834    2
23835    2
23836    2
23837    2
23838    2
23839    2
23840    2
23841    2
23842    2
23843    4
23844    4
23845    4
23846    3
23847    2
23848    3
23849    3
23850    2
23851    3
23852    4
23853    4
23854    4
23855    4
Length: 23856, dtype: int64

In [0]:
kaggle_submit = pd.concat([Id, Target], axis=1)

In [0]:
kaggle_submit.columns = ['Id', 'Target']

In [0]:
kaggle_submit

Unnamed: 0,Id,Target
0,ID_2f6873615,4
1,ID_1c78846d2,4
2,ID_e5442cf6a,4
3,ID_a8db26a79,4
4,ID_a62966799,4
5,ID_e77d38d45,4
6,ID_3c5f4bd51,4
7,ID_a849c29bd,4
8,ID_472fa82da,4
9,ID_24864adcc,4
