In [3]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
# Plot styling: white grid background with the pastel colour cycle.
sns.set_style('whitegrid')
# color_palette() only *returns* a palette (the value was being discarded);
# set_palette() is the call that installs it as the default colour cycle.
sns.set_palette('pastel')
%matplotlib inline
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import KFold, cross_val_score, train_test_split
import xgboost as xgb
#Settings
# Disable pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): this hides the warning, it does not make chained assignment safe.
pd.options.mode.chained_assignment = None
# Allow up to 1000 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 1000)

In [4]:
# Load the training features, the test features and the training labels.
# NOTE(review): the paths are hard-coded under the user's home directory.
train_data = pd.read_csv('~/PYTHON/DrivenData/water_pumps/train_data.csv')
test_data = pd.read_csv('~/PYTHON/DrivenData/water_pumps/test_data.csv')
train_target = pd.read_csv('~/PYTHON/DrivenData/water_pumps/train_target.csv')

In [5]:
# Sanity check: (rows, columns) of each loaded frame.
print(test_data.shape)
print(train_data.shape)
print(train_target.shape)

(14850, 40)
(59400, 40)
(59400, 2)


In [6]:
#Functions

def binary(x):
    """Encode a boolean-like flag as an integer.

    Returns 1 when ``x`` equals ``True`` or the string ``'True'``, 0 when it
    equals ``False`` or the string ``'False'``, and ``None`` for anything else
    (for example a missing value).
    """
    if x in (True, 'True'):
        return 1
    if x in (False, 'False'):
        return 0
    return None


def funder_cat(x):
    """Bucket a funder's record count into a size label 's1' .. 's9'.

    Buckets (inclusive upper bounds): <=1, <=5, <=10, <=20, <=50, <=100,
    <=150, <=200, and everything above 200.  A value that compares false
    against every bound (e.g. NaN) yields ``None``.
    """
    upper_bounds = (
        (1, 's1'), (5, 's2'), (10, 's3'), (20, 's4'),
        (50, 's5'), (100, 's6'), (150, 's7'), (200, 's8'),
    )
    # Bounds are ascending, so the first one that fits is the right bucket.
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    if x > 200:
        return 's9'
    return None

def installer_cat(x):
    """Bucket an installer's record count into a size label 'i1' .. 'i9'.

    Buckets (inclusive upper bounds): <=1, <=5, <=10, <=20, <=50, <=100,
    <=150, <=200, and everything above 200.  A value that compares false
    against every bound (e.g. NaN) yields ``None``.
    """
    upper_bounds = (
        (1, 'i1'), (5, 'i2'), (10, 'i3'), (20, 'i4'),
        (50, 'i5'), (100, 'i6'), (150, 'i7'), (200, 'i8'),
    )
    # Bounds are ascending, so the first one that fits is the right bucket.
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    if x > 200:
        return 'i9'
    return None


def funder_clean(data):
    """Replace the ``funder`` column with a ``funder_size`` bucket.

    Each funder is counted (rows per funder, via ``pivot_table`` on ``id``),
    the count is bucketed with ``funder_cat`` and the label is mapped back
    onto every row.  Rows whose funder is missing get the string 'None'.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``funder`` and ``id``.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``funder`` and with ``funder_size`` appended as the
        last column.
    """
    funder_piv = data.pivot_table(index='funder', values='id', aggfunc=len)
    # .ravel() flattens the counts whether pivot_table returned a Series
    # or a single-column DataFrame.
    funder_map = {
        funder: funder_cat(count)
        for funder, count in zip(funder_piv.index, funder_piv.values.ravel())
    }
    # Build the result on a fresh frame instead of writing into the caller's
    # frame, and avoid chained ``inplace`` fillna, which may act on a copy.
    cleaned = data.drop(['funder'], axis=1)
    cleaned['funder_size'] = data['funder'].map(funder_map).fillna('None')
    return cleaned


def installer_clean(data):
    """Replace the ``installer`` column with an ``installer_size`` bucket.

    Each installer is counted (rows per installer, via ``pivot_table`` on
    ``id``), the count is bucketed with ``installer_cat`` and the label is
    mapped back onto every row.  Rows whose installer is missing get the
    string 'None'.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``installer`` and ``id``.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``installer`` and with ``installer_size`` appended
        as the last column.
    """
    installer_piv = data.pivot_table(index='installer', values=['id'], aggfunc=len)
    # .ravel() flattens the single-column count table to one count per installer.
    installer_map = {
        installer: installer_cat(count)
        for installer, count in zip(installer_piv.index, installer_piv.values.ravel())
    }
    # Build the result on a fresh frame instead of writing into the caller's
    # frame, and avoid chained ``inplace`` fillna, which may act on a copy.
    cleaned = data.drop(['installer'], axis=1)
    cleaned['installer_size'] = data['installer'].map(installer_map).fillna('None')
    return cleaned


def date_recorded_year(x):
    """Return the year (int) of a date string formatted as ``YYYY-MM-DD``.

    Raises ``ValueError`` (from ``strptime``) if ``x`` does not match that format.
    """
    parsed = datetime.datetime.strptime(x, '%Y-%m-%d')
    return parsed.year

In [7]:
# Major Clean Function
def rf_prepare_data(data_input):
    """Clean a raw water-pump feature frame for the random-forest model.

    Works on a copy, so ``data_input`` is left untouched and the function can
    safely be called more than once on the same frame.

    Steps
    -----
    * ``public_meeting``: missing -> False, then encoded 1/0 as
      ``public_meeting_binary``.
    * ``scheme_management``: missing -> the string 'None'.
    * ``funder`` / ``installer``: replaced by ``funder_size`` /
      ``installer_size`` (see ``funder_clean`` / ``installer_clean``).
    * ``permit``: every value that is neither True nor False -> False, then
      encoded 1/0 as ``permit_binary``.
    * ``date_recorded``: reduced to its year as ``date_recorded_year``.
    * The columns listed in ``dropped_columns`` are removed.
    * ``operational_years`` = ``date_recorded_year - construction_year``;
      differences above 500 or below 0 are replaced by 0.

    ``num_private``, ``longitude``/``latitude``, ``date_recorded_year`` and
    ``construction_year`` are deliberately kept.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Raw feature frame containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (new object).
    """
    # Copy first: the original aliased the input and mutated the caller's frame.
    data = data_input.copy()

    ## Handling Null Values
    # public_meeting NaN filled to False and encoded
    data['public_meeting_binary'] = data['public_meeting'].fillna(False).apply(binary)

    # scheme_management NaN filled to 'None' string
    data['scheme_management'] = data['scheme_management'].fillna('None')

    # funder / installer replaced by size buckets
    data = funder_clean(data)
    data = installer_clean(data)

    # permit: anything that is not exactly True/False (e.g. NaN) becomes False.
    # Element-wise comparison is intended here; .loc avoids chained assignment.
    not_boolean = (data['permit'] != False) & (data['permit'] != True)
    data.loc[not_boolean, 'permit'] = False
    data['permit_binary'] = data['permit'].apply(binary)

    ## Handling other data
    # date_recorded_year extract
    data['date_recorded_year'] = data['date_recorded'].apply(date_recorded_year)

    # Columns removed from the model input (sources of the derived columns
    # above plus the columns the original pipeline dropped one by one).
    dropped_columns = [
        'scheme_name', 'public_meeting', 'permit', 'subvillage', 'date_recorded',
        'wpt_name', 'region', 'lga', 'ward', 'recorded_by',
        'extraction_type', 'extraction_type_group', 'management', 'payment',
        'water_quality', 'quantity', 'source', 'source_class', 'waterpoint_type',
    ]
    data = data.drop(dropped_columns, axis=1)

    # Operational Years Feature Engineering
    # Implausible spans (> 500, e.g. when construction_year is 0, or negative)
    # are reset to 0; missing values are left as they are.
    operational_years = data['date_recorded_year'] - data['construction_year']
    operational_years[(operational_years > 500) | (operational_years < 0)] = 0
    data['operational_years'] = operational_years

    return data

In [8]:
def transform_data(data):
    """Return a copy of ``data`` with skewed count columns log-transformed.

    ``amount_tsh``, ``operational_years`` and ``population`` are replaced by
    ``log(x + 1)``.  The input frame is left unchanged, so the transformed and
    untransformed frames can be compared and the cell can be re-run without
    applying the logarithm twice.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns transformed.
    """
    transformed = data.copy()
    for column in ('amount_tsh', 'operational_years', 'population'):
        # +1 keeps zero values finite under the logarithm.
        transformed[column] = np.log(transformed[column] + 1)
    return transformed

In [9]:
# Read the training features again (same file as train_data) into a separate
# frame that is handed to the random-forest preparation step.
rf_data = pd.read_csv('~/PYTHON/DrivenData/water_pumps/train_data.csv')

In [10]:
# Clean the training features and take the label column as the target vector.
prep_rf = rf_prepare_data(rf_data)
rf_targets = train_target.status_group

In [11]:
# Label-encode the remaining string-valued columns: every distinct value is
# replaced by its integer category code (missing values become -1).
iter_ = ['basin','scheme_management','extraction_type_class','management_group',
        'payment_type','quality_group','quantity_group','source_type','waterpoint_type_group',
        'funder_size','installer_size']

for idx in iter_:
    # Categorical.from_array has been removed from pandas; the plain
    # constructor infers the categories from the values in the same way.
    col_ = pd.Categorical(prep_rf[idx])
    prep_rf[idx]= col_.codes

# Show a preview rather than dumping the whole frame.
prep_rf.head(10)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,basin,region_code,district_code,population,scheme_management,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,waterpoint_type_group,public_meeting_binary,funder_size,installer_size,permit_binary,date_recorded_year,operational_years
0,69572,6000.0,1390,34.938093,-9.856322e+00,0,1,11,5,109,7,1999,0,4,0,2,1,6,1,1,9,6,0,2011,12
1,8776,0.0,1399,34.698766,-2.147466e+00,0,4,20,2,280,2,2010,0,4,2,2,2,3,1,0,6,5,1,2013,3
2,34310,25.0,686,37.460664,-3.821329e+00,0,5,21,4,250,7,2009,0,4,5,2,1,1,1,1,3,9,1,2013,4
3,67743,0.0,263,38.486161,-1.115530e+01,0,7,90,63,58,7,1986,5,4,2,2,0,0,1,1,9,9,1,2013,27
4,19728,0.0,0,31.130847,-1.825359e+00,0,4,18,1,0,1,0,0,1,2,2,3,3,1,1,1,7,1,2011,0
5,9944,20.0,0,39.172796,-4.765587e+00,0,5,4,8,1,7,2009,5,4,5,4,1,2,1,1,7,9,1,2011,2
6,19816,0.0,0,33.362410,-3.766365e+00,0,0,17,3,0,7,0,1,4,2,2,1,0,3,1,9,8,1,2012,0
7,54551,0.0,0,32.620617,-4.226198e+00,0,3,17,3,0,1,0,1,4,6,3,1,5,3,1,9,9,1,2012,0
8,53934,0.0,0,32.711100,-5.146712e+00,0,3,14,6,0,7,0,1,4,2,4,3,0,3,1,9,5,1,2012,0
9,46144,0.0,0,30.626991,-1.257051e+00,0,4,18,1,0,1,0,1,4,2,2,1,5,3,1,4,7,1,2011,0


In [12]:
# The row identifier is not a feature.  errors='ignore' keeps the cell
# re-runnable once the column has already been removed.
prep_rf.drop(['id'],axis=1,inplace=True,errors='ignore')
prep_rf.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region_code,district_code,population,scheme_management,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,waterpoint_type_group,public_meeting_binary,funder_size,installer_size,permit_binary,date_recorded_year,operational_years
0,6000.0,1390,34.938093,-9.856322,0,1,11,5,109,7,1999,0,4,0,2,1,6,1,1,9,6,0,2011,12
1,0.0,1399,34.698766,-2.147466,0,4,20,2,280,2,2010,0,4,2,2,2,3,1,0,6,5,1,2013,3
2,25.0,686,37.460664,-3.821329,0,5,21,4,250,7,2009,0,4,5,2,1,1,1,1,3,9,1,2013,4
3,0.0,263,38.486161,-11.155298,0,7,90,63,58,7,1986,5,4,2,2,0,0,1,1,9,9,1,2013,27
4,0.0,0,31.130847,-1.825359,0,4,18,1,0,1,0,0,1,2,2,3,3,1,1,1,7,1,2011,0


In [13]:
# Fit a 100-tree random forest on the full training set and report its
# accuracy on that same data (an in-sample figure, not a generalisation estimate).
rf_model = RandomForestClassifier(random_state = 123, n_estimators = 100)
rf_model.fit(prep_rf, rf_targets)
predictions = rf_model.predict(prep_rf)
# Fraction of rows whose predicted label equals the true label.
print(len(rf_targets[rf_targets==predictions])/len(rf_targets))

0.9957070707070707


In [14]:
# 10-fold, shuffled, seeded cross-validation of the same forest.
# NOTE(review): KFold(n, n_folds=...) is the signature of the
# sklearn.cross_validation module imported at the top of the notebook.
kf = KFold(prep_rf.shape[0], n_folds=10,shuffle=True,random_state=123)
cvs = cross_val_score(rf_model,prep_rf, rf_targets,cv=kf)

In [15]:
# Mean cross-validated score, followed by the individual fold scores.
print(np.mean(cvs))
cvs

0.804797979798


array([ 0.80808081,  0.80959596,  0.79983165,  0.80084175,  0.80622896,
        0.80488215,  0.81649832,  0.796633  ,  0.80117845,  0.80420875])

In [16]:
# Apply the same cleaning and label encoding to the test features.
test_prep_rf = rf_prepare_data(test_data)
iter_ = ['basin','scheme_management','extraction_type_class','management_group',
        'payment_type','quality_group','quantity_group','source_type','waterpoint_type_group',
        'funder_size','installer_size']

# NOTE(review): the category codes are derived from the test values alone, so
# they line up with the training codes only if both frames contain the same
# set of distinct values in every encoded column -- verify before trusting
# predictions.
for idx in iter_:
    # Categorical.from_array has been removed from pandas; the plain
    # constructor infers the categories from the values in the same way.
    col_ = pd.Categorical(test_prep_rf[idx])
    test_prep_rf[idx]= col_.codes
# errors='ignore' keeps the cell re-runnable once 'id' has been removed.
test_prep_rf.drop(['id'],axis=1,inplace=True,errors='ignore')
test_prep_rf.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region_code,district_code,population,scheme_management,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,waterpoint_type_group,public_meeting_binary,funder_size,installer_size,permit_binary,date_recorded_year,operational_years
0,0.0,1996,35.290799,-4.059696,0,0,21,3,321,3,2012,3,2,2,2,3,3,5,1,6,5,1,2013,1
1,0.0,1569,36.656709,-3.309214,0,5,2,2,300,7,2000,0,4,2,2,2,6,1,1,9,9,1,2013,13
2,0.0,1567,34.767863,-5.004344,0,0,13,2,500,7,2010,3,4,2,2,2,3,5,1,0,0,0,2013,3
3,0.0,267,38.058046,-9.418672,0,7,80,43,250,7,1987,3,4,6,2,0,5,5,0,3,3,1,2013,26
4,500.0,1260,35.006123,-10.950412,0,7,10,3,60,10,2000,0,4,1,2,1,6,1,0,3,3,1,2013,13


In [33]:
# Debug inspection: show the Categorical left in `col_` by the most recently
# run encoding loop (this relies on the loop variable still being in scope).
col_

[i5, i9, None, i3, i3, ..., i6, i2, None, i9, i9]
Length: 14850
Categories (10, object): [None, i1, i2, i3, ..., i6, i7, i8, i9]

In [20]:
def iterate_rf(iterations,X_train,Y_train,X_final,random_state=None):
    """Train several random forests and collect their predictions.

    The training data is split once into a fit part (67%) and a hold-out part
    (33%) with a fixed seed.  For each iteration a 100-tree forest is fitted
    on the fit part, scored on the hold-out part, and used to predict
    ``X_final``.

    Parameters
    ----------
    iterations : int
        Number of forests to train.
    X_train, Y_train
        Training features and labels.
    X_final
        Features to predict with every fitted forest.
    random_state : int or None, optional
        When given, forest ``i`` is seeded with ``random_state + i`` so the
        whole run is reproducible.  ``None`` (default) leaves the forests
        unseeded, as before.

    Returns
    -------
    (pandas.DataFrame, list of float)
        A frame with one column per iteration (named 0 .. iterations-1)
        holding the predictions for ``X_final``, and the hold-out accuracy of
        each forest in the same order.
    """
    # The split is seeded, hence identical on every iteration: compute it once.
    xtrain, xtest, ytrain, ytest = train_test_split(X_train, Y_train, test_size=0.33, random_state=123)

    final_predictions = {}
    accuracies = []

    for i in range(iterations):
        seed = None if random_state is None else random_state + i
        rf_model = RandomForestClassifier(n_estimators = 100, random_state = seed)

        rf_model.fit(xtrain,ytrain)
        predictions = rf_model.predict(xtest)
        # Fraction of hold-out rows predicted correctly.
        accuracies.append(len(ytest[ytest==predictions])/len(ytest))
        final_predictions[i] = rf_model.predict(X_final)

    # Build the frame in one go instead of inserting columns one at a time.
    pred_df = pd.DataFrame(final_predictions)
    return pred_df, accuracies

# rf_df, rf_accuracies = iterate_rf(100,prep_rf, rf_targets,test_prep_rf)

In [21]:
# Train 200 forests on prep_rf and predict the test set with each of them.
# NOTE(review): the execution counts show the transform cell (In [19]) ran
# before this one -- confirm prep_rf still held untransformed values here.
rf_df, rf_accuracies = iterate_rf(200,prep_rf, rf_targets,test_prep_rf)


In [19]:
# Log-transformed versions of the training and test features.
t_prep_rf = transform_data(prep_rf)
t_test_prep_rf = transform_data(test_prep_rf)

In [22]:
# Same 200-forest experiment on the log-transformed features.
t_rf_df, t_rf_accuracies = iterate_rf(200,t_prep_rf, rf_targets, t_test_prep_rf)

In [23]:
# Hold-out accuracies of every forest: first run, then the transformed run.
print(rf_accuracies)
print(t_rf_accuracies)

[0.8006325885113764, 0.7992041628405264, 0.7992041628405264, 0.800734618916437, 0.800581573308846, 0.7985409652076318, 0.7999183756759515, 0.7982348739924497, 0.8001224364860728, 0.7990001020304051, 0.7996122844607693, 0.8001224364860728, 0.8006325885113764, 0.8010407101316193, 0.7983369043975105, 0.8004285277012549, 0.8012447709417406, 0.7996122844607693, 0.8009386797265585, 0.8001224364860728, 0.7985409652076318, 0.7984389348025712, 0.799561269258239, 0.8010407101316193, 0.7993061932455872, 0.7992041628405264, 0.7993061932455872, 0.8003775124987246, 0.7976226915620855, 0.800734618916437, 0.7994082236506479, 0.800275482093664, 0.799561269258239, 0.7995102540557086, 0.7992551780430568, 0.8008876645240282, 0.8003264972961943, 0.7995102540557086, 0.7990511172329354, 0.7996632996632996, 0.8001734516886032, 0.7985409652076318, 0.7993061932455872, 0.8001734516886032, 0.8000204060810121, 0.7990511172329354, 0.7975206611570248, 0.79971431486583, 0.8011427405366799, 0.8006325885113764, 0.79951

In [24]:
# Submission 2: predictions of the first forest (column 0) from rf_df.
submission2 = pd.DataFrame()
submission2['id']=test_data['id']
submission2['status_group'] = rf_df.iloc[:,0]
submission2.to_csv('~/PYTHON/DrivenData/water_pumps/submission_2.csv',index=False)

In [25]:
# Submission 3: predictions of the first forest (column 0) from t_rf_df.
submission3 = pd.DataFrame()
submission3['id']=test_data['id']
submission3['status_group'] = t_rf_df.iloc[:,0]
submission3.to_csv('~/PYTHON/DrivenData/water_pumps/submission_3.csv',index=False)

In [26]:
# Submission 4: predictions of the forest with the highest hold-out accuracy
# in the rf_df run.
best_idx = rf_accuracies.index(max(rf_accuracies))
submission4 = pd.DataFrame()
submission4['id']=test_data['id']
# best_idx comes from rf_accuracies, so it must index rf_df (it previously
# indexed t_rf_df, whose columns belong to a different set of forests).
submission4['status_group'] = rf_df.iloc[:,best_idx]
submission4.to_csv('~/PYTHON/DrivenData/water_pumps/submission_4.csv',index=False)

In [35]:
# Submission 5: predictions of the forest with the highest hold-out accuracy
# in the t_rf_df (transformed-features) run.
t_best_idx = t_rf_accuracies.index(max(t_rf_accuracies))
submission5 = pd.DataFrame()
submission5['id']=test_data['id']
submission5['status_group'] = t_rf_df.iloc[:,t_best_idx]
submission5.to_csv('~/PYTHON/DrivenData/water_pumps/submission_5.csv',index=False)

In [27]:
# Best hold-out accuracy of each run.
print(max(rf_accuracies))
print(max(t_rf_accuracies))

0.8018569533721049
0.8021120293847567


In [127]:
# One row per test record, one column per trained forest.
print(rf_df.shape)
rf_df.head()

(14850, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non 
functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional
1,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functio
nal,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional
2,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functio
nal,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional
3,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non 
functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional,non functional
4,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functio
nal,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional,functional


In [132]:
# Submission frame built from the first forest's predictions (column 0 of rf_df).
submission = pd.DataFrame()
submission['id']=test_data['id']
submission['status_group'] = rf_df.iloc[:,0]

In [133]:
# Preview the submission frame (the index is left as the default; the
# set_index variant below is disabled).
# submission.set_index(submission['id'],inplace=True)
# submission
submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [135]:
# Write the submission without the index column.
submission.to_csv('~/PYTHON/DrivenData/water_pumps/submission_1.csv',index=False)