In [3]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.metrics import roc_curve
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold, cross_val_score, train_test_split
#Settings
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 1000)
sns.set_style('whitegrid')
sns.color_palette('pastel')
%matplotlib inline

In [4]:
train_data = pd.read_csv('~/PYTHON/DrivenData/water_pumps/train_data.csv')
test_data = pd.read_csv('~/PYTHON/DrivenData/water_pumps/test_data.csv')
train_target = pd.read_csv('~/PYTHON/DrivenData/water_pumps/train_target.csv')

In [5]:
for i in train_data.columns:
    print(i)
    print(train_data[i].unique().shape[0])

id
59400
amount_tsh
98
date_recorded
356
funder
1898
gps_height
2428
installer
2146
longitude
57516
latitude
57517
wpt_name
37400
num_private
65
basin
9
subvillage
19288
region
21
region_code
27
district_code
20
lga
125
ward
2092
population
1049
public_meeting
3
recorded_by
1
scheme_management
13
scheme_name
2697
permit
3
construction_year
55
extraction_type
18
extraction_type_group
13
extraction_type_class
7
management
12
management_group
5
payment
7
payment_type
7
water_quality
8
quality_group
6
quantity
5
quantity_group
5
source
10
source_type
7
source_class
3
waterpoint_type
7
waterpoint_type_group
6


In [5]:
print(test_data.shape)
print(train_data.shape)
print(train_target.shape)

(14850, 40)
(59400, 40)
(59400, 2)


In [111]:
#Functions

def binary(x):
    if (x == True) or (x == 'True'):
        return 1
    elif (x == False) or (x == 'False'):
        return 0


def funder_cat(x):
    
    if x <= 1:
        return 's1'
    elif (x>1) & (x<=5):
        return 's2'
    elif (x>5) & (x<=10):
        return 's3'
    elif (x>10) & (x<=20):
        return 's4'
    elif (x>20) & (x<=50):
        return 's5'
    elif (x>50) & (x<=100):
        return 's6'
    elif (x>100) & (x<=150):
        return 's7'
    elif (x>150) & (x<=200):
        return 's8'
    elif (x>200) & (x<=300):
        return 's9'
    elif (x>300):
        return 's10'

def installer_cat(x):
    
    if x <= 1:
        return 'i1'
    elif (x>1) & (x<=5):
        return 'i2'
    elif (x>5) & (x<=10):
        return 'i3'
    elif (x>10) & (x<=20):
        return 'i4'
    elif (x>20) & (x<=50):
        return 'i5'
    elif (x>50) & (x<=100):
        return 'i6'
    elif (x>100) & (x<=150):
        return 'i7'
    elif (x>150) & (x<=200):
        return 'i8'
    elif (x>200) & (x<=300):
        return 'i9'
    elif (x>200):
        return 'i10'


def funder_clean(data):

    funder_piv = data.pivot_table(index='funder', values='id', aggfunc=len)
    funder_df = pd.DataFrame()
    funder_df['type'] = funder_piv.index
    funder_df['count'] = funder_piv.values
    funder_df['funder_size'] = funder_df['count'].apply(lambda x: funder_cat(x))
    funder_map = dict(funder_df[['type', 'funder_size']].values)
    data['funder_size'] = data.funder.map(funder_map)
    data.funder_size.fillna('None', inplace=True)
    data = data.drop(['funder'], axis=1)
    return data


def installer_clean(data):

    installer_piv = data.pivot_table(index='installer', values=['id'], aggfunc=len)
    installer_df = pd.DataFrame()
    installer_df['type'] = installer_piv.index
    installer_df['count'] = installer_piv.values
    installer_df['installer_size'] = installer_df['count'].apply(lambda x: installer_cat(x))
    installer_map = dict(installer_df[['type', 'installer_size']].values)
    data['installer_size'] = data.installer.map(installer_map)
    data.installer_size.fillna('None', inplace=True)
    data = data.drop(['installer'], axis=1)
    return data


def date_recorded_year(x):
    return datetime.datetime.strptime(x, '%Y-%m-%d').year

def lga_freq(x):
    return len(train_data[(train_data['lga']==x)])/len(train_data)
def lga_rr_func(x):
    return len(concat_train_data[(concat_train_data.lga==x)&(concat_train_data['dummy_functional']==1)])/len(concat_train_data[concat_train_data.lga==x])
def lga_rr_rep(x):
    return len(concat_train_data[(concat_train_data.lga==x)&(concat_train_data['dummy_functional needs repair']==1)])/len(concat_train_data[concat_train_data.lga==x])
def lga_rr_nofunc(x):
    return len(concat_train_data[(concat_train_data.lga==x)&(concat_train_data['dummy_non functional']==1)])/len(concat_train_data[concat_train_data.lga==x])

def ward_rr_func(x):
    return len(concat_train_data[(concat_train_data.ward==x)&(concat_train_data['dummy_functional']==1)])/len(concat_train_data[concat_train_data.ward==x])
def ward_rr_rep(x):
    return len(concat_train_data[(concat_train_data.ward==x)&(concat_train_data['dummy_functional needs repair']==1)])/len(concat_train_data[concat_train_data.ward==x])
def ward_rr_nofunc(x):
    return len(concat_train_data[(concat_train_data.ward==x)&(concat_train_data['dummy_non functional']==1)])/len(concat_train_data[concat_train_data.ward==x])

def sn_rr_func(x):
    return len(concat_train_data[(concat_train_data.scheme_name==x)&(concat_train_data['dummy_functional']==1)])/len(concat_train_data[concat_train_data.scheme_name==x])
def sn_rr_rep(x):
    return len(concat_train_data[(concat_train_data.scheme_name==x)&(concat_train_data['dummy_functional needs repair']==1)])/len(concat_train_data[concat_train_data.scheme_name==x])
def sn_rr_nofunc(x):
    return len(concat_train_data[(concat_train_data.scheme_name==x)&(concat_train_data['dummy_non functional']==1)])/len(concat_train_data[concat_train_data.scheme_name==x])



def score_brackets(x):
    
    if (x>=0) & (x<=0.1):
        return 1
    elif (x>0.1) & (x<=0.2):
        return 2
    elif (x>0.2) & (x<=0.3):
        return 3
    elif (x>0.3) & (x<=0.4):
        return 4
    elif (x>0.4) & (x<=0.5):
        return 5
    elif (x>0.5) & (x<=0.6):
        return 6
    elif (x>0.6) & (x<=0.7):
        return 7
    elif (x>0.7) & (x<=0.8):
        return 8
    elif (x>0.8) & (x<=0.9):
        return 9
    elif (x>0.9):
        return 10
    else:
        return 0


In [112]:
dummy_targets = pd.get_dummies(train_target, prefix='dummy')
dummy_targets = dummy_targets[dummy_targets.columns[1:]]

In [113]:
concat_train_data = pd.concat([train_data,dummy_targets],axis=1)
concat_train_data.head(2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,dummy_functional,dummy_functional needs repair,dummy_non functional
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1.0,0.0,0.0
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1.0,0.0,0.0


In [114]:
#LGA tidied up
lg_piv = pd.DataFrame(concat_train_data.pivot_table(index='lga', values='id', aggfunc=len))
lg_piv.sort_values(by='id',ascending=False,inplace=True)
lg_piv

lg_piv['name']=lg_piv.index
lg_piv['freq']=lg_piv['name'].apply(lga_freq)
lg_piv['response_rate_functional'] = lg_piv['name'].apply(lga_rr_func)
lg_piv['response_rate_needs_repair'] = lg_piv['name'].apply(lga_rr_rep)
lg_piv['response_rate_non_functional'] = lg_piv['name'].apply(lga_rr_nofunc)
lg_piv.sort_values(by=['response_rate_functional'],ascending=False,inplace=True)
lg_piv['count'] = lg_piv['id']; lg_piv.drop(['id'],axis=1,inplace=True)
lg_piv['functional_label'] = lg_piv['response_rate_functional'].apply(score_brackets)
lg_piv['needs_repair_label'] = lg_piv['response_rate_needs_repair'].apply(score_brackets)
lg_piv['non_functional_label'] = lg_piv['response_rate_non_functional'].apply(score_brackets)
lg_map = lg_piv.iloc[:,[0,6,7,8]]
lg_map.head()
func_map = dict(lg_map[['name','functional_label']].values)
repair_map = dict(lg_map[['name','needs_repair_label']].values)
non_func_map = dict(lg_map[['name','non_functional_label']].values)

In [115]:
#ward tidied up
ward_piv = pd.DataFrame(concat_train_data.pivot_table(index='ward', values='id', aggfunc=len))
ward_piv.sort_values(by='id',ascending=False,inplace=True)
ward_piv['name']=ward_piv.index
ward_piv['response_rate_functional'] = ward_piv['name'].apply(ward_rr_func)
ward_piv['response_rate_needs_repair'] = ward_piv['name'].apply(ward_rr_rep)
ward_piv['response_rate_non_functional'] = ward_piv['name'].apply(ward_rr_nofunc)
ward_piv['functional_label'] = ward_piv['response_rate_functional'].apply(score_brackets)
ward_piv['needs_repair_label'] = ward_piv['response_rate_needs_repair'].apply(score_brackets)
ward_piv['non_functional_label'] = ward_piv['response_rate_non_functional'].apply(score_brackets)
ward_map = ward_piv.iloc[:,[1,5,6,7]]
ward_func_map = dict(ward_map[['name','functional_label']].values)
ward_repair_map = dict(ward_map[['name','needs_repair_label']].values)
ward_non_func_map = dict(ward_map[['name','non_functional_label']].values)
ward_map.head()

Unnamed: 0_level_0,name,functional_label,needs_repair_label,non_functional_label
ward,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Igosi,Igosi,10,1,1
Imalinyi,Imalinyi,10,1,1
Siha Kati,Siha Kati,10,1,1
Mdandu,Mdandu,9,1,1
Nduruma,Nduruma,7,1,3


In [85]:
concat_train_data.scheme_name.describe()

count     31234
unique     2696
top           K
freq        682
Name: scheme_name, dtype: object

In [116]:
#sn tidied up
sn_piv = pd.DataFrame(concat_train_data.pivot_table(index='scheme_name', values='id', aggfunc=len))
sn_piv.sort_values(by='id',ascending=False,inplace=True)
sn_piv

sn_piv['name']=sn_piv.index

sn_piv['response_rate_functional'] = sn_piv['name'].apply(sn_rr_func)
sn_piv['response_rate_needs_repair'] = sn_piv['name'].apply(sn_rr_rep)
sn_piv['response_rate_non_functional'] = sn_piv['name'].apply(sn_rr_nofunc)
# lg_piv.sort_values(by=['response_rate_functional'],ascending=False,inplace=True)
# lg_piv['count'] = lg_piv['id']; lg_piv.drop(['id'],axis=1,inplace=True)
sn_piv['functional_label'] = sn_piv['response_rate_functional'].apply(score_brackets)
sn_piv['needs_repair_label'] = sn_piv['response_rate_needs_repair'].apply(score_brackets)
sn_piv['non_functional_label'] = sn_piv['response_rate_non_functional'].apply(score_brackets)
sn_map = sn_piv.iloc[:,[1,5,6,7]]
sn_map.head()
sn_func_map = dict(sn_map[['name','functional_label']].values)
sn_repair_map = dict(sn_map[['name','needs_repair_label']].values)
sn_non_func_map = dict(sn_map[['name','non_functional_label']].values)

sn_map.head()

Unnamed: 0_level_0,name,functional_label,needs_repair_label,non_functional_label
scheme_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
K,K,6,2,3
,,7,1,4
Borehole,Borehole,4,1,6
Chalinze wate,Chalinze wate,9,1,2
M,M,5,2,4


##  scheme_name, scheme_management, extraction type, management, water quality, source
## drop wpt name

In [98]:
sn_map.name.describe()

count       2696
unique      2696
top       Kibena
freq           1
Name: name, dtype: object

In [117]:
# Major Clean Function
def new_prepare_data(data_input):
    
    data = data_input
    ## Handling Null Values
    
    # public_meeting NaN filled to False and cleaned
    data.public_meeting.fillna(False, inplace=True)
    data['public_meeting_binary'] = data.public_meeting.apply(binary)
    data.drop(['public_meeting'],axis=1,inplace=True)
    
    # scheme_management NaN filled to 'None' string
    data.scheme_management.fillna('None', inplace=True)
    
    # funder cleaning
    data = funder_clean(data)
    
    # installer cleaning
    data = installer_clean(data)
    
    # permit cleaning
    data.permit[(data.permit!=False)&(data.permit!=True)] = False
    data['permit_binary'] = data.permit.apply(binary)
    data.drop(['permit'],axis=1,inplace=True)
    
    # subvillage drop
    data.drop(['subvillage'],axis=1, inplace=True)
    
    ##Handling other data
    # date_recorded_year extract
    data['date_recorded_year'] = data.date_recorded.apply(lambda x: date_recorded_year(x))
    data.drop(['date_recorded'],axis=1,inplace=True)
    
    # wpt_name drop
    data.drop(['wpt_name'], axis=1, inplace=True)
    # region drop
    #data.drop(['region'], axis=1, inplace=True)
    # lga cat
    # data.drop(['lga'], axis=1, inplace=True)
    data['lga_func_label'] = data['lga'].map(func_map)
    data['lga_needs_repair'] = data['lga'].map(repair_map)
    data['lga_non_func_label'] = data['lga'].map(non_func_map)
    data.lga_func_label.fillna(0, inplace=True)
    data.lga_needs_repair.fillna(0, inplace=True)
    data.lga_non_func_label.fillna(0, inplace=True)
    data.drop(['lga'], axis=1, inplace=True)
    
    # ward cat
    data['ward_func_label'] = data['ward'].map(ward_func_map)
    data['ward_needs_repair'] = data['ward'].map(ward_repair_map)
    data['ward_non_func_label'] = data['ward'].map(ward_non_func_map)
    data.ward_func_label.fillna(0, inplace=True)
    data.ward_needs_repair.fillna(0, inplace=True)
    data.ward_non_func_label.fillna(0, inplace=True)
    data.drop(['ward'], axis=1, inplace=True)
    
    # sn cat
    data['sn_func_label'] = data['scheme_name'].map(sn_func_map)
    data['sn_needs_repair'] = data['scheme_name'].map(sn_repair_map)
    data['sn_non_func_label'] = data['scheme_name'].map(sn_non_func_map)
    data.sn_func_label.fillna(0, inplace=True)
    data.sn_needs_repair.fillna(0, inplace=True)
    data.sn_non_func_label.fillna(0, inplace=True)
    data.drop(['scheme_name'],axis=1, inplace=True)
    
    # recorded_by drop
    data.drop(['recorded_by'], axis=1, inplace=True)
    # extraction_type_drop
    data.drop(['extraction_type','extraction_type_group'], axis=1, inplace=True)
    # management drop
    data.drop(['management'], axis=1, inplace=True)
    # payment drop
    data.drop(['payment'], axis=1, inplace=True)
    # water_quality drop
    data.drop(['water_quality'], axis=1, inplace=True)
    # quantity drop
    data.drop(['quantity'], axis=1, inplace=True)
    # source drop
    data.drop(['source','source_class'],axis=1,inplace=True)
    # waterpoint_type drop
    data.drop(['waterpoint_type'], axis=1, inplace=True)
    # num_private drop
    #data.drop(['num_private'], axis=1,inplace=True)
    # drop coordinates
    #data.drop(['longitude','latitude'], axis=1,inplace=True)
    
    # Operational Years Feature Engineering
    op_years = list(data.date_recorded_year-data.construction_year)
    operational_years = []
    for i in op_years:
        if (i > 500) or (i < 0):
            operational_years.append(0)
        else:
            operational_years.append(i)
    data['operational_years'] = operational_years
#     data.drop(['date_recorded_year','construction_year'], axis=1, inplace=True)
    
    iter_ = ['region','basin','scheme_management','extraction_type_class','management_group',
        'payment_type','quality_group','quantity_group','source_type','waterpoint_type_group',
        'funder_size','installer_size']

    for idx in iter_:
        col_ = pd.Categorical.from_array(data[idx])
        data[idx]= col_.codes
    return data

In [118]:
def transform_data(data):
    
    # Transformation of variables if required or for testing
    data['amount_tsh'] = data.amount_tsh.apply(lambda x: np.log(x+1))
    data['operational_years'] = data.operational_years.apply(lambda x: np.log(x+1))
    data['population'] = data.population.apply(lambda x: np.log(x+1))
    
    return data

In [119]:
new_data = pd.read_csv('~/PYTHON/DrivenData/water_pumps/train_data.csv')
new_targets = pd.read_csv('~/PYTHON/DrivenData/water_pumps/train_target.csv')
new_final = pd.read_csv('~/PYTHON/DrivenData/water_pumps/test_data.csv')

In [120]:
prep_new_data = new_prepare_data(new_data)
new_targets_status = train_target.status_group
prep_new_data.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,basin,region,region_code,district_code,population,scheme_management,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,waterpoint_type_group,public_meeting_binary,funder_size,installer_size,permit_binary,date_recorded_year,lga_func_label,lga_needs_repair,lga_non_func_label,ward_func_label,ward_needs_repair,ward_non_func_label,sn_func_label,sn_needs_repair,sn_non_func_label,operational_years
0,69572,6000.0,1390,34.938093,-9.856322,0,1,3,11,5,109,7,1999,0,4,0,2,1,6,1,1,10,7,0,2011,8,1,3,7,1,4,9.0,1.0,1.0,12
1,8776,0.0,1399,34.698766,-2.147466,0,4,9,20,2,280,2,2010,0,4,2,2,2,3,1,0,7,6,1,2013,6,1,4,7,1,4,0.0,0.0,0.0,3
2,34310,25.0,686,37.460664,-3.821329,0,5,8,21,4,250,7,2009,0,4,5,2,1,1,1,1,4,2,1,2013,6,1,5,9,1,1,10.0,1.0,1.0,4
3,67743,0.0,263,38.486161,-11.155298,0,7,12,90,63,58,7,1986,5,4,2,2,0,0,1,1,2,10,1,2013,3,1,8,2,1,9,0.0,0.0,0.0,27
4,19728,0.0,0,31.130847,-1.825359,0,4,4,18,1,0,1,0,0,1,2,2,3,3,1,1,1,8,1,2011,6,1,5,9,1,2,0.0,0.0,0.0,0


##  clean region_code, district_code, lga, ward,scheme_name, scheme_management, extraction type, management, water quality, source
## drop wpt name

In [121]:
prep_new_data.drop(['id'],axis=1,inplace=True)
prep_new_data.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region,region_code,district_code,population,scheme_management,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,waterpoint_type_group,public_meeting_binary,funder_size,installer_size,permit_binary,date_recorded_year,lga_func_label,lga_needs_repair,lga_non_func_label,ward_func_label,ward_needs_repair,ward_non_func_label,sn_func_label,sn_needs_repair,sn_non_func_label,operational_years
0,6000.0,1390,34.938093,-9.856322,0,1,3,11,5,109,7,1999,0,4,0,2,1,6,1,1,10,7,0,2011,8,1,3,7,1,4,9.0,1.0,1.0,12
1,0.0,1399,34.698766,-2.147466,0,4,9,20,2,280,2,2010,0,4,2,2,2,3,1,0,7,6,1,2013,6,1,4,7,1,4,0.0,0.0,0.0,3
2,25.0,686,37.460664,-3.821329,0,5,8,21,4,250,7,2009,0,4,5,2,1,1,1,1,4,2,1,2013,6,1,5,9,1,1,10.0,1.0,1.0,4
3,0.0,263,38.486161,-11.155298,0,7,12,90,63,58,7,1986,5,4,2,2,0,0,1,1,2,10,1,2013,3,1,8,2,1,9,0.0,0.0,0.0,27
4,0.0,0,31.130847,-1.825359,0,4,4,18,1,0,1,0,0,1,2,2,3,3,1,1,1,8,1,2011,6,1,5,9,1,2,0.0,0.0,0.0,0


In [None]:
prep_new_data = transform_data(prep_new_data)

In [123]:
new_rf_model = RandomForestClassifier(random_state = 123, n_estimators = 100)
new_rf_model.fit(prep_new_data, new_targets_status)
predictions = new_rf_model.predict(prep_new_data)
print(len(new_targets_status[new_targets_status==predictions])/len(new_targets_status))

0.9975252525252525


In [124]:
kf = KFold(prep_new_data.shape[0], n_folds=10,shuffle=True,random_state=123)
cvs = cross_val_score(new_rf_model,prep_new_data, new_targets_status,cv=kf)

In [125]:
print(np.mean(cvs))
cvs

0.808131313131


array([ 0.81262626,  0.81178451,  0.803367  ,  0.80875421,  0.80521886,
        0.80791246,  0.81818182,  0.7993266 ,  0.80319865,  0.81094276])

In [132]:
new_prep_final = new_prepare_data(new_final)
new_prep_final = transform_data(new_prep_final)
new_prep_final.head()
new_prep_final.drop(['id'],inplace=True)

AttributeError: 'DataFrame' object has no attribute 'public_meeting'

In [136]:
new_prep_final.drop(['id'],inplace=True,axis=1)
new_prep_final.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region,region_code,district_code,population,scheme_management,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,waterpoint_type_group,public_meeting_binary,funder_size,installer_size,permit_binary,date_recorded_year,lga_func_label,lga_needs_repair,lga_non_func_label,ward_func_label,ward_needs_repair,ward_non_func_label,sn_func_label,sn_needs_repair,sn_non_func_label,operational_years
0,0.0,1996,35.290799,-4.059696,0,0,8,21,3,5.774552,3,2012,3,2,2,2,3,3,5,1,7,6,1,2013,7,1,4,6.0,2.0,3.0,0.0,0.0,0.0,0.693147
1,0.0,1569,36.656709,-3.309214,0,5,0,2,2,5.70711,7,2000,0,4,2,2,2,6,1,1,2,2,1,2013,7,1,3,6.0,1.0,4.0,5.0,2.0,4.0,2.639057
2,0.0,1567,34.767863,-5.004344,0,0,18,13,2,6.216606,7,2010,3,4,2,2,2,3,5,1,0,0,0,2013,5,1,5,3.0,1.0,7.0,2.0,1.0,9.0,1.386294
3,0.0,267,38.058046,-9.418672,0,7,7,80,43,5.525453,7,1987,3,4,6,2,0,5,5,0,4,4,1,2013,3,1,8,2.0,1.0,8.0,0.0,0.0,0.0,3.295837
4,6.216606,1260,35.006123,-10.950412,0,7,16,10,3,4.110874,10,2000,0,4,1,2,1,6,1,0,4,4,1,2013,6,1,4,8.0,1.0,3.0,8.0,1.0,2.0,2.639057


In [33]:
col_

[i5, i9, None, i3, i3, ..., i6, i2, None, i9, i9]
Length: 14850
Categories (10, object): [None, i1, i2, i3, ..., i6, i7, i8, i9]

In [153]:
def iterate_new(iterations,X_train,Y_train,X_final):
    kf = KFold(X_train.shape[0], n_folds=10,shuffle=True,random_state=123)
    
    pred_df = pd.DataFrame()
    accuracies = []
    
    for i in range(iterations):
        rf_model = RandomForestClassifier(n_estimators = 100)
        xtrain, xtest, ytrain, ytest = train_test_split(X_train, Y_train, test_size=0.075, random_state=123)
        
        rf_model.fit(xtrain,ytrain)
        predictions = rf_model.predict(xtest)
        acc = len(ytest[ytest==predictions])/len(ytest)
        accuracies.append(acc)
        final_predictions = rf_model.predict(X_final)
        pred_df[i] = final_predictions
    return pred_df, accuracies

# rf_df, rf_accuracies = iterate_rf(100,prep_rf, rf_targets,test_prep_rf)

In [137]:
new_df, new_accuracies = iterate_new(200,prep_new_data, new_targets_status,new_prep_final)

In [139]:
#test size=0.33
best_idx = new_accuracies.index(max(new_accuracies))
submission_best = pd.DataFrame()
submission_best['id']=test_data['id']
submission_best['status_group'] = new_df.iloc[:,best_idx]
submission_best.to_csv('~/PYTHON/DrivenData/water_pumps/submission_best.csv',index=False)

In [143]:
new_df2, new_accuracies2 = iterate_new(200,prep_new_data, new_targets_status,new_prep_final)

In [150]:
#test size=0.15
best_idx2 = new_accuracies2.index(max(new_accuracies2))
submission_best2 = pd.DataFrame()
submission_best2['id']=test_data['id']
submission_best2['status_group'] = new_df2.iloc[:,best_idx2]
submission_best2.to_csv('~/PYTHON/DrivenData/water_pumps/submission_best2.csv',index=False)

In [151]:
print(np.mean(new_accuracies))
print(max(new_accuracies))

0.80338281808
0.8055300479542904


In [152]:
print(np.mean(new_accuracies2))
print(max(new_accuracies2))

0.815989337823
0.8191919191919191


In [154]:
new_df3, new_accuracies3 = iterate_new(200,prep_new_data, new_targets_status,new_prep_final)

In [155]:
#test size=0.075
best_idx3 = new_accuracies3.index(max(new_accuracies3))
submission_best3 = pd.DataFrame()
submission_best3['id']=test_data['id']
submission_best3['status_group'] = new_df3.iloc[:,best_idx3]
submission_best3.to_csv('~/PYTHON/DrivenData/water_pumps/submission_best3.csv',index=False)

In [156]:
print(np.mean(new_accuracies3))
print(max(new_accuracies3))

0.814250280584
0.8184062850729518
