In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
train_val = pd.read_csv('../input/trainSet.csv', index_col=False)

In [3]:
train_val['target'].head()

0    0
1    0
2    0
3    1
4    0
Name: target, dtype: int64

In [4]:
train_val.head()

Unnamed: 0,id,gps_height,longitude,latitude,region_code,district_code,construction_year,amt,amt_q3,pop1,...,source_unknown,source_class_surface,source_class_unknown,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,target
0,69572,1390,34.938093,-9.856322,11,5,1999,8.699681,1,4.70048,...,0,0,0,1,0,0,0,0,0,0
1,8776,1399,34.698766,-2.147466,20,2,2010,0.0,0,5.638355,...,0,1,0,1,0,0,0,0,0,0
2,34310,686,37.460664,-3.821329,21,4,2009,3.258097,1,5.525453,...,0,1,0,0,1,0,0,0,0,0
3,67743,263,38.486161,-11.155298,90,63,1986,0.0,0,4.077537,...,0,0,0,0,1,0,0,0,0,1
4,19728,0,31.130847,-1.825359,18,1,2006,0.0,0,0.0,...,0,1,0,1,0,0,0,0,0,0


In [5]:
train_val.describe()

Unnamed: 0,id,gps_height,longitude,latitude,region_code,district_code,construction_year,amt,amt_q3,pop1,...,source_unknown,source_class_surface,source_class_unknown,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,target
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,...,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,668.297239,34.077427,-5.706033,15.297003,5.629747,2000.176313,1.585235,0.274125,2.819701,...,0.001111,0.224377,0.00468,0.480168,0.102744,0.000118,0.294411,0.013199,0.107407,0.529596
std,21453.128371,693.11635,6.567432,2.946019,17.587406,9.633649,11.07285,2.636662,0.446076,2.662184,...,0.033315,0.417175,0.068252,0.499611,0.303627,0.010855,0.455781,0.114126,0.309633,0.62808
min,0.0,-90.0,0.0,-11.64944,1.0,0.0,1960.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18519.75,0.0,33.090347,-8.540621,5.0,2.0,1996.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37061.5,369.0,34.908743,-5.021597,12.0,3.0,2006.0,0.0,0.0,3.258097,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,55656.5,1319.25,37.178387,-3.326156,17.0,5.0,2007.0,3.044522,1.0,5.375278,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
max,74247.0,2770.0,40.345193,-2e-08,99.0,80.0,2013.0,12.765691,1.0,10.325515,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [6]:
train_val.drop('id', axis='columns', inplace=True)

In [7]:
lista = train_val.isnull().sum()

In [8]:
X = train_val.loc[:, train_val.columns != 'target']
y = train_val.loc[:, train_val.columns == 'target']

In [9]:
classifiers = [
    #KNeighborsClassifier(),
    #DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
]

In [10]:
def model_and_test(X, y, classifiers):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 35)
    
    for model in classifiers:
        this_model = model.__class__.__name__
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(f'{this_model} accuracy:')
        score = accuracy_score(y_test, y_pred)
        print(f'{score:.4f}')
        print('\n')

In [11]:
model_and_test(X, y, classifiers)

RandomForestClassifier accuracy:
0.7988


AdaBoostClassifier accuracy:
0.7246


GradientBoostingClassifier accuracy:
0.7561




In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 35)
    
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'{model} accuracy:')
score = accuracy_score(y_test, y_pred)
print(f'{score:.4f}')
print('\n')

RandomForestClassifier() accuracy:
0.7976




In [13]:
important_features = pd.DataFrame(model.feature_importances_, index = X_train.columns,
columns = ['importance']).sort_values('importance', ascending=False)

In [14]:
important_features.iloc[1:30, :]

Unnamed: 0,importance
longitude,0.129164
gps_height,0.060779
age,0.038792
construction_year,0.03861
pop1,0.038538
quantity_enough,0.037171
amt_pop,0.027148
waterpoint_type_other,0.024695
month,0.023078
district_code,0.02241
