In [None]:
import pandas as pd
import numpy as np 

# Cleaning functions

In [35]:
#Fix dates
def fix_dates(x):

    #Import
    import datetime
    now = datetime.datetime.today()

    #Turn string to datetime
    x.date_recorded = pd.to_datetime(x['date_recorded'],format = '%Y-%m-%d')

    #Turn date into how long ago it happened
    x['age'] = x['date_recorded'] - now

    #sklearn doesn't like time. Turn it into an int
    x['age'] = x['age'].dt.days
    
    return x

#remove the columns that we don't want
def drop_stuff(x):
    x = x.drop(to_drop, axis=1)

    return x

#label NaNs
def label_nans(x):
    x.funder.fillna('unknown', inplace=True)
    x.permit.fillna('unknown', inplace=True)
    x.installer.fillna('unknown', inplace=True)
    x.subvillage.fillna('unknown', inplace=True)
    x.scheme_name.fillna('unknown', inplace=True)
    x.public_meeting.fillna('unknown', inplace=True)
    x.scheme_management.fillna('unknown', inplace=True)

    return x

#Clean Data
def clean_data(x):
    x = label_nans(x)
    x = fix_dates(x)
    x = drop_stuff(x)
    
    return x

# Utility Lists

In [195]:
#list of categorical variables
categoricals = ['basin', 'region', 'district_code',
           'lga', 'public_meeting', 'scheme_management', 'permit',
           'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management',
           'management_group', 'payment_type', 'water_quality', 'quality_group', 'quantity',
           'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group']

#drop categories that are excessive, or drop redundant
to_drop = []

# Begin Modeling

In [192]:
#load data
pd.set_option('display.max_columns', 500)

X = pd.read_csv('train_features.csv')

y = pd.read_csv('train_labels.csv')

#status_group is the actualy y target
y = y.status_group

#Data Prep
#X_clean = clean_data(X)


In [193]:
X.corr()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
id,1.0,-0.005321,-0.004692,-0.001348,0.001718,-0.002629,-0.003028,-0.003044,-0.002813,-0.002082
amount_tsh,-0.005321,1.0,0.07665,0.022134,-0.05267,0.002944,-0.026813,-0.023599,0.016288,0.067915
gps_height,-0.004692,0.07665,1.0,0.149155,-0.035751,0.007237,-0.183521,-0.171233,0.135003,0.658727
longitude,-0.001348,0.022134,0.149155,1.0,-0.425802,0.023873,0.034197,0.151398,0.08659,0.396732
latitude,0.001718,-0.05267,-0.035751,-0.425802,1.0,0.006837,-0.221018,-0.20102,-0.022152,-0.245278
num_private,-0.002629,0.002944,0.007237,0.023873,0.006837,1.0,-0.020377,-0.004478,0.003818,0.026056
region_code,-0.003028,-0.026813,-0.183521,0.034197,-0.221018,-0.020377,1.0,0.678602,0.094088,0.031724
district_code,-0.003044,-0.023599,-0.171233,0.151398,-0.20102,-0.004478,0.678602,1.0,0.061831,0.048315
population,-0.002813,0.016288,0.135003,0.08659,-0.022152,0.003818,0.094088,0.061831,1.0,0.26091
construction_year,-0.002082,0.067915,0.658727,0.396732,-0.245278,0.026056,0.031724,0.048315,0.26091,1.0


In [38]:
#Import Pipeline and associated tools
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [194]:
for i in list(X):
    print(i, ' ', X[i].nunique())

id   59400
amount_tsh   98
date_recorded   356
funder   1897
gps_height   2428
installer   2145
longitude   57516
latitude   57517
wpt_name   37400
num_private   65
basin   9
subvillage   19287
region   21
region_code   27
district_code   20
lga   125
ward   2092
population   1049
public_meeting   2
recorded_by   1
scheme_management   12
scheme_name   2696
permit   2
construction_year   55
extraction_type   18
extraction_type_group   13
extraction_type_class   7
management   12
management_group   5
payment   7
payment_type   7
water_quality   8
quality_group   6
quantity   5
quantity_group   5
source   10
source_type   7
source_class   3
waterpoint_type   7
waterpoint_type_group   6


In [197]:
X_clean = clean_data(X)[['id','waterpoint_type_group','waterpoint_type','source_class','source_type',
                         'source','quantity','quality_group','water_quality','payment_type',
                        'management_group','management','extraction_type_class','extraction_type_group',
                        'extraction_type','construction_year','permit','scheme_management','public_meeting',
                        'population','lga','district_code','region_code','region','basin','num_private','date_recorded','amount_tsh']]

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

X_clean['date_recorded'] = encoder.fit_transform(X_clean['date_recorded'])


In [106]:
X_clean.head()

Unnamed: 0,id,waterpoint_type_group,waterpoint_type,source_class,source_type,source,quantity,quality_group,water_quality,payment_type,management_group,management,extraction_type_class,extraction_type_group,extraction_type,construction_year,permit,scheme_management,public_meeting,population,lga,region,basin,num_private,date_recorded,amount_tsh
0,69572,communal standpipe,communal standpipe,groundwater,spring,spring,enough,good,soft,annually,user-group,vwc,gravity,gravity,gravity,1999,False,VWC,True,109,Ludewa,Iringa,Lake Nyasa,0,47,6000.0
1,8776,communal standpipe,communal standpipe,surface,rainwater harvesting,rainwater harvesting,insufficient,good,soft,never pay,user-group,wug,gravity,gravity,gravity,2010,True,Other,unknown,280,Serengeti,Mara,Lake Victoria,0,309,0.0
2,34310,communal standpipe,communal standpipe multiple,surface,dam,dam,enough,good,soft,per bucket,user-group,vwc,gravity,gravity,gravity,2009,True,VWC,True,250,Simanjiro,Manyara,Pangani,0,300,25.0
3,67743,communal standpipe,communal standpipe multiple,groundwater,borehole,machine dbh,dry,good,soft,never pay,user-group,vwc,submersible,submersible,submersible,1986,True,VWC,True,58,Nanyumbu,Mtwara,Ruvuma / Southern Coast,0,272,0.0
4,19728,communal standpipe,communal standpipe,surface,rainwater harvesting,rainwater harvesting,seasonal,good,soft,never pay,other,other,gravity,gravity,gravity,0,True,unknown,True,0,Karagwe,Kagera,Lake Victoria,0,104,0.0


In [198]:
#Split Data
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_clean.iloc[:,1:],y)

In [199]:
#Create pipeline
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    RandomForestClassifier(verbose=True)    
)

#Create model and test on validation set
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_valid)

print(accuracy_score(y_valid, y_pred))

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.1s finished


0.7773063973063973


  Xt = transform.transform(Xt)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [213]:
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [214]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_202"; Java(TM) SE Runtime Environment (build 1.8.0_202-b08); Java HotSpot(TM) 64-Bit Server VM (build 25.202-b08, mixed mode)
  Starting server from /anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/ht/whbj74ys4mng7gx5l6_3s6c00000gn/T/tmprztvpoae
  JVM stdout: /var/folders/ht/whbj74ys4mng7gx5l6_3s6c00000gn/T/tmprztvpoae/h2o_peytonrunyan_started_from_python.out
  JVM stderr: /var/folders/ht/whbj74ys4mng7gx5l6_3s6c00000gn/T/tmprztvpoae/h2o_peytonrunyan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.3
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_peytonrunyan_duv1gb
H2O cluster total nodes:,1
H2O cluster free memory:,1.778 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [241]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y_encoded = encoder.fit_transform(y)
X_train, X_valid, y_train, y_valid = train_test_split(X_clean.iloc[:,1:],y_encoded)

In [242]:
y_encoded = pd.DataFrame(y_encoded, columns=['target'])

In [243]:
Data = pd.concat([X_clean,y_encoded], axis=1)

In [244]:
print(Data.columns)

Index(['id', 'waterpoint_type_group', 'waterpoint_type', 'source_class',
       'source_type', 'source', 'quantity', 'quality_group', 'water_quality',
       'payment_type', 'management_group', 'management',
       'extraction_type_class', 'extraction_type_group', 'extraction_type',
       'construction_year', 'permit', 'scheme_management', 'public_meeting',
       'population', 'lga', 'district_code', 'region_code', 'region', 'basin',
       'num_private', 'date_recorded', 'amount_tsh', 'target'],
      dtype='object')


In [245]:
Data.head()

Unnamed: 0,id,waterpoint_type_group,waterpoint_type,source_class,source_type,source,quantity,quality_group,water_quality,payment_type,management_group,management,extraction_type_class,extraction_type_group,extraction_type,construction_year,permit,scheme_management,public_meeting,population,lga,district_code,region_code,region,basin,num_private,date_recorded,amount_tsh,target
0,69572,communal standpipe,communal standpipe,groundwater,spring,spring,enough,good,soft,annually,user-group,vwc,gravity,gravity,gravity,1999,False,VWC,True,109,Ludewa,5,11,Iringa,Lake Nyasa,0,47,6000.0,0
1,8776,communal standpipe,communal standpipe,surface,rainwater harvesting,rainwater harvesting,insufficient,good,soft,never pay,user-group,wug,gravity,gravity,gravity,2010,True,Other,unknown,280,Serengeti,2,20,Mara,Lake Victoria,0,309,0.0,0
2,34310,communal standpipe,communal standpipe multiple,surface,dam,dam,enough,good,soft,per bucket,user-group,vwc,gravity,gravity,gravity,2009,True,VWC,True,250,Simanjiro,4,21,Manyara,Pangani,0,300,25.0,0
3,67743,communal standpipe,communal standpipe multiple,groundwater,borehole,machine dbh,dry,good,soft,never pay,user-group,vwc,submersible,submersible,submersible,1986,True,VWC,True,58,Nanyumbu,63,90,Mtwara,Ruvuma / Southern Coast,0,272,0.0,2
4,19728,communal standpipe,communal standpipe,surface,rainwater harvesting,rainwater harvesting,seasonal,good,soft,never pay,other,other,gravity,gravity,gravity,0,True,unknown,True,0,Karagwe,1,18,Kagera,Lake Victoria,0,104,0.0,0


In [246]:
hf = h2o.H2OFrame(Data)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [253]:
column_names = Data.columns.to_list()
X_cols = column_names[:-1]
y_col = column_names[-1]

In [257]:
# Split data into train and testing
train, test = hf.split_frame(ratios=[0.8])

In [258]:
# Define model
model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)

# train model
model.train(x=X_cols, y=y_col, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [260]:
# Model performance
performance = model.model_performance(test_data=test)
print(performance)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 0.41552416443748214
RMSE: 0.6446116384595318
MAE: 0.4452197552870305
RMSLE: 0.37089217793717333
Mean Residual Deviance: 0.41552416443748214



In [111]:
#Try on Test data
X_test = pd.read_csv('test_features.csv')
X_test = clean_data(X_test)[['id','waterpoint_type_group','waterpoint_type','source_class','source_type',
                         'source','quantity','quality_group','water_quality','payment_type',
                        'management_group','management','extraction_type_class','extraction_type_group',
                        'extraction_type','construction_year','permit','scheme_management','public_meeting',
                        'population','lga','region','basin','num_private','date_recorded','amount_tsh']]



from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

X_test['date_recorded']= encoder.fit_transform(X_test['date_recorded'])


y_test_pred = pipeline.predict(X_test.iloc[:,1:])

  Xt = transform.transform(Xt)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [112]:
###### Prep for submission
y_submit = pd.DataFrame(y_test_pred)
y_submit['id'] = X_test['id']
y_submit.columns = ['status_group','id']
y_submit = y_submit[['id','status_group']]

### Export baseline for submission
---
---

In [114]:
#Export for submission
y_submit.to_csv(r'submission4.csv',index = False)

---
---

# Let's Test Some Automated Feature Engineering Now

In [116]:
#create playground dataframe
new_X = clean_data(X)[['id','waterpoint_type_group','waterpoint_type','source_class','source_type',
                         'source','quantity','quality_group','water_quality','payment_type',
                        'management_group','management','extraction_type_class','extraction_type_group',
                        'extraction_type','construction_year','permit','scheme_management','public_meeting',
                        'population','lga','region','basin','num_private','date_recorded','amount_tsh']]
new_X = new_X.copy()

In [117]:
#let's get started
import featuretools as ft

In [118]:
es = ft.EntitySet('Entity Set')

In [119]:
es.entity_from_dataframe(dataframe=new_X,
                        entity_id='entity_1',
                        index='id')

Entityset: Entity Set
  Entities:
    entity_1 [Rows: 59400, Columns: 26]
  Relationships:
    No relationships

In [120]:
from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import Numeric

# Create two new functions for our two new primitives

def Log(column):
    return np.log(column)

def Square_Root(column):
    return np.sqrt(column)

# Create the primitives
log_prim = make_trans_primitive(
    function=Log, input_types=[Numeric], return_type=Numeric)

square_root_prim = make_trans_primitive(
    function=Square_Root, input_types=[Numeric], return_type=Numeric)

In [127]:
trans_primitives=[
        'percentile', 'isin', 'cum_mean', 'subtract', 'divide'
]

trans_primitives.append(log_prim)
trans_primitives.append(square_root_prim)

In [128]:
agg_primitives=[
        'std', 'min', 'max', 'mean'
]

In [129]:
fm, features = ft.dfs(entityset=es,
                     target_entity='entity_1',
                     trans_primitives=trans_primitives,
                     agg_primitives=agg_primitives,
                     verbose=True) 

Built 562 features
Elapsed: 00:09 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


In [130]:
fm = fm.replace([np.inf, -np.inf], np.nan)
fm = fm.dropna(axis=1)

In [149]:
len(fm.select_dtypes(include='object', exclude='bool').columns)

20

In [151]:
len(fm.select_dtypes(include='number', exclude='bool').columns)

152

In [None]:
#construction dictionary of data types
f = ft.variable_types

var_types = {'amount_tsh':ft.variable_types.Numeric, 'num_private':f.Numeric, 'basin':f.Categorical, 'region':f.Categorical,
'district_code':f.Categorical, 'lga':f.Categorical, 'population':f.Numeric, 'public_meeting':f.Categorical, 
'scheme_management':f.Categorical, 'permit':f.Categorical, 'construction_year':f.Datetime, 'extraction_type':f.Categorical,
'extraction_type_group':f.Categorical, 'extraction_type_class':f.Categorical,'management':f.Categorical, 
'management_group':f.Categorical, 'payment_type':f.Categorical, 'water_quality':f.Categorical, 'quality_group':f.Categorical,
'quantity':f.Categorical, 'source':f.Categorical, 'source_type': f.Categorical, 'source_class':f.Categorical, 
'waterpoint_type':f.Categorical, 'waterpoint_type_group':f.Categorical, 'age':f.Numeric, 'lat_lng':f.LatLong, 'age_since_built':f.Datetime
}

### Dimension reduction (categorical and numeric)

In [150]:
import prince
mca = prince.MCA(
    n_components=15,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)
mca = mca.fit(fm.select_dtypes(include='object', exclude='bool').astype(str))

In [154]:
pca = prince.PCA(
    n_components=50,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)
pca = pca.fit(fm.select_dtypes(include='number', exclude='bool').astype(np.int64))

In [190]:
X_cats = mca.transform(fm.select_dtypes(include='object', exclude='bool').astype(str))
X_nums = pca.transform(fm.select_dtypes(include='number', exclude='bool').astype(np.int64))

X_nums = X_nums.reset_index(drop=True)
X_cats = X_cats.reset_index(drop=True)
X_data = pd.concat([X_nums,X_cats], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X_data, y)



In [191]:
#Create pipeline
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    RandomForestClassifier(verbose=True)    
)

#Create model and test on validation set
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_valid)

print(accuracy_score(y_valid, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.48505050505050507


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [300]:
from sklearn.ensemble import RandomForestClassifier

In [134]:
#Create pipeline
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    LogisticRegression()    
)

#Create model and test on validation set
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_valid)

print(accuracy_score(y_valid, y_pred))

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


0.5388552188552188


  Xt = transform.transform(Xt)
