# DSPT2 Predictive Modeling Challenge

Ross Glasmann

In [81]:
#imports
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
from sklearn.preprocessing import StandardScaler


In [14]:
#Reading in data
this_dir = os.getcwd()

train_features_relpath = '/data/train_features.csv'
train_labels_relpath = '/data/train_labels.csv'
test_features_relpath = '/data/test_features.csv'
sample_submission_relpath = '/data/sample_submission.csv'

train_features_path = this_dir + train_features_relpath
train_labels_path = this_dir + train_labels_relpath
test_features_path = this_dir + test_features_relpath
sample_submission_path = this_dir + sample_submission_relpath


train_features = pd.read_csv(train_features_path)
train_labels = pd.read_csv(train_labels_path)
test_features = pd.read_csv(test_features_path)
sample_submission = pd.read_csv(sample_submission_path)

In [15]:
#Verify the data is correct
assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)

In [17]:
train_features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [19]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [20]:
y_train = train_labels['status_group']
y_train.value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [31]:
#Baseline Accuracy Score

baseline_majority_class = y_train.mode()[0]
baseline_y_pred = [baseline_majority_class] * len(y_train)

accuracy_score(y_train, baseline_y_pred)

0.543080808080808

In [33]:
# Train, test split for training and validation before submission

X = train_features
y = train_labels['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, stratify=y)

In [50]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((44550, 40), (14850, 40), (44550,), (14850,))

In [36]:
#verify training and test data is distibuted evenly
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

functional                 0.543075
non functional             0.384242
functional needs repair    0.072682
Name: status_group, dtype: float64
functional                 0.543098
non functional             0.384242
functional needs repair    0.072660
Name: status_group, dtype: float64


In [58]:
# filter out numeric features to test an initial model

X_train_numeric = X_train[X_train.select_dtypes('number').columns.drop('id').tolist()]
X_test_numeric = X_test[X_test.select_dtypes('number').columns.drop('id').tolist()]

In [59]:
#convert y train labels into ints 

y_train_int = (y_train == 'functional').astype(int)
y_test_int = (y_test == 'functional').astype(int)

y_train_int.shape, y_test_int.shape

((44550,), (14850,))

In [60]:
#Initial model with only numeric features and NO catigorical features

regr = RandomForestRegressor(
    n_estimators=100, max_depth=10, min_samples_split=2, bootstrap=True,
    n_jobs=-1)
regr.fit(X_train_numeric, y_train_int)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [61]:
regr.score(X_test_numeric, y_test_int)

0.24568643189093242

In [62]:
X_train.describe(exclude='number').T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq
recorded_by,44550,1,GeoData Consultants Ltd,44550
public_meeting,42098,2,True,38332
permit,42247,2,True,29136
source_class,44550,3,groundwater,34389
management_group,44550,5,user-group,39384
quantity_group,44550,5,enough,24915
quantity,44550,5,enough,24915
waterpoint_type_group,44550,6,communal standpipe,25993
quality_group,44550,6,good,38107
payment_type,44550,7,never pay,19035


In [78]:
#Encode and test catigorical features

num_feats = X_train.select_dtypes('number').columns.drop('id').tolist()
catigor_feats = ['extraction_type_class']

features = num_feats + catigor_feats

X_train_subset = X_train[features]
X_test_subset = X_test[features]

encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_encoded = encoder.fit_transform(X_train_subset)
X_test_encoded = encoder.transform(X_test_subset)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


In [79]:
regr = regr = RandomForestRegressor(
    n_estimators=100, max_depth=10, min_samples_split=2, bootstrap=True,
    n_jobs=-1)

regr.fit(X_train_scaled, y_train_int)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [80]:
#This model did not work very well
regr.score(X_test_scaled, y_test_int)

0.2875912605628729

In [82]:

clf = RandomForestClassifier(n_estimators=110, max_depth=19, random_state=3)

clf.fit(X_train_scaled, y_train_int)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=110,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)

In [83]:
clf.score(X_test_scaled, y_test_int)

0.7741414141414141