# DSPT2 Predictive Modeling Challenge

Ross Glasmann

In [84]:
#imports
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
from sklearn.preprocessing import StandardScaler


In [85]:
#Reading in data
this_dir = os.getcwd()

train_features_relpath = '/data/train_features.csv'
train_labels_relpath = '/data/train_labels.csv'
test_features_relpath = '/data/test_features.csv'
sample_submission_relpath = '/data/sample_submission.csv'

train_features_path = this_dir + train_features_relpath
train_labels_path = this_dir + train_labels_relpath
test_features_path = this_dir + test_features_relpath
sample_submission_path = this_dir + sample_submission_relpath


train_features = pd.read_csv(train_features_path)
train_labels = pd.read_csv(train_labels_path)
test_features = pd.read_csv(test_features_path)
sample_submission = pd.read_csv(sample_submission_path)

In [86]:
#Verify the data is correct
assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)

In [87]:
train_features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [88]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [89]:
y_train = train_labels['status_group']
y_train.value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [90]:
#Baseline Accuracy Score

baseline_majority_class = y_train.mode()[0]
baseline_y_pred = [baseline_majority_class] * len(y_train)

accuracy_score(y_train, baseline_y_pred)

0.543080808080808

In [91]:
# Train, test split for training and validation before submission

X = train_features
y = train_labels['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, stratify=y)

In [92]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((47520, 40), (11880, 40), (47520,), (11880,))

In [93]:
#verify training and test data is distibuted evenly
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

functional                 0.543077
non functional             0.384238
functional needs repair    0.072685
Name: status_group, dtype: float64
functional                 0.543098
non functional             0.384259
functional needs repair    0.072643
Name: status_group, dtype: float64


In [94]:
# filter out numeric features to test an initial model

X_train_numeric = X_train[X_train.select_dtypes('number').columns.drop('id').tolist()]
X_test_numeric = X_test[X_test.select_dtypes('number').columns.drop('id').tolist()]

In [95]:
#convert y train labels into ints 

y_train_int = (y_train == 'functional').astype(int)
y_test_int = (y_test == 'functional').astype(int)

y_train_int.shape, y_test_int.shape

((47520,), (11880,))

In [96]:
#Initial model with only numeric features and NO catigorical features

regr = RandomForestRegressor(
    n_estimators=100, max_depth=10, min_samples_split=2, bootstrap=True,
    n_jobs=-1)
regr.fit(X_train_numeric, y_train_int)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [97]:
regr.score(X_test_numeric, y_test_int)

0.2565959900537833

In [98]:
X_train.describe(exclude='number').T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq
recorded_by,47520,1,GeoData Consultants Ltd,47520
public_meeting,44825,2,True,40827
permit,45064,2,True,31066
source_class,47520,3,groundwater,36616
management_group,47520,5,user-group,41996
quantity_group,47520,5,enough,26467
quantity,47520,5,enough,26467
waterpoint_type_group,47520,6,communal standpipe,27715
quality_group,47520,6,good,40652
payment_type,47520,7,never pay,20258


In [99]:
#Encode and test catigorical features

num_feats = X_train.select_dtypes('number').columns.drop('id').tolist()
catigor_feats = ['extraction_type_class']

features = num_feats + catigor_feats

X_train_subset = X_train[features]
X_test_subset = X_test[features]

encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_encoded = encoder.fit_transform(X_train_subset)
X_test_encoded = encoder.transform(X_test_subset)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


In [100]:
regr = regr = RandomForestRegressor(
    n_estimators=100, max_depth=10, min_samples_split=2, bootstrap=True,
    n_jobs=-1)

regr.fit(X_train_scaled, y_train_int)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [101]:
#This model did not work very well
regr.score(X_test_scaled, y_test_int)

0.2997502540650574

In [102]:

clf = RandomForestClassifier(n_estimators=110, max_depth=19, random_state=3)

clf.fit(X_train_scaled, y_train_int)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=110,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)

In [103]:
clf.score(X_test_scaled, y_test_int)

0.7836700336700336

In [104]:
X_test_subset = test_features[features]
X_test_encoded = encoder.transform(X_test_subset)
X_test_scaled = scaler.transform(X_test_encoded)
assert all(X_test_encoded.columns == X_train_encoded.columns)

In [105]:
y_pred = clf.predict(X_test_scaled)

In [113]:
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.replace({1: 'functional', 0: 'non functional'}, inplace=True)

In [114]:
y_pred_df

Unnamed: 0,0
0,non functional
1,functional
2,functional
3,non functional
4,functional
...,...
14353,non functional
14354,functional
14355,functional
14356,functional


In [118]:
y_pred = y_pred_df[0].tolist()

In [122]:
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('RG_Submission_kaggle_02.csv', index=False)

In [123]:
!head RG_Submission_kaggle_02.csv

id,status_group
50785,non functional
51630,functional
17168,functional
45559,non functional
49871,functional
52449,functional
24806,functional
28965,non functional
36301,non functional
