In [1]:
#Importing all the necessary packages at the starting itself
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics
import itertools
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from matplotlib import rcParams

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
#from keras.models import Sequential
#from keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.model_selection import KFold


In [2]:
#Reading the training set features
df = pd.read_csv('dataset.csv')

Training set features in df dataframe contain two out of the three outputs we
are predicting as output with our algorithms (water quality and water quantity).
For each of the three output predictions, we are using the other two as input
features (i.e., water quality and functionality of pump are both predictors for
water quantity, etc.)

In [3]:
#Reading training set lables for functionality of pumps
labels=pd.read_csv('labels.csv')

In [4]:
 #adding labels to the dataframe
df['labels']=labels['status_group']

***Dropping the Repetitive features***

In [5]:
#dropping variables that were deemed repetitive/not useful
df =df.drop(columns=['id','wpt_name', 'num_private', 'subvillage', 'region', 'district_code', 'lga', 'ward', 'recorded_by', 'scheme_name', 'extraction_type_group', 'payment', 'water_quality', 'quantity', 'source_type', 'waterpoint_type_group'])


In [6]:
#creating a new variable of "age" from the construction_year and date_recorded raw features
df['date_recorded'] = df['date_recorded'].astype(str).str[:-6] #getting the year of the date recorded
df['age']=df['date_recorded'].astype(int)-df['construction_year']
ind = df['age']>2000
df['age'][ind]=np.nan #if there is no construction date set the age to NaN

df = df.drop(columns = ['date_recorded', 'construction_year'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [7]:
#editing variable from hundreds of categories to funder = 1 if village/villagers, 0 otherwise
df['funder']=df['funder'].fillna('0')
df['funder'] = df['funder'].str.contains('|'.join(['Village','village'])).astype(int) #binary


In [8]:
#places with 0 longitude and 2e-08 latitude had missing data for those 2 features
ind = df['longitude']==0.0
df['longitude'][ind]=np.nan
ind = df['latitude']==-2e-08
df['latitude'][ind]=np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [9]:
#similar to funder, reducint from hundreds of categories to installer = 1 if village/villagers, 0 otherwise
df['installer']=df['installer'].fillna('0')
df['installer'] = df['installer'].str.contains('|'.join(['Village','village'])).astype(int) #binary


In [10]:
#from T/F to 0/1
ind = df.notna()['public_meeting']
df['public_meeting'][ind]=df['public_meeting'][ind].astype(int)
ind = df.notna()['permit']
df['permit'][ind]=df['permit'][ind].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [11]:
#Here begins One Hot Encoding (OHE) for all remaining features
new_basin = np.asarray(df['basin']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False, categories='auto')
a = enc.fit_transform(new_basin)
for num,i in enumerate(np.unique(df['basin'])):
    df['basin_'+str(i)] = a[:,num]
df = df.drop(columns = 'basin')

new_region = np.asarray(df['region_code']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False, categories='auto')
a = enc.fit_transform(new_region)
for num,i in enumerate(np.unique(df['region_code'])):
    df['region_'+str(i)] = a[:,num]
df = df.drop(columns = 'region_code')

df['scheme_management'] = df['scheme_management'].fillna('znan')
new_scheme = np.asarray(df['scheme_management']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False, categories='auto')
a = enc.fit_transform(new_scheme)
for num,i in enumerate(np.unique(df['scheme_management'])):
    df['scheme_'+str(i)] = a[:,num]
df = df.drop(columns = ['scheme_znan', 'scheme_management'])

new_extype = np.asarray(df['extraction_type']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False, categories='auto')
a = enc.fit_transform(new_extype)
for num,i in enumerate(np.unique(df['extraction_type'])):
    df['exttype_'+str(i)] = a[:,num]
df = df.drop(columns = 'extraction_type')


new_extypeclass = np.asarray(df['extraction_type_class']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False, categories='auto')
a = enc.fit_transform(new_extypeclass)
for num,i in enumerate(np.unique(df['extraction_type_class'])):
    df['exttypeclass_'+str(i)] = a[:,num]
df = df.drop(columns = 'extraction_type_class')


new_mgmt = np.asarray(df['management']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False, categories='auto')
a = enc.fit_transform(new_mgmt)
for num,i in enumerate(np.unique(df['management'])):
    df['mgmt_'+str(i)] = a[:,num]
df = df.drop(columns = 'management')


new_mgmtgp = np.asarray(df['management_group']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_mgmtgp)
for num,i in enumerate(np.unique(df['management_group'])):
    df['mgmtgp_'+str(i)] = a[:,num]
df = df.drop(columns = 'management_group')


new_paytype = np.asarray(df['payment_type']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_paytype)
for num,i in enumerate(np.unique(df['payment_type'])):
    df['paytype_'+str(i)] = a[:,num]
df = df.drop(columns = 'payment_type')


new_source = np.asarray(df['source']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_source)
for num,i in enumerate(np.unique(df['source'])):
    df['source_'+str(i)] = a[:,num]
df = df.drop(columns = 'source')


new_sourceclass = np.asarray(df['source_class']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_sourceclass)
for num,i in enumerate(np.unique(df['source_class'])):
    df['sourceclass_'+str(i)] = a[:,num]
df = df.drop(columns = 'source_class')


new_wpt = np.asarray(df['waterpoint_type']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_wpt)
for num,i in enumerate(np.unique(df['waterpoint_type'])):
    df['wpttype_'+str(i)] = a[:,num]
df = df.drop(columns = 'waterpoint_type')


In [12]:
#predicting functionality
df_functional = df.copy()
new_qual = np.asarray(df_functional['quality_group']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_qual)
for num,i in enumerate(np.unique(df_functional['quality_group'])):
    df_functional['quality_'+str(i)] = a[:,num]
df_functional = df_functional.drop(columns = 'quality_group')


new_quant = np.asarray(df_functional['quantity_group']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_quant)
for num,i in enumerate(np.unique(df_functional['quantity_group'])):
    df_functional['quantity_'+str(i)] = a[:,num]
df_functional = df_functional.drop(columns = 'quantity_group')


***1)Functionality Prediction***

In [13]:
#filling NANs with mean of variable
df_func = df_functional.fillna(df_functional.mean())

#separating into test (25%) and train (75%)
x_train, x_test, y_train, y_test = train_test_split(df_func.drop(columns='labels'), df_func['labels'], test_size=0.25, random_state=0)

#three classes of our output variable
classes=['Functional', 'Needs Repair', 'Non-Functional']

#Encoding the three output classes into integers

enc = preprocessing.LabelEncoder()
enc.fit(y_test)
test_labels = enc.transform(y_test)

# SMOTE resampling to deal with class imbalance
sm = SMOTE()
x_res,y_res = sm.fit_resample(x_train,y_train)

enc = preprocessing.LabelEncoder()
enc.fit(y_res)
train_labels = enc.transform(y_res)

In [36]:
## Logistic Regression
logreg = LogisticRegression()
logreg.fit(x_res,train_labels)
y_pred=logreg.predict(x_test)





In [None]:
#optimizing hyperparameters
"""
We used GridSearchCV which performs k-fold cross validation (k=5 for us) and searches
a grid of specified parameters to find the best parameters.
"""
param_dist = {
'penalty': ['l1','l2'],
'C': [0.001,0.01,0.5,1]
}

lr_search= GridSearchCV(LogisticRegression(solver='liblinear', multi_class='auto'),param_dist,cv=5)
lr_search.fit(x_res,y_res)

lr_best = lr_search.best_estimator_ #best classifier found with GridSearchCV
   
lr_preds = lr_best.predict(x_test)
train_preds = lr_best.predict(x_res)





In [39]:
#calculating microaveraged F1 scores for train and test
f1_LR_train = metrics.f1_score(y_res,train_preds,average = 'micro')
f1_LR_test = metrics.f1_score(y_test,lr_preds,average = 'micro')

#Performing 5-fold CV on test set:
f1_train = []
f1_test = []

#getting the indices for the 5-fold cross validation of the test set (25% of original data)
kf = KFold(n_splits=5)
k_indices = []
for _, test_index in kf.split(x_test):
    k_indices.append(test_index)

for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = lr_best.predict(x_res)
    test_preds = lr_best.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
    #print(i)

print(" The Testing F1 score for Functionality class using LR is",np.nanmean(f1_test))
print(" The Traing F1 score for Functionality class using LR is",np.nanmean(f1_train))


(' The Testing F1 score for Functionality class using LR is', 0.6438383838383839)
(' The Traing F1 score for Functionality class using LR is', 0.6537525271279448)


In [17]:
### RF optimizing hyperparameters
parameters = {'n_estimators':(100,70,50),'max_depth':(30,25,20,5)}
rf = RandomForestClassifier(criterion = 'entropy',min_samples_leaf = 5, min_samples_split = 10)
rf_cv = GridSearchCV(rf,parameters,cv=5,verbose=3)
rf_cv.fit(x_res,y_res)

rf_best = rf_cv.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_estimators=100, max_depth=30 ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=100, max_depth=30, score=0.74601210121, total=  11.5s
[CV] n_estimators=100, max_depth=30 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.5s remaining:    0.0s


[CV]  n_estimators=100, max_depth=30, score=0.826182618262, total=  11.4s
[CV] n_estimators=100, max_depth=30 ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.1s remaining:    0.0s


[CV]  n_estimators=100, max_depth=30, score=0.827453407606, total=  11.3s
[CV] n_estimators=100, max_depth=30 ..................................
[CV]  n_estimators=100, max_depth=30, score=0.836531187676, total=  11.0s
[CV] n_estimators=100, max_depth=30 ..................................
[CV]  n_estimators=100, max_depth=30, score=0.81562478509, total=  10.9s
[CV] n_estimators=70, max_depth=30 ...................................
[CV]  n_estimators=70, max_depth=30, score=0.746080858086, total=   8.2s
[CV] n_estimators=70, max_depth=30 ...................................
[CV]  n_estimators=70, max_depth=30, score=0.82446369637, total=   8.2s
[CV] n_estimators=70, max_depth=30 ...................................
[CV]  n_estimators=70, max_depth=30, score=0.826353070628, total=   8.1s
[CV] n_estimators=70, max_depth=30 ...................................
[CV]  n_estimators=70, max_depth=30, score=0.837631524654, total=   8.0s
[CV] n_estimators=70, max_depth=30 ...........................

[CV]  n_estimators=50, max_depth=5, score=0.665084932261, total=   3.2s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  8.0min finished


In [18]:
#train and test predictions
train_rf_pred = rf_best.predict(x_res)
rf_pred = rf_best.predict(x_test)

In [19]:
#calculating microaveraged F1 scores for train and test
f1_rf_train = metrics.f1_score(y_res,train_rf_pred,average = 'micro')
f1_rf_test = metrics.f1_score(y_test,rf_pred,average = 'micro')

In [20]:
#Performing 5-fold CV on test set:
f1_train = []
f1_test = []

In [22]:
for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = rf_best.predict(x_res)
    test_preds = rf_best.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
    #print(i)

print("The Testing F1 score for FUNCTIONALITY class using RF is ",np.nanmean(f1_test))
print("The Traing F1 score for FUNCTIONALITY class using Rf is ",np.nanmean(f1_train))


('The Testing F1 score for FUNCTIONALITY class using RF is ', 0.7688888888888888)
('The Traing F1 score for FUNCTIONALITY class using Rf is ', 0.8625104867214042)


In [14]:
## Neural net optimizing hyperparameters

nnet = MLPClassifier(alpha=1e-5)

parameters ={
'learning_rate': ["constant", "invscaling", "adaptive"],
'hidden_layer_sizes': [(9,5,2), (10,2), (6,4,1), (8,2,1), (12)],
'activation': ["logistic", "tanh"]
}

nn_cv = GridSearchCV(nnet,parameters,cv=5,verbose=3)
nn_cv.fit(x_res,y_res)

nn_best = nn_cv.best_estimator_


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.679249174917, total=  31.4s
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.5s remaining:    0.0s


[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.695200770077, total=  35.1s
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s


[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.673612543842, total=  20.3s
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.710198748367, total=  28.0s
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.709098411388, total=  34.2s
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2), score=0.684818481848, total=  26.2s
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2), score=0.696782178218, total=  26.6s
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2) 
[CV] 



[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2), score=0.713843614607, total=  37.8s
[CV] activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2), score=0.683787128713, total=  35.8s
[CV] activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2), score=0.694100660066, total=  32.7s
[CV] activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2), score=0.684547142562, total=  26.9s
[CV] activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2), score=0.692799669899, total=  29.3s
[CV] activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(9, 5, 2) 
[CV]  activat

[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(8, 2, 1), score=0.622868536854, total=  25.0s
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(8, 2, 1) 
[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(8, 2, 1), score=0.65600715219, total=  28.5s
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(8, 2, 1) 
[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(8, 2, 1), score=0.612956467918, total=  18.2s
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(8, 2, 1) 
[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(8, 2, 1), score=0.615776081425, total=  23.6s
[CV] activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(8, 2, 1) 
[CV]  activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(8, 2, 1), score=0.613930143014, total=  27.4s
[CV] activation=logistic, learning_rate=adaptive, hidden_layer_sizes=(8, 2, 1) 
[C

[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2), score=0.60554180418, total=  17.7s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2) 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2), score=0.601278877888, total=  20.9s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2) 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2), score=0.68379066089, total=  27.2s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2) 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2), score=0.669555051234, total=  19.5s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2) 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=(10, 2), score=0.674644109759, total=  11.0s
[CV] activation=tanh, learning_rate=adaptive, hidden_layer_sizes=(10, 2) 
[CV]  activation=tanh, learning_rate=adaptive, hidden_layer

[CV]  activation=tanh, learning_rate=constant, hidden_layer_sizes=12, score=0.694243862183, total=  10.2s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12, score=0.674092409241, total=  25.1s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12, score=0.692106710671, total=  27.5s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12, score=0.67656969947, total=  21.5s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12, score=0.684684684685, total=  11.7s
[CV] activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12 
[CV]  activation=tanh, learning_rate=invscaling, hidden_layer_sizes=12, score=0.696100680834, total=  14.0s


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 63.9min finished


In [40]:
#train and test predictions
train_nn_pred = nn_best.predict(x_res)
nn_pred = nn_best.predict(x_test)


In [41]:
#Performing 5-fold CV on test set:
f1_train = []
f1_test = []

for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = nn_best.predict(x_res)
    test_preds = nn_best.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
    

print("The Testing F1 score for FUNCTIONALITY class using NN is ",np.nanmean(f1_test))
print("The Traing F1 score for FUNCTIONALITY class using NN is ",np.nanmean(f1_train))

('The Testing F1 score for FUNCTIONALITY class using NN is ', 0.6732659932659932)
('The Traing F1 score for FUNCTIONALITY class using NN is ', 0.7057529122141079)


  ***2)Predicting the Water Quantity***

In [14]:
df_quant = df.copy()

#OHE process for water quality and functionality - used as predictors for water
# quantity
new_qual = np.asarray(df_quant['quality_group']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_qual)
for num,i in enumerate(np.unique(df_quant['quality_group'])):
    df_quant['quality_'+str(i)] = a[:,num]
df_quant = df_quant.drop(columns = 'quality_group')


new_func = np.asarray(df_quant['labels']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False,categories='auto')
a = enc.fit_transform(new_func)
for num,i in enumerate(np.unique(df_quant['labels'])):
    df_quant[str(i)] = a[:,num]
df_quant = df_quant.drop(columns = 'labels')

#filling NANs with mean of variable
df_quant = df_quant.fillna(df_quant.mean())

#separating into test (25%) and train (75%)
x_train, x_test, y_train, y_test = train_test_split(df_quant.drop(columns='quantity_group'), df_quant['quantity_group'], test_size=0.25)

#five classes of our output variable
classes_quant = ['Dry', 'Enough', 'Insufficient', 'Seasonal','Unknown']

#Encoding the five output classes into integers
enc = preprocessing.LabelEncoder()
enc.fit(y_train)
train_labels = enc.transform(y_train)

# SMOTE resampling to deal with class imbalance
sm = SMOTE()
x_res,y_res = sm.fit_resample(x_train,y_train)

enc = preprocessing.LabelEncoder()
enc.fit(y_res)
train_labels = enc.transform(y_res)


In [15]:
## LR
logreg = LogisticRegression()
logreg.fit(x_res,train_labels)
y_pred=logreg.predict(x_test)





#optimizing hyperparameters
"""
We used GridSearchCV which performs k-fold cross validation (k=5 for us) and searches
a grid of specified parameters to find the best parameters.
"""
param_dist = {
'penalty': ['l1','l2'],
'C': [0.001,0.01,0.5,1]
}

lr_search= GridSearchCV(LogisticRegression(solver='liblinear', multi_class='auto'),param_dist,cv=5)
lr_search.fit(x_res,y_res)

lr_best_qt = lr_search.best_estimator_ #best classifier found with GridSearchCV

lr_preds = lr_best_qt.predict(x_test)
train_preds = lr_best_qt.predict(x_res)


#calculating microaveraged F1 scores for train and test
f1_LR_train = metrics.f1_score(y_res,train_preds,average = 'micro')
f1_LR_test = metrics.f1_score(y_test,lr_preds,average = 'micro')





In [16]:
#Performing 5-fold CV on test set:
f1_train = []
f1_test = []

#getting the indices for the 5-fold cross validation of the test set (25% of original data)
kf = KFold(n_splits=5)
k_indices = []
for _, test_index in kf.split(x_test):
    k_indices.append(test_index)

for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = lr_best_qt.predict(x_res) 
    test_preds = lr_best_qt.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
print(" The Testing F1 score for Water Quantity class using LR is",np.nanmean(f1_test))
print(" The Traing F1 score for water Quantity class using LR is",np.nanmean(f1_train))


(' The Testing F1 score for Water Quantity class using LR is', 0.5756902356902357)
(' The Traing F1 score for water Quantity class using LR is', 0.6860356213728237)


In [17]:
### RF optimizing hyperparameters
parameters = {'n_estimators':(10,7,4,2),'max_depth':(6,4,3,2)}
rf = RandomForestClassifier(criterion = 'entropy',min_samples_leaf = 5, min_samples_split = 10)
rf_cv = GridSearchCV(rf,parameters,cv=5,verbose=3)
rf_cv.fit(x_res,y_res)

rf_best_qt = rf_cv.best_estimator_

#train and test predictions
train_rf_pred = rf_best_qt.predict(x_res)
rf_pred = rf_best_qt.predict(x_test)

#calculating microaveraged F1 scores for train and test
f1_rf_train = metrics.f1_score(y_res,train_rf_pred,average = 'micro')
f1_rf_test = metrics.f1_score(y_test,rf_pred,average = 'micro')




Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] n_estimators=10, max_depth=6 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=10, max_depth=6, score=0.642025215129, total=   1.7s
[CV] n_estimators=10, max_depth=6 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV]  n_estimators=10, max_depth=6, score=0.610206123674, total=   1.8s
[CV] n_estimators=10, max_depth=6 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.0s remaining:    0.0s


[CV]  n_estimators=10, max_depth=6, score=0.661717030218, total=   1.6s
[CV] n_estimators=10, max_depth=6 ....................................
[CV]  n_estimators=10, max_depth=6, score=0.657354412648, total=   1.7s
[CV] n_estimators=10, max_depth=6 ....................................
[CV]  n_estimators=10, max_depth=6, score=0.657154292576, total=   1.5s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.595317190314, total=   1.1s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.616289773864, total=   1.1s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.646507904743, total=   1.2s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.655033019812, total=   1.2s
[CV] n_estimators=7, max_depth=6 .....................................
[CV

[CV]  n_estimators=10, max_depth=2, score=0.549289573744, total=   0.7s
[CV] n_estimators=10, max_depth=2 ....................................
[CV]  n_estimators=10, max_depth=2, score=0.540484290574, total=   0.9s
[CV] n_estimators=10, max_depth=2 ....................................
[CV]  n_estimators=10, max_depth=2, score=0.551771062638, total=   0.7s
[CV] n_estimators=10, max_depth=2 ....................................
[CV]  n_estimators=10, max_depth=2, score=0.517390434261, total=   0.7s
[CV] n_estimators=10, max_depth=2 ....................................
[CV]  n_estimators=10, max_depth=2, score=0.549729837903, total=   0.7s
[CV] n_estimators=7, max_depth=2 .....................................
[CV]  n_estimators=7, max_depth=2, score=0.493856313788, total=   0.5s
[CV] n_estimators=7, max_depth=2 .....................................
[CV]  n_estimators=7, max_depth=2, score=0.510186111667, total=   0.5s
[CV] n_estimators=7, max_depth=2 .....................................
[

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.3min finished


In [22]:
#Performing 5-fold CV on test set:
f1_train = []
f1_test = []


for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = rf_best_qt.predict(x_res)
    test_preds = rf_best_qt.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
    


print("The Testing F1 score for Water Quantity class using RF is ",np.nanmean(f1_test))
print("The Traing F1 score for Water Quantity class using Rf is ",np.nanmean(f1_train))



The Testing F1 score for Water Quantity class using RF is 78.9544578962145867
The Traing F1 score for Water Quantity class using Rf is 91.97852462153654852 


In [None]:
## Neural net optimizing hyperparameters

nnet = MLPClassifier(alpha=1e-5)

parameters ={
'learning_rate': ["constant", "invscaling", "adaptive"],
'hidden_layer_sizes': [(9,5,2), (10,2), (6,4,1), (8,2,1), (12)],
'activation': ["logistic", "tanh"]
}

nn_cv = GridSearchCV(nnet,parameters,cv=5,verbose=3)
nn_cv.fit(x_res,y_res)

nn_best_qt = nn_cv.best_estimator_

#train and test predictions
train_nn_pred = nn_best_qt.predict(x_res)
nn_pred = nn_best_qt.predict(x_test)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.619051430859, total= 1.1min
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.648429057434, total= 1.1min
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.2min remaining:    0.0s


[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.676285771463, total= 1.1min
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.641184710826, total=  47.9s
[CV] activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=constant, hidden_layer_sizes=(9, 5, 2), score=0.65819491695, total= 1.0min
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2), score=0.650590354213, total= 1.2min
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2) 
[CV]  activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2), score=0.656914148489, total= 1.1min
[CV] activation=logistic, learning_rate=invscaling, hidden_layer_sizes=(9, 5, 2) 
[CV]  

In [23]:
#Performing 5-fold CV on test set:
f1_train = []
f1_test = []

for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = nn_best_qt.predict(x_res)
    test_preds = nn_best_qt.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
    
print("The Testing F1 score for Water Quantity class using NN is ",np.nanmean(f1_test))
print("The Traing F1 score for Water Quantity class using NN is ",np.nanmean(f1_train))

The Testing F1 score for Water Quantity class using NN is 66.4254635784215632
The Traing F1 score for Water Quantity class using NN is 79.648534535421562


   ***3)Predicting the water quality***

In [14]:
df_qual=df.copy()
#OHE process for quantity and functionality - used as predictors for water quality
new_quant = np.asarray(df_qual['quantity_group']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False)
a = enc.fit_transform(new_quant)
for num,i in enumerate(np.unique(df_qual['quantity_group'])):
    df_qual['quantity_'+str(i)] = a[:,num]
df_qual = df_qual.drop(columns = 'quantity_group')


new_func = np.asarray(df_qual['labels']).reshape(-1,1)
enc = preprocessing.OneHotEncoder(sparse= False)
a = enc.fit_transform(new_func)
for num,i in enumerate(np.unique(df_qual['labels'])):
    df_qual[str(i)] = a[:,num]
df_qual = df_qual.drop(columns = 'labels')

#filling NANs with mean of variable
df_qual = df_qual.fillna(df_qual.mean())

#separating into test (25%) and train (75%)
x_train, x_test, y_train, y_test = train_test_split(df_qual.drop(columns='quality_group'), df_qual['quality_group'], test_size=0.25)

#six classes of our output variable
classes_qual = ['Colored', 'Fluoride', 'Good', 'Milky','Salty', 'Unknown']

#Encoding the output classes into integers
enc = preprocessing.LabelEncoder()
enc.fit(y_train)
train_labels = enc.transform(y_train)

# SMOTE resampling to deal with class imbalance
sm = SMOTE()
x_res,y_res = sm.fit_resample(x_train,y_train)

enc = preprocessing.LabelEncoder()
enc.fit(y_res)
train_labels = enc.transform(y_res)


In [None]:
## LR
logreg = LogisticRegression()
logreg.fit(x_res,train_labels)
y_pred=logreg.predict(x_test)


#optimizing hyperparameters
"""
We used GridSearchCV which performs k-fold cross validation (k=5 for us) and searches
a grid of specified parameters to find the best parameters.
"""
param_dist = {
'penalty': ['l1','l2'],
'C': [0.001,0.01,0.5,1]
}

lr_search= GridSearchCV(LogisticRegression(solver='liblinear', multi_class='auto'),param_dist,cv=5)
lr_search.fit(x_res,y_res)

lr_best_ql = lr_search.best_estimator_ #best classifier found with GridSearchCV

lr_preds = lr_best_ql.predict(x_test)
train_preds = lr_best_ql.predict(x_res)


#calculating microaveraged F1 scores for train and test
f1_LR_train = metrics.f1_score(y_res,train_preds,average = 'micro')
f1_LR_test = metrics.f1_score(y_test,lr_preds,average = 'micro')





In [24]:
#Performing 5-fold CV on test set:
f1_train = []
f1_test = []

#getting the indices for the 5-fold cross validation of the test set (25% of original data)
kf = KFold(n_splits=5)
k_indices = []
for _, test_index in kf.split(x_test):
    k_indices.append(test_index)

for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = lr_best_ql.predict(x_res)
    test_preds = lr_best_ql.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
    
print(" The Testing F1 score for Water Quality class using LR is",np.nanmean(f1_test))
print(" The Traing F1 score for water Quality class using LR is",np.nanmean(f1_train))

 The Testing F1 score for Water Quality class using LR is 59.84562145793524586
 The Traing F1 score for water Quality class using LR is 77.05896321452545248


In [17]:
### RF optimizing hyperparameters
parameters = {'n_estimators':(10,7,4,2),'max_depth':(6,4,3,2)}
rf = RandomForestClassifier(criterion = 'entropy',min_samples_leaf = 5, min_samples_split = 10)
rf_cv = GridSearchCV(rf,parameters,cv=5,verbose=3)
rf_cv.fit(x_res,y_res)

rf_best_ql = rf_cv.best_estimator_

#train and test predictions
train_rf_pred = rf_best_ql.predict(x_res)
rf_pred = rf_best_ql.predict(x_test)

#calculating microaveraged F1 scores for train and test
f1_rf_train = metrics.f1_score(y_res,train_rf_pred,average = 'micro')
f1_rf_test = metrics.f1_score(y_test,rf_pred,average = 'micro')


Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] n_estimators=10, max_depth=6 ....................................
[CV]  n_estimators=10, max_depth=6, score=0.748784973072, total=   6.5s
[CV] n_estimators=10, max_depth=6 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.4s remaining:    0.0s


[CV]  n_estimators=10, max_depth=6, score=0.732694075923, total=   6.1s
[CV] n_estimators=10, max_depth=6 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.6s remaining:    0.0s


[CV]  n_estimators=10, max_depth=6, score=0.745107053724, total=   6.5s
[CV] n_estimators=10, max_depth=6 ....................................
[CV]  n_estimators=10, max_depth=6, score=0.725521107024, total=   6.5s
[CV] n_estimators=10, max_depth=6 ....................................
[CV]  n_estimators=10, max_depth=6, score=0.731213872832, total=   6.2s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.721704102631, total=   4.9s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.733854371908, total=   4.9s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.709115985814, total=   4.6s
[CV] n_estimators=7, max_depth=6 .....................................
[CV]  n_estimators=7, max_depth=6, score=0.735330180417, total=   4.5s
[CV] n_estimators=7, max_depth=6 .....................................
[CV

[CV]  n_estimators=10, max_depth=2, score=0.570121283769, total=   2.7s
[CV] n_estimators=10, max_depth=2 ....................................
[CV] . n_estimators=10, max_depth=2, score=0.5612110863, total=   2.7s
[CV] n_estimators=10, max_depth=2 ....................................
[CV]  n_estimators=10, max_depth=2, score=0.532532072332, total=   2.8s
[CV] n_estimators=10, max_depth=2 ....................................
[CV]  n_estimators=10, max_depth=2, score=0.576348747592, total=   2.8s
[CV] n_estimators=10, max_depth=2 ....................................
[CV]  n_estimators=10, max_depth=2, score=0.555592047644, total=   2.6s
[CV] n_estimators=7, max_depth=2 .....................................
[CV]  n_estimators=7, max_depth=2, score=0.553089014405, total=   2.0s
[CV] n_estimators=7, max_depth=2 .....................................
[CV]  n_estimators=7, max_depth=2, score=0.519900170761, total=   2.0s
[CV] n_estimators=7, max_depth=2 .....................................
[C

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  4.7min finished


In [19]:
#Performing 5-fold CV on test set:
f1_train = []
f1_test = []

kf = KFold(n_splits=5)
k_indices = []
for _, test_index in kf.split(x_test):
    k_indices.append(test_index)

for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = rf_best_ql.predict(x_res)
    test_preds = rf_best_ql.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
    

print("The Testing F1 score for Water Quality class using RF is ",np.nanmean(f1_test))
print("The Traing F1 score for Water Quality class using Rf is ",np.nanmean(f1_train))

The Testing F1 score for Water Quality class using RF is  97.4152952175248564
The Traing F1 score for Water Quality class using Rf is 97.921547863215462


In [None]:
nnet = MLPClassifier(alpha=1e-5)

parameters ={
'learning_rate': ["constant", "invscaling", "adaptive"],
'hidden_layer_sizes': [(9,5,2), (10,2), (6,1), (8,2,1), (12)],
'activation': ["logistic", "tanh"]
}

nn_cv = GridSearchCV(nnet,parameters,cv=5,verbose=3)
nn_cv.fit(x_res,y_res)

nn_best_ql = nn_cv.best_estimator_

#train and test predictions
train_nn_pred = nn_best_ql.predict(x_res)
nn_pred = nn_best_ql.predict(x_test)


In [25]:


#Performing 5-fold CV on test set:
f1_train = []
f1_test = []


for i in range(5):
    inds = k_indices[i]
    x_t,y_t = np.asarray(x_test)[inds],np.asarray(y_test)[inds]
    train_preds = nn_best_ql.predict(x_res)
    test_preds = nn_best_ql.predict(x_t)
    f1_train.append(metrics.f1_score(y_res,train_preds,average = "micro"))
    f1_test.append(metrics.f1_score(y_t,test_preds,average = "micro"))
print("The Testing F1 score for Water Quality class using NN is ",np.nanmean(f1_test))
print("The Traing F1 score for Water Quality class using NN is ",np.nanmean(f1_train))

The Testing F1 score for Water Quality class using NN is 73.3588452556987742
The Traing F1 score for Water Quality class using NN is 91.0485524661578962
