In [None]:
#Google TensorFlow
#Uses TensorFlow to build a neural network and then uses that network to predict
#whether a water sample is potable or non-potable
#%pip install -U imbalanced-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import tensorflow_hub as hub
from imblearn.over_sampling import RandomOverSampler

# Load the water potability dataset.
data = pd.read_csv('water_potability.csv')
data.head(10)
# Number of missing values in each column.
data.isna().sum()

data.shape
# The same missing-value counts, expressed as a percentage of all rows.
data.isna().sum() * 100 / len(data)

# Measure how much data would be lost by dropping every incomplete row.
data_dropped = data.dropna()
pct_rows_kept = round(len(data_dropped) / len(data) * 100)
print("The total missing value accounts", 100 - pct_rows_kept, "% of the total data")
# Dropping every row that has a missing value may discard too much data.
# Keep exploring the dataset first and deal with the missing values afterwards.

# The three columns whose medians are compared per class (they are the ones
# imputed further down).
cols_to_impute = ['ph', 'Sulfate', 'Trihalomethanes']

# Summary statistics and medians for non-potable samples (Potability == 0).
non_potable = data.loc[data['Potability'] == 0]
non_potable.describe()
non_potable[cols_to_impute].median()
# Observation: for non-potable water, mean and median are close to each other.

# Same inspection for potable samples (Potability == 1).
potable = data.loc[data['Potability'] == 1]
potable.describe()

potable[cols_to_impute].median()
# Observation: for potable water, mean and median are also close to each other.

# Impute missing values with each feature's overall median.
# The result is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`: that form is a chained in-place
# update on a column selection, which pandas deprecates and which no longer
# modifies `data` once Copy-on-Write is enabled.
# NOTE: the medians are computed on the full dataset, i.e. before any
# train/test split is made.
for col in ['ph', 'Sulfate', 'Trihalomethanes']:
    data[col] = data[col].fillna(data[col].median())

# Sanity checks after imputation: dtypes, remaining nulls, summary statistics.
data.info()
data.isna().sum()
data.describe()
# Share of each class of the target variable 'Potability', in percent.
data['Potability'].value_counts(normalize=True) * 100
data.columns
# Absolute number of rows with Potability == 1 and Potability == 0.
len(data.loc[data['Potability'] == 1]), len(data.loc[data['Potability'] == 0])
# Correlation of every column with the target, largest value first.
correlation = data.corr()
potability_corr = correlation['Potability'].sort_values(ascending=False)
print(potability_corr, '\n')

# Histogram of the target column, to visualise how the two classes are balanced.
trace = go.Histogram(
    x=data['Potability'],
    opacity=0.75,
    name="Water Quality",
    marker={'color': 'blue'},
)
hist_data = [trace]
hist_layout = go.Layout(
    barmode='overlay',
    bargap=0.75,
    title='Water Quality',
    xaxis={'title': 'Not potable & Potable'},
    yaxis={'title': 'Frequency'},
)
fig = go.Figure(data=hist_data, layout=hist_layout)
iplot(fig)

# Features are every column except the last; the target is the last column
# ('Potability').
X_raw = data[data.columns[:-1]].values
y = data[data.columns[-1]].values

X_raw.shape, y.shape

# Split BEFORE scaling and oversampling (60% train / 20% validation / 20% test,
# stratified on the target). Fitting the scaler or duplicating minority rows on
# the full dataset leaks information about the held-out rows into training: an
# oversampled copy of a row can end up in both the training and the test set,
# which inflates the reported accuracy.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_raw, y, test_size=0.4, random_state=0, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=0, stratify=y_temp)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# Scaled copy of the complete, un-resampled dataset, kept under the names
# X / y for any cell that works on all samples.
X = scaler.transform(X_raw)
X[:5]

#RANDOM OVER SAMPLER
# Balance the classes in the TRAINING set only; validation and test keep their
# natural class distribution. random_state makes the resampling repeatable.
over = RandomOverSampler(random_state=0)
X_train, y_train = over.fit_resample(X_train, y_train)
dp = np.hstack((X_train, np.reshape(y_train, (-1, 1))))
transformed_data = pd.DataFrame(dp, columns=data.columns)

# Class counts in the resampled training set.
len(transformed_data[transformed_data["Potability"]==1]), len(transformed_data[transformed_data["Potability"]==0])

# Small fully connected binary classifier: two hidden ReLU layers of 16 units
# and a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(16, activation='relu'))  # relu(x) = max(0, x)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

adam = tf.keras.optimizers.Adam(learning_rate=0.001)
bce_loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=adam, loss=bce_loss, metrics=['accuracy'])

# Score the network on train / validation data before any training has run.
model.evaluate(X_train, y_train)
model.evaluate(X_valid, y_valid)

validation_pair = (X_valid, y_valid)
model.fit(X_train, y_train, batch_size=18, epochs=20, validation_data=validation_pair)

# Score the trained network on the test data.
print("Evaluate on test data")
nn_results = model.evaluate(X_test, y_test)
print("test loss, test acc:", nn_results)
# Index 1 is the 'accuracy' metric requested in compile(); index 0 is the loss.
nn_ac = nn_results[1]

#RANDOM FOREST MODEL
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

# Train and score on the SAME train / test partitions the neural network used
# (X_train, y_train, X_test, y_test from the split above). Re-splitting X, y
# here with test_size=0.6 fitted the forest on only 40% of the rows and scored
# it on a different test set than the network, so the accuracies gathered for
# the model comparison were not measured on the same data.
rf_classificationModel = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, random_state=0)
# fit() returns the estimator itself, so its return value is not stored.
rf_classificationModel.fit(X_train, y_train)
Y_pred = rf_classificationModel.predict(X_test)

RmFor_acc = accuracy_score(y_test, Y_pred)
print('Accuracy Score', RmFor_acc)

# Mean squared error between true and predicted labels; for 0/1 class labels
# this equals the fraction of misclassified samples.
MSE = mean_squared_error(y_test, Y_pred)

# Root mean squared error.
RMSE = np.sqrt(MSE)
print('mean squared error is : ', MSE)
print('root mean squared error is : ', RMSE)

# Classification report for the test data and predictions.
print(classification_report(y_test, Y_pred))


#SVM
from sklearn.svm import SVC, LinearSVC
# classification_report is imported here as well so this cell does not depend
# on an import made by another cell.
from sklearn.metrics import accuracy_score, classification_report

# RBF-kernel support vector classifier, trained and scored on the current
# train / test split.
SVM = SVC(kernel='rbf', random_state=0)
SVM.fit(X_train, y_train)
SVM_pred = SVM.predict(X_test)

SVM_acc = accuracy_score(y_test, SVM_pred)
print("The accuracy for SVM is", SVM_acc)
# Print the report itself under its heading; previously the accuracy value was
# printed a second time next to this heading.
print("The classification report using SVM is:")
print(classification_report(y_test, SVM_pred))


# Gather the accuracy of each model into one table.
accuracy_by_model = {
    'Neural Network': nn_ac,
    'Random Forest': RmFor_acc,
    'SVM': SVM_acc,
}
models = pd.DataFrame({
    'Model': list(accuracy_by_model.keys()),
    'Accuracy': list(accuracy_by_model.values()),
})
models.sort_values(by='Accuracy', ascending=False)

# Bar chart of the accuracies, bars ordered from lowest to highest accuracy.
plt.figure(figsize=(10, 5))
bar_order = models.sort_values("Accuracy")["Model"]
sns.barplot(data=models, x='Model', y='Accuracy', order=bar_order, palette='Blues_d')



