In [95]:
import pandas as pd
import math

In [96]:
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [97]:
def calculate_mean(numbers) -> float:
    """Return the arithmetic mean of ``numbers``.

    ``numbers`` must be iterable and support ``len()``. An empty input
    raises ``ZeroDivisionError``.

    Example: ``calculate_mean([1, 2, 3, 4, 5])`` is ``(1+2+3+4+5)/5 == 3.0``.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [98]:
# Sanity check: (1 + 2 + 3 + 4 + 5) / 5 should display 3.0.
calculate_mean([1, 2, 3, 4, 5])

3.0

In [99]:
def calculate_variance(numbers) -> float:
    """Return the unbiased sample variance of ``numbers``.

    The squared deviations from ``calculate_mean(numbers)`` are summed and
    divided by ``n - 1`` (Bessel's correction).

    Args:
        numbers: iterable of numeric values that also supports ``len()``.

    Returns:
        The sample variance.

    Raises:
        ValueError: if fewer than two values are supplied. With a single
            value the ``n - 1`` divisor is zero; for numpy scalars that
            division yields ``nan`` instead of raising, so the problem
            would otherwise propagate silently.
    """
    count = len(numbers)
    if count < 2:
        raise ValueError(
            "calculate_variance needs at least two values, got {}".format(count)
        )

    avg = calculate_mean(numbers)

    # Accumulate the squared deviations first and divide once at the end:
    # the divisor does not change between iterations.
    squared_deviations = 0.0
    for number in numbers:
        squared_deviations += pow(number - avg, 2)
    return squared_deviations / (count - 1)

In [100]:
# Sanity check: the sample variance (n - 1 divisor) of 1..5 should display 2.5.
calculate_variance([1, 2, 3, 4, 5])

2.5

In [101]:
def standard_deviation(numbers) -> float:
    """Return the square root of ``calculate_variance(numbers)``."""
    return math.sqrt(calculate_variance(numbers))

In [102]:
# Sanity check: sqrt(2.5) should display roughly 1.5811.
standard_deviation([1, 2, 3, 4, 5])

1.5811388300841898

In [103]:
def fit(X, y) -> dict:
    """Estimate per-class mean and standard deviation for every feature.

    Args:
        X: feature table that exposes ``.columns`` and accepts a boolean
            mask (a pandas DataFrame in this notebook).
        y: labels for the rows of ``X``; must provide ``unique()`` and
            support ``y == label`` (a pandas Series in this notebook).

    Returns:
        A dict mapping each distinct label to a list of ``(mean, stdev)``
        tuples, one per column, in ``X.columns`` order.
    """
    result = {}
    for label in y.unique():
        # Select the rows of this class once instead of re-filtering X
        # twice for every column.
        class_rows = X[y == label]
        result[label] = [
            (
                calculate_mean(class_rows[column]),
                standard_deviation(class_rows[column]),
            )
            for column in X.columns
        ]
    return result

In [104]:
def calculate_probability_density(x, mean, stdev) -> float:
    """Evaluate the normal (Gaussian) density with ``mean`` and ``stdev`` at ``x``.

    Computes ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.
    A ``stdev`` of zero raises ``ZeroDivisionError``.
    """
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    gaussian_term = math.exp(-(squared_deviation / twice_variance))

    normalisation = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normalisation * gaussian_term

In [105]:
def getMax(probabilities):
    """Return the key of ``probabilities`` that has the largest value.

    When several keys share the largest value, the one that comes first in
    the mapping's iteration order wins. An empty mapping raises
    ``ValueError``.
    """
    best_key, _ = max(probabilities.items(), key=lambda item: item[1])
    return best_key


In [106]:
def predict(model, y_train, X_test) -> list:
    """Predict a label for every row of ``X_test`` with Gaussian naive Bayes.

    For each row and each label, the per-feature densities from
    ``calculate_probability_density`` are multiplied together and then
    scaled by the label's relative frequency in ``y_train``. The label with
    the largest score, as chosen by ``getMax``, is the prediction.

    Args:
        model: mapping of label -> sequence of ``(mean, stdev)`` pairs, one
            per feature, in the same order as the columns of ``X_test``
            (the structure returned by ``fit``).
        y_train: training labels; must support ``len()`` and
            ``value_counts()`` (a pandas Series in this notebook).
        X_test: feature table exposing ``.values`` as rows of numbers.

    Returns:
        A list with one predicted label per row of ``X_test``.
    """
    rows = X_test.values
    n_train = len(y_train)
    label_counts = y_train.value_counts()

    # The class priors depend only on the training labels, so compute them
    # once instead of once per test row.
    priors = {label: label_counts[label] / n_train for label in model}

    result = []
    for row in rows:
        probabilities = {}
        for label in model:
            feature_params = model[label]
            likelihood = 1
            for index, value in enumerate(row):
                mean, stdev = feature_params[index]
                likelihood *= calculate_probability_density(float(value), mean, stdev)
            probabilities[label] = likelihood * priors[label]
        result.append(getMax(probabilities))
    return result

In [107]:
def accuracy_score(y_pred, y_test):
    """Return the fraction of positions at which ``y_pred`` equals ``y_test``.

    NOTE: this definition shadows ``sklearn.metrics.accuracy_score`` imported
    at the top of the notebook, and its argument order is
    ``(y_pred, y_test)``. The result is the same whichever of the two
    sequences is passed first.

    Values are paired by position through iteration rather than by
    ``y_test[index]``, so a pandas Series with a non-default index (for
    example a slice of a shuffled DataFrame) is handled correctly.

    Args:
        y_pred: sized iterable of predicted labels.
        y_test: sized iterable of reference labels, same length as ``y_pred``.

    Returns:
        The share of matching positions, between 0.0 and 1.0.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    total = len(y_pred)
    if total != len(y_test):
        raise ValueError(
            "y_pred and y_test must have the same length, got {} and {}".format(
                total, len(y_test)
            )
        )
    if total == 0:
        raise ValueError("accuracy is undefined for empty inputs")

    matches = sum(1 for predicted, actual in zip(y_pred, y_test) if predicted == actual)
    return matches / total
        

In [108]:
# Load the water-potability data from the working directory and display it.
df = pd.read_csv(filepath_or_buffer='water_potability.csv')
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [109]:
# Replace missing values in the three affected columns with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[col] is chained assignment on an intermediate Series, which warns on
# recent pandas and does not update df once Copy-on-Write is enabled.
# NOTE(review): the means are taken over the whole dataset, i.e. before the
# train/test split performed further down - confirm this is intended.
df["Trihalomethanes"] = df["Trihalomethanes"].fillna(df["Trihalomethanes"].mean())
df["Sulfate"] = df["Sulfate"].fillna(df["Sulfate"].mean())
df['ph'] = df['ph'].fillna(df['ph'].mean())

In [110]:
# Mean pH of the rows labelled Potability == 1, using the hand-written helper.
calculate_mean(df.loc[df['Potability'] == 1, 'ph'])

7.074754331355026

In [111]:
# Summary statistics of the rows labelled Potability == 1.
df.loc[df['Potability'] == 1].describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,1278.0,1278.0,1278.0,1278.0,1278.0,1278.0,1278.0,1278.0,1278.0,1278.0
mean,7.074754,195.800744,22383.991018,7.169338,332.844122,425.3838,14.160893,66.533513,3.968328,1.0
std,1.343955,35.547041,9101.010208,1.702988,41.868471,82.048446,3.263907,15.971968,0.780842,0.0
min,0.227499,47.432,728.75083,0.352,129.0,201.619737,2.2,8.175876,1.492207,1.0
25%,6.351824,174.330531,15668.985035,6.094134,313.052947,360.939023,12.033897,56.911186,3.430909,1.0
50%,7.080795,196.632907,21199.386614,7.215163,333.775777,420.712729,14.162809,66.396293,3.958576,1.0
75%,7.780068,218.00342,27973.236446,8.199261,354.807924,484.155911,16.356245,77.067457,4.509569,1.0
max,13.175402,323.124,56488.672413,13.127,481.030642,695.369528,23.604298,124.0,6.494249,1.0


In [114]:
# Repeated hold-out evaluation: ten random 2/3 - 1/3 splits, each scored with
# the hand-written naive Bayes and with scikit-learn's decision tree.
accuracy_bayes = []
accuracy_decision = []
for i in range(10):
    # The iteration number doubles as the seed, so every round uses a
    # different split but the whole experiment is reproducible.
    train = df.sample(frac=2/3, random_state=i)
    test = df.drop(train.index)

    X_train = train.drop(['Potability'], axis=1)
    y_train = train['Potability']

    X_test = test.drop(['Potability'], axis=1)
    y_test = test['Potability']

    model = fit(X_train, y_train)
    y_pred = predict(model, y_train, X_test)
    bayes_accuracy = accuracy_score(y_pred, y_test.values)
    accuracy_bayes.append(bayes_accuracy)
    print("Độ chính xác thuật bayes {}".format(bayes_accuracy))

    model_tree = DecisionTreeClassifier(criterion="entropy", random_state=i)
    model_tree.fit(X_train, y_train)
    y_pred_tree = model_tree.predict(X_test)
    tree_accuracy = accuracy_score(y_pred_tree, y_test.values)
    # Store the tree's score in its own list; it used to be appended to
    # accuracy_bayes, which left accuracy_decision empty.
    accuracy_decision.append(tree_accuracy)
    print("Độ chính xác thuật decision tree {}".format(tree_accuracy))

Độ chính xác thuật bayes 0.6135531135531136
Độ chính xác thuật decision tree 0.5805860805860806
Độ chính xác thuật bayes 0.6190476190476191
Độ chính xác thuật decision tree 0.5961538461538461
Độ chính xác thuật bayes 0.6263736263736264
Độ chính xác thuật decision tree 0.5851648351648352
Độ chính xác thuật bayes 0.6071428571428571
Độ chính xác thuật decision tree 0.5970695970695971
Độ chính xác thuật bayes 0.6437728937728938
Độ chính xác thuật decision tree 0.5824175824175825
Độ chính xác thuật bayes 0.6043956043956044
Độ chính xác thuật decision tree 0.5842490842490843
Độ chính xác thuật bayes 0.6254578754578755
Độ chính xác thuật decision tree 0.5842490842490843
Độ chính xác thuật bayes 0.61996336996337
Độ chính xác thuật decision tree 0.5741758241758241
Độ chính xác thuật bayes 0.6025641025641025
Độ chính xác thuật decision tree 0.5686813186813187
Độ chính xác thuật bayes 0.597985347985348
Độ chính xác thuật decision tree 0.6071428571428571
