In [1087]:
import pandas as pd
import math
import random


In [1088]:
# Load the diabetes dataset from a CSV file in the current working directory.
df = pd.read_csv('pima-indians-diabetes.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [1089]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# every number derived from it, is identical after Restart Kernel -> Run All.
# Seeding with random.randint(...) gave a different split on every run.
SPLIT_SEED = 42
train = df.sample(frac=0.8, random_state=SPLIT_SEED)
# Everything that was not sampled into the training set becomes the test set.
test = df.drop(train.index)

# Column '8' is used as the target; all remaining columns are the features.
y_test = test['8']
X_test = test.drop(['8'], axis=1)


In [1090]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    ``numbers`` must work with both ``sum()`` and ``len()``. An empty input
    ends in a division by zero.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [1091]:
def standard_deviation(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    Each squared deviation from the mean is divided by ``n - 1`` before being
    accumulated, and the square root of the accumulated total is returned.
    """
    center = mean(numbers)
    degrees_of_freedom = float(len(numbers)) - 1
    accumulated = 0.0
    for value in numbers:
        # Divide term by term, so the running total is already the variance.
        accumulated += (value - center) ** 2 / degrees_of_freedom
    return math.sqrt(accumulated)

In [1092]:
def prob(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    ``mean`` and ``stdev`` are the parameters of the normal distribution.
    A ``stdev`` of 0 leads to a division by zero.
    """
    squared_distance = math.pow(x - mean, 2)
    doubled_variance = 2 * math.pow(stdev, 2)
    gaussian_kernel = math.exp(-(squared_distance / doubled_variance))

    normaliser = math.sqrt(2 * math.pi) * stdev
    return (1 / normaliser) * gaussian_kernel

In [1093]:
def fit(X, y):
    """Estimate per-label, per-feature Gaussian parameters.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns, one row per sample.
    y : pandas.Series
        Label of each row of ``X``; it is compared against each unique label
        to build the row mask for that label.

    Returns
    -------
    dict
        Maps every unique label in ``y`` to a list of ``(mean, stdev)``
        tuples, one per column of ``X`` in column order.
    """
    summaries = {}
    for label in y.unique():
        # Select this label's rows once, instead of re-running the boolean
        # mask twice for every column.
        class_rows = X[y == label]
        summaries[label] = [
            (mean(class_rows[column]), standard_deviation(class_rows[column]))
            for column in X.columns
        ]

    return summaries

In [1094]:
def getMax(probabilities):
    """Return the key of ``probabilities`` whose value is largest.

    When several keys share the largest value, the one that comes first in
    the mapping's iteration order is returned.
    """
    def score_of(label):
        return probabilities[label]

    return max(probabilities.keys(), key=score_of)


In [1095]:
def predict(model, X_predict):
    """Predict a label for every row of ``X_predict``.

    For each row, the Gaussian log-density of every feature value is summed
    per label and the label with the highest total is chosen. Summing logs
    ranks the labels the same way as multiplying the raw densities, but it
    does not underflow to 0.0 for every label when there are many features or
    a value lies far from a class mean.

    No class prior is applied: ``model`` only carries per-feature statistics,
    so all labels are treated as equally likely beforehand.

    Parameters
    ----------
    model : dict
        Maps label -> sequence of ``(mean, stdev)`` pairs, indexed by feature
        position.
    X_predict : object exposing ``.values``
        Its ``.values`` must iterate as rows of numeric feature values in the
        same order as the pairs in ``model``.

    Returns
    -------
    list
        One predicted label per row. On a tie, the label that comes first in
        ``model`` wins.

    Raises
    ------
    ZeroDivisionError
        If a feature's stdev is 0.
    ValueError
        If ``model`` is empty while there is at least one row.
    """
    # Constant part of log(1 / (sqrt(2*pi) * stdev)), hoisted out of the loops.
    half_log_two_pi = 0.5 * math.log(2 * math.pi)
    rows = X_predict.values
    result = []

    for row in rows:
        log_likelihoods = {}
        for label, feature_stats in model.items():
            total = 0.0
            for index, value in enumerate(row):
                mu, sigma = feature_stats[index]
                mu, sigma = float(mu), float(sigma)
                # Standardise first, so a zero stdev fails here with
                # ZeroDivisionError.
                z = (float(value) - mu) / sigma
                total += -0.5 * z * z - math.log(sigma) - half_log_two_pi
            log_likelihoods[label] = total

        result.append(max(log_likelihoods, key=log_likelihoods.get))
    return result

In [1096]:
def accuracy_score(y_pred, y_test):
    """Return the fraction of predictions that equal the true labels.

    Predictions and labels are paired by position, so ``y_test`` may be a
    list, an array, or a pandas Series with any index. Indexing a Series with
    ``y_test[i]`` would look values up by row label instead of by position.

    Parameters
    ----------
    y_pred : sequence
        Predicted labels.
    y_test : sequence
        True labels, in the same order and of the same length as ``y_pred``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(y_pred)``.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    ZeroDivisionError
        If ``y_pred`` is empty.
    """
    if len(y_pred) != len(y_test):
        raise ValueError(
            "y_pred and y_test must have the same length, got {} and {}".format(
                len(y_pred), len(y_test)
            )
        )

    count = 0
    for predicted, actual in zip(y_pred, y_test):
        if predicted == actual:
            count += 1
    return count / len(y_pred)
        

In [1097]:
# Split the training frame by position: every column except the last is used
# as a feature, and the last column is used as the label.
X_train = train.iloc[:,:-1]
y_train = train.iloc[:,-1]


# Estimate the per-label, per-feature (mean, stdev) pairs from the training data.
model = fit(X_train, y_train)
print(X_test)
# Predict a label for every held-out row.
y_pred = predict(model, X_test)
print(y_pred)
# Pass the raw array (.values) so labels are compared by position rather than
# by the row labels the Series inherited from df.
accuracy = accuracy_score(y_pred, y_test.values)
# The message is Vietnamese for "Accuracy".
print("Độ chính xác {}".format(accuracy))

      0    1   2   3    4     5      6   7
9     8  125  96   0    0   0.0  0.232  54
19    1  115  70  30   96  34.6  0.529  32
24   11  143  94  33  146  36.6  0.254  51
37    9  102  76  37    0  32.9  0.665  46
39    4  111  72  47  207  37.1  1.390  56
..   ..  ...  ..  ..  ...   ...    ...  ..
754   8  154  78  32    0  32.4  0.443  45
757   0  123  72   0    0  36.3  0.258  52
759   6  190  92   0    0  35.5  0.278  66
761   9  170  74  31    0  44.0  0.403  43
765   5  121  72  23  112  26.2  0.245  30

[154 rows x 8 columns]
[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 