In [35]:
import math
from pathlib import Path
import numpy as np
import pandas as pd
import scipy as sc
from scipy.stats import multivariate_normal

In [36]:
def read_csv_file_data(path_to_file, filename) -> pd.DataFrame:
    """Function to read file """
    data_directory = Path(path_to_file)
    file_to_read = data_directory / filename
    data_frame_temp = pd.read_csv(file_to_read)
    return data_frame_temp

In [37]:
def normal_pdf(x, mean, sd):
    """Calculate the normal distribution"""
    var = float(sd) ** 2
    denominator = (2 * math.pi * var) ** .5
    num = math.exp(-(float(x) - float(mean)) ** 2 / (2 * var))
    return (num / denominator)

In [38]:
def calculate_standard_deviation(data_frame_temp: pd.DataFrame, column_count) -> []:
    """Calculate the standard deviation of database"""
    standard_deviation_array = [0 for i in range(column_count)]
    col_counter = 0
    for (columnName, columnData) in data_frame_temp.iteritems():
        standard_deviation_array[col_counter] = data_frame_temp[columnName].std()
        col_counter += 1
    return standard_deviation_array

In [39]:
def calculate_mean(data_frame_temp: pd.DataFrame, column_count) -> []:
    """Calculate the mean of database"""
    mean_array_temp = [0 for i in range(column_count)]
    col_counter = 0
    for (columnName, columnData) in data_frame_temp.iteritems():
        mean_array_temp[col_counter] = data_frame_temp[columnName].mean()
        col_counter += 1
    return mean_array_temp

In [40]:
def get_rows_with_category_index_value(data_frame_temp: pd.DataFrame, category_index, category_value):
    """Differentiate the database on the basis of o and 1"""
    condition_bool = data_frame_temp.iloc[:, category_index] == category_value
    result_rows = data_frame_temp[condition_bool]
    return result_rows

In [41]:
def get_training_data(data_frame_temp: pd.DataFrame, percentage_of_data: float):
    """This method is used to create traing data from given data frame"""
    result_train_data = data_frame_temp.sample(frac=percentage_of_data)
    return result_train_data

In [42]:
def calculate_prediction_array(data_frame_temp: pd.DataFrame, mean_zero: [], std_zero: [], mean_one: [], std_one: []):
    """This method is used to calculate the normal distribution"""
    row_count_temp = data_frame_temp.shape[0]
    col_count_temp = data_frame_temp.shape[1]-1 # 8 # can be replaced by std_ or mean array length
    result_predict = []

    for row_index in range(row_count_temp):
        out_come_zero = 1
        out_come_one = 1
        for col_index in range(col_count_temp):
            x = data_frame_temp.iloc[row_index, col_index]
        # noramal distrubution for zero
            out_come_zero = out_come_zero * normal_pdf(x, mean_zero[col_index], std_zero[col_index])
        # noramal distrubution for one
            out_come_one = out_come_one * normal_pdf(x, mean_one[col_index], std_one[col_index])
        if out_come_zero > out_come_one:
            result_predict.append(0)
        else:
            result_predict.append(1)

    return result_predict

In [43]:
# To verify the result

def calculate_prediction_array02(data_frame_temp: pd.DataFrame, mean_zero: [], std_zero: [], mean_one: [], std_one: []):
    row_count_temp = data_frame_temp.shape[0]
    col_count_temp = data_frame_temp.shape[1] - 1  # 8 # can be replaced by std_ or mean array length
    result_predict = []

    for row_index in range(row_count_temp):
        out_come_zero = 1
        out_come_one = 1
        for col_index in range(col_count_temp):
            x = data_frame_temp.iloc[row_index, col_index]
            # noramal distrubution for zero
            out_come_zero = out_come_zero * sc.stats.norm(mean_zero[col_index], std_zero[col_index]).pdf(x)
            # noramal distrubution for one
            out_come_one = out_come_one * sc.stats.norm(mean_one[col_index], std_one[col_index]).pdf(x)
        if out_come_zero > out_come_one:
            result_predict.append(0)
        else:
            result_predict.append(1)

    return result_predict

In [44]:
def generate_accuracy(actual, predicted):
    '''Generate Accuracy of the database'''
    data = {'y_Actual': actual,
            'y_Predicted': predicted
            }
    df = pd.DataFrame(data, columns=['y_Actual', 'y_Predicted'])
    confusion_matrix01 = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    # confusion_matrix02 = ConfusionMatrix(df['y_Actual'], df['y_Predicted'])
    # print(confusion_matrix01)
    # CM = confusion_matrix(y_true, y_pred)

    TN = confusion_matrix01[0][0]
    FN = confusion_matrix01[1][0]
    TP = confusion_matrix01[1][1]
    FP = confusion_matrix01[0][1]
    print("TN : ", TN)
    print("FN : ", FN)
    print("TP : ", TP)
    print("FP : ", FP)
    
    specificity=TN/(TN+FP)
    
    print("Confusion Matrix Accuracy : ", (TP + TN) / (TP + FP + TN + FN)*100)
    print("Error : ", (FN + FP) / (TP + FP + TN + FN))
    
    print("Sensitivity : ",(TP / ( FN + TP ))* 100)
    
    print("Specificity : ",(TN / ( TN + FP ))* 100)
    
    # confusion_matrix02.print_stats()
    # sn.heatmap(confusion_matrix, annot=True)
    #     # plt.show()
    accurate_matches = 0
    for index in range(len(actual)):
        if actual[index] == predicted[index]:
            accurate_matches += 1
    return (accurate_matches/float(len(actual))) * 100.0

In [45]:
#Starting Point!
# Read the files
training_data_frame = read_csv_file_data("D:/Subjects/Spring 20/Pattern/Project", "train.csv")
testing_data_frame = read_csv_file_data("D:/Subjects/Spring 20/Pattern/Project", "test.csv")

row_count = training_data_frame.shape[0]
col_count = training_data_frame.shape[1]
out_come_column_index = 8

In [46]:
#train_data_frame = get_training_data(training_data_frame, .30)

non_diabetic_data = get_rows_with_category_index_value(training_data_frame, out_come_column_index, 0)
diabetic_data = get_rows_with_category_index_value(training_data_frame, out_come_column_index, 1)

In [47]:
# Calculate the Standard Deviation and Mean for Non_Diabetic patients

std_dev_non_diabetic = calculate_standard_deviation(non_diabetic_data.iloc[:, :-1], col_count-1)
mean_array_non_diabetic = calculate_mean(non_diabetic_data.iloc[:, :-1], col_count-1)

In [48]:
# Calculate the Standard Deviation and Mean for Diabetic patients

std_dev_diabetic = calculate_standard_deviation(diabetic_data.iloc[:, :-1], col_count-1)
mean_array_diabetic = calculate_mean(diabetic_data.iloc[:, :-1], col_count-1)

In [31]:
# Calculating the Prediction Array

prediction_Array = calculate_prediction_array(testing_data_frame, mean_array_non_diabetic, std_dev_non_diabetic, mean_array_diabetic, std_dev_diabetic)

# Calculating the Accuracy Array

accuracy = generate_accuracy(actual = testing_data_frame.iloc[:,8].tolist(), predicted = prediction_Array)
print("Accuracy : ", accuracy)

TN :  119
FN :  40
TP :  68
FP :  26
Confusion Matrix Accuracy :  73.91304347826086
Error :  0.2608695652173913
Sensitivity :  62.96296296296296
Specificity :  82.06896551724138
Accuracy :  73.91304347826086
