<a href="https://colab.research.google.com/github/meghmodi2810/Machine-Learning-Projects/blob/main/NaiveBayesRawDiabetesDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [206]:
import csv
import random
import math

In [207]:
def load_csv(filename):
  """Load a CSV file with a header row into a list of float rows.

  The first row is treated as a header and discarded. Every field of every
  remaining row is converted with ``float()``. Blank lines are skipped so
  that they do not turn into empty rows.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A list of rows, each row being a list of floats. An empty file (or a
    file holding only a header) yields an empty list.

  Raises:
    OSError: If the file cannot be opened.
    ValueError: If a field cannot be parsed as a float.
  """
  # The context manager guarantees the handle is closed even when float()
  # raises; newline="" is the mode the csv module asks for.
  with open(filename, mode="r", newline="") as handle:
    reader = csv.reader(handle)
    # Drop the header; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)
    # csv.reader yields [] for blank lines, so filter those out.
    return [[float(field) for field in row] for row in reader if row]

In [208]:
def split_dataset(dataset, split_ratio):
  """Randomly partition ``dataset`` into a training part and a remainder.

  ``int(len(dataset) * split_ratio)`` rows are drawn without replacement
  using the ``random`` module; whatever is left keeps its original relative
  order. The input list itself is not modified.

  Args:
    dataset: Sequence of rows to split.
    split_ratio: Fraction of rows that go to the training part.

  Returns:
    A two-element list ``[train_rows, remaining_rows]``.
  """
  wanted = int(len(dataset) * split_ratio)
  remaining = list(dataset)  # work on a copy so the caller's list is untouched
  picked = []
  for _ in range(wanted):
    # Draw one random position from what is still available and move it over.
    position = random.randrange(len(remaining))
    picked.append(remaining.pop(position))
  return [picked, remaining]

In [209]:
def separate_class(dataset):
  """Group rows by their class label.

  The last element of each row is used as the label.

  Args:
    dataset: Sequence of rows; each row must have at least one element.

  Returns:
    A dict mapping each label to the list of rows carrying that label, in
    the order the rows appear in ``dataset``.
  """
  by_label = {}
  for row in dataset:
    # setdefault creates the bucket the first time a label is seen.
    by_label.setdefault(row[-1], []).append(row)
  return by_label

In [210]:
def mean(num):
  """Return the arithmetic mean of the values in ``num``.

  Args:
    num: Sized collection of numbers.

  Returns:
    The sum of the values divided by their count.

  Raises:
    ZeroDivisionError: If ``num`` is empty.
  """
  total = sum(num)
  count = float(len(num))
  return total / count

In [211]:
def standard_deviation(num):
  """Return the sample standard deviation of the values in ``num``.

  The squared deviations from the mean are summed and divided by
  ``len(num) - 1`` before taking the square root.

  Args:
    num: Sized collection of numbers.

  Returns:
    The square root of the sample variance.

  Raises:
    ZeroDivisionError: If ``num`` has fewer than two values.
  """
  center = mean(num)
  squared_gaps = [pow(value - center, 2) for value in num]
  degrees_of_freedom = float(len(num) - 1)
  return math.sqrt(sum(squared_gaps) / degrees_of_freedom)

In [212]:
def summarize(dataset):
  """Compute (mean, standard deviation) for every attribute column.

  The last column of each row is dropped before summarising; elsewhere in
  this file the last element of a row is used as the class label.

  Args:
    dataset: Sequence of equally long rows of numbers.

  Returns:
    A list with one ``(mean, standard_deviation)`` tuple per attribute
    column, in column order.
  """
  # zip(*dataset) transposes rows into columns. In Python 3 zip returns an
  # iterator, which cannot be sliced, so it is materialised into a list
  # first (the same form summarize_class uses).
  columns = list(zip(*dataset))[:-1]
  return [(mean(column), standard_deviation(column)) for column in columns]

In [213]:
def summarize_class(dataset):
  """Summarise every attribute column separately for each class label.

  Rows are grouped with ``separate_class``; within each group the last
  column is dropped and every remaining column is reduced to its mean and
  standard deviation.

  Args:
    dataset: Sequence of rows whose last element is the class label.

  Returns:
    A dict mapping each class label to a list of
    ``(mean, standard_deviation)`` tuples, one per attribute column.
  """
  def _column_stats(column):
    return (mean(column), standard_deviation(column))

  per_class = {}
  for label, rows in separate_class(dataset).items():
    # Transpose rows into columns; zip must be materialised before slicing.
    feature_columns = list(zip(*rows))[:-1]
    per_class[label] = [_column_stats(column) for column in feature_columns]
  return per_class

In [214]:
def calculate_probability(x, mean, stdev):
  """Evaluate the Gaussian probability density at ``x``.

  Args:
    x: Point at which the density is evaluated.
    mean: Mean of the distribution.
    stdev: Standard deviation of the distribution.

  Returns:
    The density value of the normal distribution with the given mean and
    standard deviation at ``x``.

  Raises:
    ZeroDivisionError: If ``stdev`` is zero.
  """
  squared_distance = math.pow(x - mean, 2)
  twice_variance = 2 * math.pow(stdev, 2)
  exponent = math.exp(-squared_distance / twice_variance)
  normaliser = math.sqrt(2 * math.pi) * stdev
  return (1 / normaliser) * exponent

In [215]:
def calculate_class_probability(summaries, input_vector):
  """Score ``input_vector`` against every class.

  For each class the per-attribute Gaussian densities are multiplied
  together. Only as many leading elements of ``input_vector`` as there are
  attribute summaries are read.

  Args:
    summaries: Dict mapping class label to a list of ``(mean, stdev)``
      tuples, one per attribute.
    input_vector: Sequence of attribute values, indexed in the same order
      as the summaries.

  Returns:
    A dict mapping each class label to the product of its attribute
    densities.
  """
  scores = {}
  for label, attribute_stats in summaries.items():
    likelihood = 1
    for position, (mu, sigma) in enumerate(attribute_stats):
      likelihood *= calculate_probability(input_vector[position], mu, sigma)
    scores[label] = likelihood
  return scores

In [216]:
def predict(summaries, input_vector):
  """Return the class label with the highest score for ``input_vector``.

  Scores come from ``calculate_class_probability``. On a tie the label
  encountered first wins.

  Args:
    summaries: Per-class attribute summaries.
    input_vector: Sequence of attribute values to classify.

  Returns:
    The winning class label, or ``None`` when ``summaries`` yields no
    classes.
  """
  scores = calculate_class_probability(summaries, input_vector)
  winner, top_score = None, -1
  for label, score in scores.items():
    # Skip candidates that do not strictly beat the current leader.
    if winner is not None and not score > top_score:
      continue
    winner, top_score = label, score
  return winner

In [217]:
def get_predictions(summaries, test_set):
  """Predict a class label for every row of ``test_set``.

  Args:
    summaries: Per-class attribute summaries passed through to ``predict``.
    test_set: Sequence of rows to classify.

  Returns:
    A list of predicted labels, in the same order as ``test_set``.
  """
  return [predict(summaries, row) for row in test_set]

In [218]:
def get_accuracy(test_set, predictions):
  """Return the percentage of rows whose label matches its prediction.

  The last element of each row in ``test_set`` is compared with the
  prediction at the same position.

  Args:
    test_set: Sequence of rows whose last element is the true label.
    predictions: Sequence of predicted labels, at least as long as
      ``test_set``.

  Returns:
    The share of matching rows as a float between 0.0 and 100.0.

  Raises:
    ZeroDivisionError: If ``test_set`` is empty.
  """
  hits = sum(
    1 for position, row in enumerate(test_set)
    if row[-1] == predictions[position]
  )
  fraction = hits / float(len(test_set))
  return fraction * 100.0

In [219]:
def main(filename="sample_data/diabetes.csv", split_ratio=0.67):
  """Train and evaluate the Naive Bayes classifier on a CSV dataset.

  Loads the dataset, splits it randomly into training and test parts,
  builds per-class attribute summaries from the training part, predicts
  the test part and prints the split sizes and the resulting accuracy.

  Args:
    filename: Path of the CSV file to load. Defaults to the path that was
      previously hard-coded.
    split_ratio: Fraction of rows used for training. Defaults to the
      previously hard-coded 0.67.
  """
  dataset = load_csv(filename)

  ## split the actual dataset into train and test model
  training_set, test_set = split_dataset(dataset, split_ratio)
  print(f"Splitted {len(dataset)} into train = {len(training_set)} and test = {len(test_set)}")

  ## prepare model for summarizations
  summaries = summarize_class(training_set)

  ## get the predictions and calculate the accuracy
  predictions = get_predictions(summaries, test_set)
  accuracy = get_accuracy(test_set, predictions)

  print(f"Accuracy : {accuracy : .2f}")

In [220]:
# Run the full pipeline: load, split, train, predict and print the accuracy.
main()

Splitted 768 into train = 514 and test = 254
Accuracy :  73.62
