<a href="https://colab.research.google.com/github/moyeed/number_recognition_naive_bayes/blob/main/hand_writing_recogntion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This block downloads Files from absolute Path and create a local copy



In [10]:
import os
import json
from urllib.request import urlretrieve
from collections import defaultdict
import numpy as np
from math import log

files_to_download = {"training_images.txt":"https://raw.githubusercontent.com/moyeed/number_recognition_naive_bayes/main/trainingimages.txt",
                     "training_label.txt":"https://raw.githubusercontent.com/moyeed/number_recognition_naive_bayes/main/traininglabels.txt",
                     "testing_images.txt":"https://raw.githubusercontent.com/moyeed/number_recognition_naive_bayes/main/testimages.txt",
                     "testing_label.txt":"https://raw.githubusercontent.com/moyeed/number_recognition_naive_bayes/main/testlabels.txt"}

for fileName, url in files_to_download.items():
  if not os.path.exists(fileName):
    urlretrieve(url, fileName)




#The below block defines the helper functions that will allow us to train and predict the values from the given data

In [16]:
#This function extracts the images from the text file and convertes them to a matrix of 28*28 to perform actions.
def parse_files(images_file_path, label_file_path,test_file = False):
  labels_list = []
  # this dict stores the numbers (0 to 9) as keys and values will be list of all the matrices of each number. {0:[i1,i2,i3..],1:[i1,i2,i3..],2:[i1,i2,i3..],...} 
  label_img_collection = defaultdict(list)
  # this dict stores all the test images with respective to its index value to map it later with labels. =>{0:img1_on_file,1:img2_on_file......,4999:lastimg_on_file}
  index_img_collection = {}
  with open(label_file_path) as file:
      labels_list = [int(line.rstrip()) for line in file.readlines()]

  px_mat = []
  line_num = 1
  with open(images_file_path) as file:
    for line in file.readlines():
      each_line =  [ (1 if ((each_char == "+") or (each_char == "#")) else 0) for each_char in line if each_char != "\n"]
      if(len(each_line)!=28):
        raise Exception("len is greater check !!")
      px_mat.append(each_line)
      if (line_num %28 == 0):
        if(test_file):
          index_img_collection[(line_num//28)-1] = px_mat
        else:
          label_img_collection[labels_list[(line_num//28)-1]].append(np.array(px_mat))
        if(len(px_mat) !=28):
          print(line_num,len(px_mat),len(px_mat[0]))
          raise Exception("len is grater")
        px_mat = []
      line_num+=1

    if (test_file):
      return index_img_collection
    else:
      return label_img_collection

#This function will help improve the accuracy by making the 0 terms have ittle proability other than zero.
# we know, posterior prob of each class = (num +k)/(den+(k*no of features))
# k will be a smallest avlue that can make th zero terms non zero, the v or number of features are two beause we only have two values 0 or 1
def laplace_smoothing(num,den,k=0.1,v=2):
  return (num[0]+k)/(den+(k*v)),(num[1]+k)/(den+(k*v))

#This calculates the posterior and prior probabiities that can be used to create a model
def calculate_training_probs(label_img_collection):
  #stores the len of items occured in past for each number  and also the prior probabilitties of each number.
  #{0:{"len":len,"prior_prob":prior_prob},1:{"len":len,"prior_prob":prior_prob},...,9:{"len":len,"prior_prob":prior_prob}}
  prior_len_probs = defaultdict(list)
  #This variable stores the liklihood of each number as follwing dictionary
  #{0:v0,1:v1,....,9:v9}
  #where, vi = {(0,0):List(0,0),(0,1):List(0,0),.....,(28,28):List(0,0)}
  #where, List(0,0) represenst the probability of having 0 at that px position and 1 at that pixel position in 0 and 1 index of list respectively.
  likelihood_probs = defaultdict(lambda:defaultdict(list))
  total_elements = 0

  #calculates only the length of the each number occured in traing data
  for each_class, px_mat_arr in label_img_collection.items():
    prior_len_probs[each_class] = [len(px_mat_arr),0]
    total_elements += len(px_mat_arr)

  #calculates the prior probabilities and stores them in proper format.
  prior_len_probs = {cls:({"len":list_len_probs[0],"prior_prob":(list_len_probs[0]/total_elements)}) for cls, list_len_probs in prior_len_probs.items()}  

  #calculates the likelihood dictionary as described above at varibale declaration location.
  for each_class,px_mat_arr in label_img_collection.items():
    features_prob = None
    features_prob = defaultdict(lambda:[0,0])
    for each_mat in px_mat_arr:
      for rows in range(len(each_mat)):
        for columns in range(len(each_mat[0])):
          features_prob_list = features_prob[(rows,columns)]
          features_prob[(rows,columns)] = [features_prob_list[0]+1,features_prob_list[1]] if each_mat[rows][columns] == 0 else [features_prob_list[0],features_prob_list[1]+1]
    if(len(features_prob) !=784):
      raise Exception("error")
    likelihood_probs[each_class] = features_prob
  print(likelihood_probs)

  for each_class, features_counts in likelihood_probs.items():
    likelihood_probs[each_class] = {px_loc:list(laplace_smoothing(counts,prior_len_probs[each_class]["len"])) for px_loc,counts in features_counts.items()}

  return prior_len_probs, likelihood_probs

#This function uses all the image data and condtructs the naive bayees model.
def train_naive_bayes_classifier(training_images_file_path = "/content/training_images.txt",training_labels_file_path = "/content/training_label.txt",label_img_data = None):
  #get the training data from files
  if(label_img_data == None):
    label_img_data = parse_files(training_images_file_path,training_labels_file_path)
  #get the probablities
  prior_probs,likelihood = calculate_training_probs(label_img_data)
  return prior_probs,likelihood

#This function predicts the given test data images.
def predict(prior_probs,likelihood,test_images_file_path = "/content/testing_images.txt", test_labels_file_path = "/content/testing_label.txt"):
  index_img_data = parse_files(test_images_file_path, test_labels_file_path, test_file = True)
  predction_dict = defaultdict(list)
  for index, img_mat in index_img_data.items():
    prob_dict = {num: prior_probs[num]["prior_prob"] for num in range(0,10)}
    max_value = None
    max_prob = float("-inf")
    for row in range(len(img_mat)):
      for column in range(len(img_mat[0])):
        for possible_num in range(0,10):
          prob_dict[possible_num] = prob_dict[possible_num]+ (log(likelihood[possible_num][(row,column)][0]) if img_mat[row][column] == 0 else log(likelihood[possible_num][(row,column)][1]))

    for num,prob in prob_dict.items():
        if(prob > max_prob):
          max_prob = prob
          max_value = num
    predction_dict[index] = max_value
  return predction_dict

def compute_confusion_matrix(actual_data, prediction):
    # Number of classes 
    len_mat = len(np.unique(actual_data)) 
    #result matrix
    result = np.zeros((len_mat, len_mat),dtype=int) 

    for index in range(len(actual_data)):
      result[actual_data[index]][prediction[index]] += 1

    return result

#This block calls the helper functions to create a naie bayees model

In [17]:
model = train_naive_bayes_classifier()

defaultdict(<function calculate_training_probs.<locals>.<lambda> at 0x7f6425fce560>, {5: defaultdict(<function calculate_training_probs.<locals>.<lambda> at 0x7f6425fcecb0>, {(0, 0): [434, 0], (0, 1): [434, 0], (0, 2): [434, 0], (0, 3): [434, 0], (0, 4): [434, 0], (0, 5): [434, 0], (0, 6): [434, 0], (0, 7): [434, 0], (0, 8): [434, 0], (0, 9): [434, 0], (0, 10): [434, 0], (0, 11): [434, 0], (0, 12): [434, 0], (0, 13): [434, 0], (0, 14): [434, 0], (0, 15): [434, 0], (0, 16): [434, 0], (0, 17): [434, 0], (0, 18): [434, 0], (0, 19): [434, 0], (0, 20): [434, 0], (0, 21): [434, 0], (0, 22): [434, 0], (0, 23): [434, 0], (0, 24): [434, 0], (0, 25): [434, 0], (0, 26): [434, 0], (0, 27): [434, 0], (1, 0): [434, 0], (1, 1): [434, 0], (1, 2): [434, 0], (1, 3): [434, 0], (1, 4): [434, 0], (1, 5): [434, 0], (1, 6): [434, 0], (1, 7): [434, 0], (1, 8): [434, 0], (1, 9): [434, 0], (1, 10): [434, 0], (1, 11): [434, 0], (1, 12): [434, 0], (1, 13): [434, 0], (1, 14): [434, 0], (1, 15): [434, 0], (1, 16): 

#The preditions are done on test data and stored in a dictionary for futher evaluation

In [18]:
prediction = predict(model[0],model[1])

#The confusion Matrix is created for evaluating the model

In [88]:
test_lookup_table = []
nums_list = [-1,0,1,2,3,4,5,6,7,8,9]
sum_conf = defaultdict(lambda:0)
with open("/content/testing_label.txt") as training_labels:
  for line in training_labels.readlines():
    test_lookup_table.append(int(line.rstrip()))

conf_mat_in_count = compute_confusion_matrix(test_lookup_table,list(prediction.values()))
print("The confusion matrix in count is shown below\n")
for nums in nums_list:
  print(f"{nums:3d}",end = " ")


print("\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n")
for ind1,each_col in enumerate(conf_mat_in_count):
  for ind2, each_item in enumerate(each_col):
    if(ind2 == 0):
      print(f"{ind1:3d}", end=" ")
      print("|", end =" ")
    sum_conf[ind1] += each_item
    print(f"{each_item:3d}", end=" ")
  print()

print("\nThe Confusion matrix in percentage is shown below.\n")

for nums in nums_list:
  print(f"{nums:5d}",end = " ")
print("\n   _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n")
for ind1,each_col in enumerate(conf_mat_in_count):
  for ind2, each_item in enumerate(each_col):
    if(ind2 == 0):
      print(f"{ind1:5d}", end=" ")
      print("|", end =" ")
    temp = (each_item/sum_conf[ind1])*100
    print(f"{temp:5.2f}", end=" ")
  print()




The confusion matrix in count is shown below

 -1   0   1   2   3   4   5   6   7   8   9 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

  0 |  76   0   1   0   1   5   3   0   4   0 
  1 |   0 104   1   0   0   2   1   0   0   0 
  2 |   1   3  81   4   2   0   6   1   5   0 
  3 |   0   1   0  80   0   3   2   7   1   6 
  4 |   0   0   1   0  80   1   4   1   2  18 
  5 |   2   1   1  12   3  63   1   1   2   6 
  6 |   1   4   4   0   4   6  70   0   2   0 
  7 |   0   5   4   0   3   0   0  77   3  14 
  8 |   1   1   3  14   3   9   0   1  61  10 
  9 |   1   1   0   3   9   2   0   2   1  81 

The Confusion matrix in percentage is shown below.

   -1     0     1     2     3     4     5     6     7     8     9 
   _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    0 | 84.44  0.00  1.11  0.00  1.11  5.56  3.33  0.00  4.44  0.00 
    1 |  0.00 96.30  0.93  0.00  0.00  1.85  0.93  0.00  0.00  0.00 
    2 |  0.97  2.91 78.64  3.88  1.94  0.00  5.83  0.97  4.85 

# The model is evaluated to find the accuracy.

In [89]:
correct = 0
for index,pred in prediction.items():
  if(test_lookup_table[index] == pred):
    correct  +=1

print("The accuracy of the model is",(correct/len(test_lookup_table))*100,"%")


The accuracy of the model is 77.3 %


#Below cell will print the classified test images with serial number and actual number

In [90]:
print('  {0}      {1}   {2}'.format("S.No" ,"Predicted", "Actual"))
for index,pred in prediction.items():
  print('{0:4d}          {1:1d}          {2:1d}'.format(index+1, pred,test_lookup_table[index]))
  print()

  S.No      Predicted   Actual
   1          7          9

   2          0          0

   3          2          2

   4          3          5

   5          1          1

   6          9          9

   7          7          7

   8          8          8

   9          1          1

  10          0          0

  11          4          4

  12          1          1

  13          9          7

  14          9          9

  15          4          6

  16          9          4

  17          2          2

  18          2          6

  19          8          8

  20          1          1

  21          3          3

  22          7          7

  23          9          5

  24          4          4

  25          8          4

  26          1          1

  27          8          8

  28          1          1

  29          3          3

  30          8          8

  31          1          1

  32          2          2

  33          8          5

  34          8          8

  35          0  