In [0]:
#Upload a File as CSV and Store
from google.colab import files
import io

inputfile= files.upload()
list_of_doc = io.StringIO(inputfile['owls_1.csv'].decode('utf-8')).readlines()

Saving owls_1.csv to owls_1.csv


In [0]:
#Make the Data Readable
finalList = []
DataFile = []
for i in list_of_doc:
    finalList.append(i.split('\r')[0])
    
for j in finalList:
  DataFile.append(j.split(","))

header = ["body-length", "wing-length", "body-width", "wing-length", "type"]

In [0]:
#Randomize the Data and Split into Train and Test Data Sets with ratio of 66.67% Train to 33.33% Test
import random

sep = DataFile
random.seed(10)
random.shuffle(sep)
cut_point = int(len(sep) * 0.6667)
train = sep[:cut_point]
test = sep[cut_point:]

In [0]:
#Finding out the Classes of the Output Required
def uniq(r, c):
   return set([r[c] for r in r])
  
uniq(DataFile, 4)

{'BarnOwl', 'LongEaredOwl', 'SnowyOwl'}

In [0]:
#Creating a Count Opration on DataSet to determine the length of each entry and Checking for the Numeric Values
def Ccounts(rows):
    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts
  
def numeric_check(v):
  v = float(v)
  return isinstance(v, int) or isinstance(v, float)
  
print(Ccounts(DataFile))
Ccounts(train)
numeric_check(train[0][3])

{'LongEaredOwl': 45, 'BarnOwl': 45, 'SnowyOwl': 45}


True

In [0]:
#Creating a Question Algorithm method in which the Question is being Asked to the System and Then the Segregation happens
class Algorithm:
  
  def __init__(self, c, v):
    self.c = c
    self.v = v

  def matchValues(self, e):
    val = float(e[self.c])
    if numeric_check(val):
        return val <= self.v
    else:
        return val == self.v

  def __repr__(self):
    condition = "=="
    if numeric_check(self.v):
        condition = "<="
    return "Is %s %s %s?" % (
        header[self.c], condition, str(self.v))

In [0]:
#Based on Question Asked, Partition happens to True and False Values which will help in Training the DataSet to get all the Possible Answers
def partition(rows, question):
  true_values, false_values = [], []
  for row in rows:
    if question.matchValues(row):
        true_values.append(row)
    else:
        false_values.append(row)
  return true_values, false_values

In [0]:
#Calculate the Impurities in the Dataset and carrying out the Probability of Answer being Incorrect
def calculate_impurity(rows):
    counts = Ccounts(rows)
    impurity = 1
    for label in counts:
        prob_of_label = counts[label] / float(len(rows))
        impurity -= prob_of_label**2
    return impurity

gin_train = calculate_impurity(train[4])
gin_train

0.7199999999999999

In [0]:
# This is one of the most core part of the Algorithm: 
# Calculating the Information gain (Higher Value for the Lower Impurities) 
# and Based on Gain deciding the Best Question for the Algorithm to be segregated Properly
def information_gain(l, r, uncertain):
  p = float(len(l)) / (len(l) + len(r))
  return uncertain - p * calculate_impurity(l) - (1 - p) * calculate_impurity(r)

def find_best_split(rows):
    best_gain = 0 
    best_question = None 
    current_uncertainty = calculate_impurity(rows)
    n_features = len(rows[0]) - 1 

    for col in range(n_features):
        values = set([row[col] for row in rows])
        for val in values:

            question = Algorithm(col, float(val))
            true_rows, false_rows = partition(rows, question)
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = information_gain(true_rows, false_rows, current_uncertainty)
            if gain > best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [0]:
best_gain, best_question = find_best_split(train)
best_question

Is body-width <= 1.9?

In [0]:
# Creating a Leaf and Decision Node to generate the Prediction for each row and based on the Question for True and False Branch
class Leaf:
   def __init__(self, rows):
        self.predictions = Ccounts(rows)
        
class Decision_Node:
   def __init__(self, question,true_branch,false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [0]:
# This is another core of the Algorithm where we are creating a Tree for the Question and Segregation of the Predicted Data Set
def build_tree(rows):
   
    gain, question = find_best_split(rows)
    if gain == 0:
        return Leaf(rows)
    true_rows, false_rows = partition(rows, question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return Decision_Node(question, true_branch, false_branch)
  
my_tree = build_tree(train)

In [0]:
# The Classify algorithm takes the Each row of Data Set(Mostly Test data) and does the Prediction.
# Print_leaf prints each Entry with Predicted and Actual Values
def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions

    if node.question.matchValues(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)
      
def print_leaf(counts):
  total = sum(counts.values()) * 1.0
  for k, v in counts.items():
    key = k
  return key

print_leaf(classify(train[0], my_tree))

'LongEaredOwl'

In [0]:
#The Method to calculate the Accuracy which is Truly Predicted values upon total values in data Set
def Find_Accuracy(train1, test1) :
  column1 = []
  column2 = []
  count_true = 0
  count_false = 0
  my_tree = build_tree(train1)
  for row in test1 :
    column1.append(row[-1])
    column2.append(print_leaf(classify(row, my_tree)))
 
  for index in range(len(test1)):
    print("Actual: %s.           Predicted: %s" % (column1[index], column2[index]))
    if(column1[index] == column2[index]):
      count_true = count_true + 1
    else : count_false = count_false + 1

  Accuracy = (count_true/(count_true + count_false))*100
  return Accuracy

Find_Accuracy(train,test)

In [0]:
#The Process to be repeated 10 Times to genearte Different Samples and calculating the Accuracy
sep1 = DataFile
random.seed(100)
random.shuffle(sep1)
cut_point = int(len(sep1) * 0.6667)
train_result = sep1[:cut_point]
test_result = sep1[cut_point:]

Accuracy1 = Find_Accuracy(train_result, test_result)
print(Accuracy1)