# Big Data Final Code

### Library Imports

In [29]:
# %pip install --quiet mrjob==0.7.4
# %pip install --quiet treelib

In [68]:
import os
import subprocess
import sys
import tempfile

import numpy as np
import pandas as pd
from treelib import Tree

### Creating an MRJob

In [33]:
%%file q1.py
import numpy as np
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

# Attribute names, paired positionally by the mapper with the comma-separated
# fields of each input line; the final entry names the target column.
att_names = [
    'lymphatics',
    'block_of_affere',
    'bl_of_lymph_c',
    'bl_of_lymph_s',
    'by_pass',
    'extravasates',
    'regeneration_of',
    'early_uptake_in',
    'lym_nodes_dimin',
    'lym_nodes_enlar',
    'changes_in_lym',
    'defect_in_node',
    'changes_in_node',
    'changes_in_stru',
    'special_forms',
    'dislocation_of',
    'exclusion_of_no',
    'no_of_nodes_in',
    'class',
]


class MRMostUsedWord(MRJob):
    """Two-step MapReduce job that emits the gain ratio of every attribute.

    Step 1 counts how often each (attribute, attribute value, class label)
    combination occurs in the input lines.  Step 2 regroups those counts by
    attribute and reduces each group to a single gain-ratio value.
    """

    def steps(self):
        """Return the two chained steps: counting, then gain-ratio computation."""
        return [
            MRStep(mapper=self.mapper_get_words, reducer=self.reducer_count_words),
            MRStep(mapper=self.mapper_entopy, reducer=self.reducer_gain),
        ]

    def mapper_get_words(self, _, line):
        """Emit ((attribute, value, class label), 1) for every attribute field.

        The last comma-separated field is used as the class label.  Lines whose
        ninth field is not made of digits are skipped; with the column layout
        in ``att_names`` that filters out a header row.
        """
        records = line.split(',')
        # Blank or truncated lines have no ninth field; skip them instead of
        # letting records[8] raise IndexError and abort the whole job.
        if len(records) <= 8 or not records[8].isdigit():
            return
        label = records[-1]
        # zip pairs names and fields positionally and stops at the shorter one.
        for name, value in zip(att_names, records[:-1]):
            yield (name, value, label), 1

    def reducer_count_words(self, key, val):
        """Sum the occurrences of one (attribute, value, class label) key."""
        yield key, sum(val)

    def mapper_entopy(self, key, val):
        """Re-key a count by attribute name so one reducer sees a whole attribute."""
        yield key[0], [key[1], key[2], val]

    @staticmethod
    def _entropy(counts, total):
        """Return the base-2 entropy of ``counts`` relative to ``total``.

        Terms are accumulated one by one, in iteration order, and zero
        fractions contribute nothing.
        """
        result = 0
        for count in counts:
            fraction = count / total
            if fraction != 0:
                result += -fraction * np.log2(fraction)
        return result

    def reducer_gain(self, key, val):
        """Yield (attribute, gain ratio) from its [value, class label, count] triples.

        gain ratio = (entropy of the class labels - entropy weighted over the
        attribute's values) / split information, or 0 when the split
        information is 0 (the attribute takes a single value).
        """
        total = 0
        class_counts = {}   # class label -> number of records
        value_counts = {}   # attribute value -> number of records
        joint_counts = {}   # attribute value -> {class label -> number of records}

        # One pass builds every table; the per-value class distribution no
        # longer needs a rescan of all triples for each attribute value.
        for att_value, label, count in val:
            class_counts[label] = class_counts.get(label, 0) + count
            value_counts[att_value] = value_counts.get(att_value, 0) + count
            per_value = joint_counts.setdefault(att_value, {})
            per_value[label] = per_value.get(label, 0) + count
            total += count

        weighted_entropy = 0
        for att_value, value_total in value_counts.items():
            value_entropy = self._entropy(joint_counts[att_value].values(), value_total)
            weighted_entropy += (value_total / total) * value_entropy

        split_info = self._entropy(value_counts.values(), total)
        total_entropy = self._entropy(class_counts.values(), total)

        gain_ratio = (total_entropy - weighted_entropy) / split_info if split_info != 0 else 0

        yield key, gain_ratio

# Run the job when q1.py is executed as a script, e.g. `python q1.py lymph.csv`.
if __name__ == '__main__':
    MRMostUsedWord.run()

Overwriting q1.py


#### Trial of the MR Job

In [41]:
# Trial run of the job on the full dataset; prints one tab-separated "attribute, gain ratio" line per attribute.
!python q1.py "lymph.csv"

"bl_of_lymph_c"	0.05085493080437394
"bl_of_lymph_s"	0.14468608714865394
"block_of_affere"	0.17516765733735065
"by_pass"	0.09160735105383414
"changes_in_lym"	0.12189329255927196
"changes_in_node"	0.24603013044727012
"changes_in_stru"	0.07070000255653218
"defect_in_node"	0.08745251518655868
"dislocation_of"	0.06913674928649483
"early_uptake_in"	0.15253928934299005
"exclusion_of_no"	0.088573034933014
"extravasates"	0.029123799565949588
"lym_nodes_dimin"	0.5647091545931281
"lym_nodes_enlar"	0.12052586081609619
"lymphatics"	0.09727814243153217
"no_of_nodes_in"	0.1328311730039287
"regeneration_of"	0.38020964533911633
"special_forms"	0.1254667547340289


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\Ngs11\AppData\Local\Temp\q1.Ngs11.20221217.083159.832226
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\Ngs11\AppData\Local\Temp\q1.Ngs11.20221217.083159.832226\output
Streaming final output from C:\Users\Ngs11\AppData\Local\Temp\q1.Ngs11.20221217.083159.832226\output...
Removing temp directory C:\Users\Ngs11\AppData\Local\Temp\q1.Ngs11.20221217.083159.832226...


In [89]:
# Run the MRJob with the kernel's own interpreter (the environment %pip
# installs into) and pick the attribute with the highest gain ratio.
cmd = subprocess.run([sys.executable, "q1.py", "lymph.csv"], capture_output=True)
if cmd.returncode != 0:
    # Surface the job's own error rather than failing later on empty output.
    raise RuntimeError("q1.py failed:\n" + cmd.stderr.decode(errors="replace"))

stdout = cmd.stdout.decode()
best_val = 0
best_feature = None
# Each output line looks like: "attribute"<TAB>gain_ratio
for item in stdout.replace('"', '').splitlines():
    if not item.strip():
        continue  # tolerate blank lines in the job output
    att, val = item.split('\t')
    if float(val) > best_val:
        best_feature = att
        best_val = float(val)
print(best_feature, best_val)

lym_nodes_dimin 0.5647091545931281


### Building a Decision Tree

* Base-2 logarithm that returns 0 for an input of 0, so that `0 * log(0)` terms contribute nothing to an entropy sum

In [35]:
def log(val):
    """Return the base-2 logarithm of ``val``, with an input of 0 mapped to 0."""
    return 0 if val == 0 else np.log2(val)

* Entropy Calculation

In [36]:
def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Values accepted by ``np.unique`` (e.g. a list or a pandas Series).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value; 0.0 for an empty input.
    """
    _, counts = np.unique(target_col, return_counts=True)
    total = counts.sum()  # computed once instead of once per element
    if total == 0:
        return 0.0
    # np.unique only reports values that occur, so every probability is > 0
    # and log2 needs no zero guard here.
    probabilities = counts / total
    return -np.sum(probabilities * np.log2(probabilities))

* Gain Ratio Calculation

In [37]:
def GainRatio(data,split_attribute_name,target_attribute_name="Loan_Status"):
    """Return the gain ratio obtained by splitting ``data`` on one attribute.

    ``data`` is indexed by column name and filtered with a boolean mask.
    The result is (entropy of the target column - entropy weighted over the
    split attribute's values) / split information, or 0 when the split
    information is 0.  Note that the default target name is "Loan_Status";
    pass the target column explicitly for datasets that name it differently.
    """
    dataset_entropy = entropy(data[target_attribute_name])

    # Distinct values of the split attribute and how often each occurs.
    split_values, split_counts = np.unique(data[split_attribute_name], return_counts=True)
    n_rows = np.sum(split_counts)

    weighted_terms = []
    split_terms = []
    for value, count in zip(split_values, split_counts):
        fraction = count / n_rows
        subset_target = data[data[split_attribute_name] == value][target_attribute_name]
        weighted_terms.append(fraction * entropy(subset_target))
        split_terms.append(-fraction * log(fraction))

    weighted_entropy = np.sum(weighted_terms)
    split_info = np.sum(split_terms)

    information_gain = dataset_entropy - weighted_entropy

    if split_info != 0:
        return information_gain / split_info
    return 0

* `add_node`: helper that adds a node to the tree and returns the new node's identifier

In [38]:
def add_node(tree, best_feature, pid, path):
    """Add a node to ``tree`` and return the identifier of the new node.

    Parameters
    ----------
    tree : object exposing ``create_node(tag, identifier, parent)``
    best_feature : label shown on the node (an attribute name or a class label)
    pid : identifier of the parent node, or None to create the root
    path : value of the parent's attribute that leads to this node

    Returns
    -------
    The new node's identifier: ``best_feature`` itself for the root, otherwise
    the string ``"<pid>/<path>/<best_feature>"``.
    """
    if pid is None:
        tree.create_node(tag = best_feature, identifier = best_feature)
        return best_feature

    # str() lets non-string labels (e.g. numeric class values) be concatenated.
    label = str(best_feature)
    node_id = pid + "/" + str(path) + "/" + label
    tree.create_node(str(path) + ":" + label, node_id, pid)
    return node_id

### C4.5 Algorithm

In [96]:
def _best_feature_by_gain_ratio(data, features, lvl=0):
    """Run q1.py on ``data`` and return the candidate feature with the highest gain ratio.

    Returns None when no feature in ``features`` has a gain ratio above 0.
    Raises RuntimeError when the q1.py subprocess exits with a non-zero status.
    """
    # Write the subset into a private temporary directory so that the caller's
    # input file (e.g. lymph0.csv) is never overwritten or deleted.
    with tempfile.TemporaryDirectory() as tmp_dir:
        fname = os.path.join(tmp_dir, 'lymph' + str(lvl) + '.csv')
        data.to_csv(fname, index=False)
        # sys.executable is the kernel's interpreter, where mrjob is installed.
        cmd = subprocess.run([sys.executable, "q1.py", fname], capture_output=True)

    if cmd.returncode != 0:
        raise RuntimeError("q1.py failed at level " + str(lvl) + ":\n" + cmd.stderr.decode(errors="replace"))

    candidates = set(features)
    best_val = 0
    best_feature = None
    # Each output line looks like: "attribute"<TAB>gain_ratio
    for item in cmd.stdout.decode().replace('"', '').splitlines():
        if not item.strip():
            continue
        att, val = item.split('\t')
        gain_ratio = float(val)
        # The job scores every column of the CSV; only unused features qualify.
        if att in candidates and gain_ratio > best_val:
            best_feature = att
            best_val = gain_ratio
    return best_feature


def C45(tree, data, features, target_attribute_name="class", parent_node_class = None, pid = None, path = None, lvl = 0):
    """Recursively grow a decision tree in ``tree``, splitting on the best gain ratio.

    Parameters
    ----------
    tree : tree object that ``add_node`` adds nodes to
    data : DataFrame holding the rows that reach this node
    features : names of the attributes still available for splitting
    target_attribute_name : name of the class column
    parent_node_class : majority class of the parent node, used as the leaf
        label when no features are left
    pid : identifier of the parent node (None for the root)
    path : value of the parent's attribute that leads to this node
    lvl : recursion depth, used to name the scratch CSV handed to q1.py

    A leaf is created when all rows share one class, when no features remain,
    or when no remaining feature has a positive gain ratio.
    """
    class_values, class_counts = np.unique(data[target_attribute_name], return_counts=True)

    # Stopping criterion 1: every row has the same class -> leaf with that class.
    if len(class_values) <= 1:
        add_node(tree, class_values[0], pid, path)
        return

    # Stopping criterion 2: nothing left to split on -> leaf with the parent's majority class.
    if len(features) == 0:
        add_node(tree, parent_node_class, pid, path)
        return

    # Majority class of this node; the default label for its descendants.
    parent_node_class = class_values[np.argmax(class_counts)]

    best_feature = _best_feature_by_gain_ratio(data, features, lvl)

    # Stopping criterion 3: no split improves on the current node -> leaf with
    # this node's majority class (previously this crashed on a None feature).
    if best_feature is None:
        add_node(tree, parent_node_class, pid, path)
        return

    pid = add_node(tree, best_feature, pid, path)

    # The chosen feature cannot be used again further down this branch.
    features = [i for i in features if i != best_feature]

    # Grow one branch per value of the chosen feature.
    for val in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == val]
        C45(tree, sub_data, features, target_attribute_name, parent_node_class, pid, val, lvl+1)

## Generating Decision Tree

In [None]:
# Load the dataset used to build the tree.
data = pd.read_csv("lymph0.csv")

['lymphatics', 'block_of_affere', 'bl_of_lymph_c', 'bl_of_lymph_s', 'by_pass', 'extravasates', 'regeneration_of', 'early_uptake_in', 'lym_nodes_dimin', 'lym_nodes_enlar', 'changes_in_lym', 'defect_in_node', 'changes_in_node', 'changes_in_stru', 'special_forms', 'dislocation_of', 'exclusion_of_no', 'no_of_nodes_in', 'class']


In [98]:
# Build the tree using every column except the last as a feature and 'class' as the target, then print it.
tree = Tree()
C45(tree, data, data.columns[:-1], 'class') 
tree.show()

lym_nodes_dimin
├── 1:changes_in_node
│   ├── lac_central:exclusion_of_no
│   │   ├── no:bl_of_lymph_c
│   │   │   ├── no:malign_lymph
│   │   │   └── yes:metastases
│   │   └── yes:lym_nodes_enlar
│   │       ├── 2:changes_in_stru
│   │       │   ├── diluted:metastases
│   │       │   └── faint:malign_lymph
│   │       ├── 3:malign_lymph
│   │       └── 4:malign_lymph
│   ├── lac_margin:block_of_affere
│   │   ├── no:extravasates
│   │   │   ├── no:lymphatics
│   │   │   │   ├── arched:early_uptake_in
│   │   │   │   │   ├── no:changes_in_lym
│   │   │   │   │   │   ├── oval:metastases
│   │   │   │   │   │   └── round:changes_in_stru
│   │   │   │   │   │       ├── drop_like:malign_lymph
│   │   │   │   │   │       └── faint:metastases
│   │   │   │   │   └── yes:no_of_nodes_in
│   │   │   │   │       ├── 2:malign_lymph
│   │   │   │   │       ├── 3:malign_lymph
│   │   │   │   │       └── 4:metastases
│   │   │   │   ├── deformed:metastases
│   │   │   │   └── displaced:malign_lymph