In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
columns = ['ide', 'lang', 'bev', 'os', ]
df = pd.read_csv('train.csv')
df = df.iloc[:, :4]
df.head()

Unnamed: 0,ide,lang,bev,os
0,vim,c,tea,win
1,jupyter,python,tea,mac
2,jupyter,python,tea,mac
3,jupyter,python,coffee,mac
4,jupyter,python,coffee,win


In [3]:
t = df.keys()[0]
attribute_names = list(df.keys())
attribute_names.remove(t) 

In [4]:
#Function to calculate the entropy of collection S
import math
def entropy(probs):  
    return sum( [-prob*math.log(prob, 7) for prob in probs])

#Function to calulate the entropy of the given Data Sets/List with 
def entropy_of_list(ls,value):  
    from collections import Counter
    cnt = Counter(x for x in ls)
    print('Target attribute class count(Yes/No)=',dict(cnt))
    total_instances = len(ls)  
    print("Total no of instances/records associated with {0} is: {1}".format(value,total_instances ))
    probs = [x / total_instances for x in cnt.values()] 
    print("Probability of Class {0} is: {1:.4f}".format(min(cnt),min(probs)))
    print("Probability of Class {0} is: {1:.4f}".format(max(cnt),max(probs)))
    return entropy(probs) 

In [5]:
def information_gain(df, split_attribute, target_attribute,battr):
    print("\n\n-----Information Gain Calculation of ",split_attribute, " --------") 
    df_split = df.groupby(split_attribute) 
    glist=[]
    for gname,group in df_split:
        print('Grouped Attribute Values \n',group)
        glist.append(gname) 
    
    glist.reverse()
    nobs = len(df.index) * 1.0   
    df_agg1=df_split.agg({target_attribute:lambda x:entropy_of_list(x, glist.pop())})
    df_agg2=df_split.agg({target_attribute :lambda x:len(x)/nobs})
    
    df_agg1.columns=['Entropy']
    df_agg2.columns=['Proportion']
    
    # Calculate Information Gain:
    new_entropy = sum( df_agg1['Entropy'] * df_agg2['Proportion'])
    if battr !='S':
        old_entropy = entropy_of_list(df[target_attribute],'S-'+df.iloc[0][df.columns.get_loc(battr)])
    else:
        old_entropy = entropy_of_list(df[target_attribute],battr)
    return old_entropy - new_entropy

In [6]:
def id3(df, target_attribute, attribute_names, default_class=None,default_attr='S'):
    
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute])# class of YES /NO
    
    if len(cnt) == 1:
        return next(iter(cnt))
    
    elif df.empty or (not attribute_names):
        return default_class
    else:
        default_class = max(cnt.keys())
        # Compute the Information Gain of the attributes:
        gainz=[]
        for attr in attribute_names:
            ig= information_gain(df, attr, target_attribute,default_attr)
            gainz.append(ig)
            print('Information gain of ',attr,' is : ',ig)
        
        index_of_max = gainz.index(max(gainz))               
        best_attr = attribute_names[index_of_max]  
        print("\nAttribute with the maximum gain is: ", best_attr)
        tree = {best_attr:{}}
        remaining_attribute_names =[i for i in attribute_names if i != best_attr]
        

        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,target_attribute, remaining_attribute_names,default_class,best_attr)
            tree[best_attr][attr_val] = subtree
        return tree

In [7]:
tree = id3(df, t, attribute_names)



-----Information Gain Calculation of  lang  --------
Grouped Attribute Values 
         ide lang  bev   os
0       vim    c  tea  win
13  pycharm    c  tea  win
Grouped Attribute Values 
        ide lang     bev   os
11     vss  c++     tea  win
12  vscode  c++  coffee  win
Grouped Attribute Values 
         ide  lang  bev   os
10  pycharm  java  tea  win
Grouped Attribute Values 
   ide    lang  bev   os
5  AS  kotlin  tea  win
Grouped Attribute Values 
       ide    lang     bev   os
8  matlab  matlab  coffee  win
Grouped Attribute Values 
        ide    lang     bev    os
1  jupyter  python     tea   mac
2  jupyter  python     tea   mac
3  jupyter  python  coffee   mac
4  jupyter  python  coffee   win
6  jupyter  python     tea   win
7  pycharm  python  coffee   win
9  pycharm  python     tea  unix
Target attribute class count(Yes/No)= {'vim': 1, 'pycharm': 1}
Total no of instances/records associated with c is: 2
Probability of Class pycharm is: 0.5000
Probability of Class vim is:

Target attribute class count(Yes/No)= {'jupyter': 5, 'pycharm': 2}
Total no of instances/records associated with S-python is: 7
Probability of Class jupyter is: 0.2857
Probability of Class pycharm is: 0.7143
Information gain of  os  is :  0.16726250301495435

Attribute with the maximum gain is:  os


-----Information Gain Calculation of  bev  --------
Grouped Attribute Values 
        ide    lang     bev   os
4  jupyter  python  coffee  win
7  pycharm  python  coffee  win
Grouped Attribute Values 
        ide    lang  bev   os
6  jupyter  python  tea  win
Target attribute class count(Yes/No)= {'jupyter': 1, 'pycharm': 1}
Total no of instances/records associated with coffee is: 2
Probability of Class jupyter is: 0.5000
Probability of Class pycharm is: 0.5000
Target attribute class count(Yes/No)= {'jupyter': 1}
Total no of instances/records associated with tea is: 1
Probability of Class jupyter is: 1.0000
Probability of Class jupyter is: 1.0000
Target attribute class count(Yes/No)= {'jup

In [8]:
def classify(instance, tree,default=None):   
    attribute = next(iter(tree))    
    if instance[attribute] in tree[attribute].keys():  keys  
        result = tree[attribute][instance[attribute]]
        if isinstance(result, dict): 
            return classify(instance, result)
        else:
            return result
    else:
        return default

In [9]:
df_train = pd.read_csv('test.csv')
df_train['ide'] = df_train.apply(classify, axis=1, args=(tree,'?')) 
df_train

Unnamed: 0,ide,lang,bev,os,Unnamed: 4
0,pycharm,python,coffee,win,
1,jupyter,python,coffee,mac,
2,?,c#,coffee,unix,
3,pycharm,python,tea,unix,
4,vss,c++,tea,unix,
5,AS,kotlin,coffee,win,


In [10]:
df_ = pd.read_csv('to_predict.csv')
df_

Unnamed: 0,ide,lang,bev,os,Unnamed: 4
0,jupyter,python,coffee,win,
1,jupyter,python,coffee,mac,
2,vscode,c#,coffee,unix,
3,vim,python,tea,unix,
4,vss,c++,tea,unix,
5,jupyter,kotlin,coffee,win,
