In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset; the file is semicolon-delimited and is resolved
# relative to the notebook's working directory.
data = pd.read_csv('bank-additional-full.csv',sep=';')

In [3]:
# Preview the first five rows to sanity-check the parse.
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
# Rows whose target column `y` is 'yes'.
myDataY = data.loc[data['y'] == 'yes']

In [5]:
# Rows whose target column `y` is 'no'.
myDataN = data.loc[data['y'] == 'no']

In [137]:
def shuffle2(df, n=1, axis=0):
    """Shuffle the values of ``df`` in place and return the same frame.

    Each column (``axis=0``) or each row (``axis=1``) is permuted
    independently, so values stay inside their own column/row but lose their
    alignment with the other columns/rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle; it is modified in place.
    n : int, default 1
        Number of shuffle passes. ``n <= 0`` leaves ``df`` untouched.
    axis : {0, 'index', 1, 'columns'}, default 0
        0 shuffles within each column, 1 shuffles within each row.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If ``axis`` is not one of the accepted values.
    """
    if axis in (0, 'index'):
        within_columns = True
    elif axis in (1, 'columns'):
        within_columns = False
    else:
        raise ValueError("No axis named {}".format(axis))

    for _ in range(n):
        if within_columns:
            for col_pos in range(df.shape[1]):
                # Positional access keeps this correct with duplicate column names.
                df.iloc[:, col_pos] = np.random.permutation(df.iloc[:, col_pos].to_numpy())
        else:
            for row_pos in range(df.shape[0]):
                df.iloc[row_pos, :] = np.random.permutation(df.iloc[row_pos, :].to_numpy())

    # Return only after all passes; the original returned inside the loop, so
    # only one pass ever ran and n=0 returned None.
    return df

In [138]:
# Build the working sample: 4444 'no' rows drawn without replacement plus
# every 'yes' row, then shuffle the row order and renumber the index.
# NOTE(review): 4444 is presumably chosen to roughly balance the two classes - confirm.
# No random_state is passed, so the sample changes on every run.
myFinalData = pd.concat([myDataN.sample(n=4444, replace=False), myDataY])
myFinalData = myFinalData.sample(frac=1).reset_index(drop=True)

tmpHolder = myFinalData.dtypes

#converting to categorical type
# Iterate (column name, dtype) pairs instead of indexing the dtypes Series by
# integer position, which is deprecated for label-indexed Series.
for col_name, col_dtype in tmpHolder.items():
    if col_dtype == 'object':
        myFinalData[col_name] = myFinalData[col_name].astype('category')

In [139]:
# Hold out 25% of the sample for testing. No random_state is passed, so the
# split differs between runs.
trainSet,testSet=train_test_split(myFinalData,test_size=0.25)

## Decision Stump

In [140]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    probs : iterable of float
        Probabilities of each outcome.

    Returns
    -------
    float
        ``sum(-p * log2(p))`` over the strictly positive entries of ``probs``
        (0 for an empty iterable).
    """
    # Skip non-positive entries: 0 * log2(0) evaluates to nan in floating
    # point, while the entropy convention treats that term as 0.
    return sum(-prob * np.log2(prob) for prob in probs if prob > 0)

In [141]:
def entropy_list(list_item):
    """Return the entropy (in bits) of the empirical distribution of ``list_item``.

    The distinct values of ``list_item`` are counted, converted to relative
    frequencies and passed to ``entropy``.
    """
    from collections import Counter

    # Wrapping in iter() makes Counter tally the iterated elements.
    occurrences = Counter(iter(list_item))
    n_items = 1.0 * len(list_item)

    frequencies = []
    for count in occurrences.values():
        frequencies.append(count / n_items)
    return entropy(frequencies)

In [261]:
def information_gain_cat(df, split_attribute_name, target_attribute_name):
    """Information gain of splitting ``df`` on every distinct value of a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to evaluate; it is not modified.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_attribute_name : str
        Column holding the class labels.

    Returns
    -------
    float
        Entropy of the target over all rows minus the size-weighted entropy
        of the target inside each partition. ``0.0`` for an empty frame.
    """
    nrows = len(df.index) * 1.0
    if nrows == 0:
        # Nothing to split; the original crashed here while building its
        # intermediate summary frame.
        return 0.0

    # observed=True skips unused categorical levels; those groups are empty
    # and contribute nothing to the weighted sum, so the result is unchanged.
    new_entropy = 0
    for _, group in df.groupby(split_attribute_name, observed=True):
        group_entropy = entropy_list(group[target_attribute_name].to_numpy())
        new_entropy = new_entropy + group_entropy * (len(group) / nrows)

    old_entropy = entropy_list(df[target_attribute_name])

    return old_entropy - new_entropy

In [262]:
# Information gain of splitting the training set on `job` with respect to `y`.
information_gain_cat(trainSet,'job','y')

0.03589354511714926

In [292]:
def information_gain_num(df,split_attribute_name,target_attribute_name):
    """Find the best binary threshold split on a numeric column.

    Every distinct non-missing value ``v`` of the column is tried as a
    threshold, partitioning the rows into ``<= v`` and ``> v``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to evaluate; it is not modified.
    split_attribute_name : str
        Numeric column to threshold.
    target_attribute_name : str
        Column holding the class labels.

    Returns
    -------
    tuple
        ``(best_gain, best_split)``. On ties the smallest threshold wins
        because candidates are visited in ascending order and only a strictly
        larger gain replaces the current best. ``(-inf, -inf)`` when there is
        no candidate threshold (empty frame or all values missing).
    """
    # sort_values returns a new frame, so no defensive deep copy is needed.
    ordered = df.sort_values(split_attribute_name)
    values = ordered[split_attribute_name]
    labels = ordered[target_attribute_name]
    n_rows = len(ordered.index)

    parent_entr = entropy_list(labels)

    best_split = -np.inf
    best_gain = -np.inf

    # Missing values are not valid thresholds: both comparisons against NaN
    # are False, which would leave two empty children and a spuriously
    # maximal gain.
    for split_point in values.dropna().unique():
        left_labels = labels[values <= split_point]
        right_labels = labels[values > split_point]

        child_entr = entropy_list(left_labels) * len(left_labels) + entropy_list(right_labels) * len(right_labels)
        child_entr /= n_rows
        tmpGain = parent_entr - child_entr

        if tmpGain > best_gain:
            best_split = split_point
            best_gain = tmpGain

    return best_gain,best_split
    

In [293]:
# Best (gain, threshold) for `emp.var.rate`; note this is evaluated on the
# whole balanced sample (myFinalData), not on trainSet.
information_gain_num(myFinalData,'emp.var.rate','y')

(0.13651874166436118, -1.1)

In [294]:
class DecisionNode:
    """A single-split decision node over one attribute.

    ``kind`` selects which setters are usable: the threshold and the two
    side classes require ``kind == 'numeric'``; the level-to-class map
    requires ``kind == 'categorical'``. Using a setter with any other kind
    raises ``ValueError``.
    """

    def __init__(self, attr_name, kind):
        """Store the attribute name and the node kind; nothing else is initialised."""
        self.kind = kind
        self.attr_name = attr_name

    def setThreshold(self, threshold):
        """Set the split threshold (numeric nodes only)."""
        if self.kind != 'numeric':
            raise ValueError("Threshold can't be set for categorical variable")
        self.threshold = threshold

    def setLessThanEq(self, cls):
        """Set the class predicted for values <= threshold (numeric nodes only)."""
        if self.kind != 'numeric':
            raise ValueError("Invalid for categorical variable")
        self.less_than_eq_class = cls

    def setGreaterThan(self, cls):
        """Set the class predicted for values > threshold (numeric nodes only)."""
        if self.kind != 'numeric':
            raise ValueError("Invalid for categorical variable")
        self.greater_than_class = cls

    def setLevelClassMap(self, levelClassMap):
        """Set the mapping from attribute level to predicted class (categorical nodes only)."""
        if self.kind != 'categorical':
            raise ValueError("Invalid for numeric variable")
        self.levelClassMap = levelClassMap

In [295]:
def isCategorical(df,attr):
    """Return True when column ``attr`` of ``df`` has the pandas 'category' dtype."""
    column_dtype_name = df[attr].dtype.name
    return column_dtype_name == 'category'

In [399]:
from operator import itemgetter
def DecisionStump(mydf,all_attr,target_attr,default_class=None):
    """Fit a one-level decision tree on the attribute with the highest information gain.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Training data; it is not modified.
    all_attr : list of str
        Candidate attribute names. Columns with the 'category' dtype are
        scored with ``information_gain_cat``; all others with
        ``information_gain_num``.
    target_attr : str
        Column holding the class labels, which must be 'yes' / 'no'.
    default_class : optional
        Accepted for interface compatibility; not used.

    Returns
    -------
    DecisionNode
        A 'numeric' node (threshold plus a class for each side) or a
        'categorical' node (level -> class map). Each side/level predicts
        'yes' when its 'yes' count is >= its 'no' count, which includes
        empty sides/levels.

    Raises
    ------
    ValueError
        If ``all_attr`` is empty.

    Notes
    -----
    Prints the winning gain, the winning attribute and, for categorical
    stumps, the level -> class map.
    """
    # Nothing below mutates the frame, so no defensive copy is taken.
    df = mydf

    gains = []
    for attr in all_attr:
        if isCategorical(df, attr):
            ig = information_gain_cat(df=df, split_attribute_name=attr, target_attribute_name=target_attr)
            splt = -np.inf  # placeholder: categorical attributes have no threshold
        else:
            ig, splt = information_gain_num(df=df, split_attribute_name=attr, target_attribute_name=target_attr)
        gains.append((ig, splt))

    # max() returns the first index reaching the maximum, i.e. the earliest
    # attribute wins ties.
    mx_gain_idx = max(range(len(gains)), key=lambda i: gains[i][0])
    mx_gain = gains[mx_gain_idx][0]

    print(mx_gain)

    mx_gain_attr = all_attr[mx_gain_idx]

    print(mx_gain_attr)

    if not isCategorical(df, mx_gain_attr):
        # Numeric attribute: binary split at the best threshold.
        thresh = gains[mx_gain_idx][1]
        below = df.loc[df[mx_gain_attr] <= thresh, target_attr]
        above = df.loc[df[mx_gain_attr] > thresh, target_attr]

        dn = DecisionNode(mx_gain_attr, 'numeric')
        dn.setThreshold(thresh)
        dn.setLessThanEq(_stump_majority_label(below))
        dn.setGreaterThan(_stump_majority_label(above))
        return dn

    # Categorical attribute: one prediction per level. observed=False keeps
    # levels that have no rows in this frame, so the map still has an entry
    # for them.
    dn = DecisionNode(mx_gain_attr, 'categorical')
    dct = {}
    for name, group in df.groupby(mx_gain_attr, observed=False):
        dct[name] = _stump_majority_label(group[target_attr])

    print(dct)
    dn.setLevelClassMap(dct)
    return dn


def _stump_majority_label(labels):
    """Return 'yes' when 'yes' is at least as frequent as 'no' in ``labels``, else 'no'."""
    n_yes = (labels == 'yes').sum()
    n_no = (labels == 'no').sum()
    return 'yes' if n_yes >= n_no else 'no'

In [407]:
def Predict(df,h):
    """Predict a class for every row of ``df`` using a fitted stump ``h``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to classify; must contain the column ``h.attr_name``.
    h : DecisionNode
        Fitted stump. A 'categorical' stump needs ``levelClassMap``; any
        other kind is treated as numeric and needs ``threshold``,
        ``less_than_eq_class`` and ``greater_than_class``.

    Returns
    -------
    list
        One predicted label per row, in row order.

    Raises
    ------
    KeyError
        For a categorical stump when a value is missing from ``levelClassMap``.
    """
    dataPoints = df[h.attr_name]

    if h.kind == 'categorical':
        level_to_class = h.levelClassMap
        return [level_to_class[x] for x in dataPoints]

    thresh = h.threshold
    # The original read `h.greate_than_class` (typo), which raised
    # AttributeError for any value above the threshold.
    return [h.less_than_eq_class if x <= thresh else h.greater_than_class
            for x in dataPoints]

In [400]:
# Subset of training columns used as stump candidates, plus the target `y`.
# NOTE(review): `age` holds integers in the preview above, so despite the
# variable name not every column here is categorical.
Test_Categorical=trainSet[['age','job','marital','education','default','housing','y']]

In [401]:
# Candidate attribute names: every selected column except the target.
catcolumns=list(Test_Categorical.columns)
catcolumns.remove('y')

In [402]:
# Fit the stump on the training set; the call prints the best gain, the
# chosen attribute and (for a categorical winner) the level -> class map.
h=DecisionStump(trainSet,catcolumns,'y')

0.03589354511714926
job
{'admin.': 'yes', 'blue-collar': 'no', 'entrepreneur': 'no', 'housemaid': 'no', 'management': 'yes', 'retired': 'yes', 'self-employed': 'yes', 'services': 'no', 'student': 'yes', 'technician': 'no', 'unemployed': 'yes', 'unknown': 'yes'}


In [404]:
# Predicted labels for the held-out rows, as a plain list in row order.
y_pred=Predict(testSet,h)

In [405]:
# True labels of the held-out rows, converted to a positionally indexed
# array so they can be compared element by element with y_pred.
y_act=testSet['y']
# dtype=pd.Series is not a numpy dtype; NOTE(review): numpy presumably falls
# back to an object array here - confirm, or use dtype=object explicitly.
y_act=np.array(y_act,dtype=pd.Series)

In [406]:
# Count positions where the stump's prediction matches the true label,
# then report the raw count and the accuracy in percent.
cnt = sum(1 for actual, predicted in zip(y_act, y_pred) if actual == predicted)

print(cnt," ",len(y_pred))
print("accuracy ",100.0*cnt/len(y_pred))

1298   2271
accuracy  57.15543813298107


In [None]:

def ID3(df,target_attr,all_attr,depth_available,default_class=None):
    """Recursively build an ID3 decision tree as nested dicts.

    Every attribute is split on each of its distinct values (numeric columns
    are therefore treated like categorical ones).

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows for this node.
    target_attr : str
        Column holding the class labels.
    all_attr : list of str
        Attributes still available for splitting.
    depth_available : int
        Remaining depth budget; recursion stops when it reaches 0.
    default_class : optional
        Label returned when this node cannot be split (empty data, no
        attributes left or no depth left). Recursive calls receive the
        parent node's most frequent label.

    Returns
    -------
    dict or label
        ``{attribute: {value: subtree_or_label, ...}}`` for an internal node,
        or a bare label for a leaf.
    """
    from collections import Counter
    cnt = Counter(x for x in df[target_attr])

    # Pure node: every row carries the same label.
    if len(cnt) == 1:
        return next(iter(cnt))

    if df.empty or (not all_attr) or (not depth_available):
        return default_class

    # Most frequent label here (first one wins ties); used as the fallback
    # for the children.
    default_class = max(cnt, key=cnt.get)

    # Choose the attribute with the highest information gain. The original
    # called an undefined `information_gain`, which raised NameError.
    gains = [information_gain_cat(df, attr, target_attr) for attr in all_attr]
    best_attr = all_attr[gains.index(max(gains))]

    tree = {best_attr: {}}
    rest_attr = [at for at in all_attr if at != best_attr]

    # observed=False keeps categorical levels without rows; their empty
    # subsets resolve to default_class, so those levels still get a branch.
    for attr_value, subset in df.groupby(best_attr, observed=False):
        tree[best_attr][attr_value] = ID3(subset, target_attr, rest_attr,
                                          depth_available - 1, default_class)

    return tree

       
def classify(instance, tree, default=None):
    """Classify one instance with a nested-dict decision tree.

    Parameters
    ----------
    instance : mapping
        Supports ``instance[attribute]`` lookup (e.g. a dict or a DataFrame row).
    tree : dict or label
        ``{attribute: {value: subtree_or_label}}``; a non-dict is treated as
        a leaf label and returned as is.
    default : optional
        Returned when the instance's value has no branch, at any depth.

    Returns
    -------
    The label reached in the tree, or ``default``.
    """
    # A bare label is a degenerate tree (a root that is already a leaf).
    if not isinstance(tree, dict):
        return tree

    attribute = list(tree.keys())[0]
    branches = tree[attribute]
    value = instance[attribute]

    if value not in branches:
        return default

    result = branches[value]
    if isinstance(result, dict):
        # Propagate `default`; the original dropped it, so unseen values
        # below the root produced None instead of the caller's fallback.
        return classify(instance, result, default)
    return result

In [None]:
# Every training column except the target is a candidate split attribute.
all_attr=list(trainSet.columns)
all_attr.remove('y')

In [None]:
# Classify each held-out row with the ID3 tree, falling back to 'no'.
# NOTE(review): `tree` is not assigned in any visible cell - presumably it
# should come from ID3(trainSet, 'y', all_attr, <depth>); confirm and add it.
testSet['predicted'] =testSet.apply(classify, axis=1, args=(tree,'no') ) 

In [None]:
# Fraction of held-out rows whose prediction equals the true label.
n_correct = sum(testSet['y'] == testSet['predicted'])
accuracy = n_correct / (1.0 * len(testSet.index))
print('Accuracy is ' + str(accuracy))
# Show the rows that were predicted 'yes'.
testSet[testSet['predicted'] == 'yes']

In [None]:
# Input: a DataFrame whose 'y' column holds labels encoded as +1 / -1; all
# remaining columns are used as numeric features.
class AdaBoost:
    """Discrete AdaBoost over depth-1 decision trees.

    Attributes
    ----------
    hyp : list
        Fitted weak learners, in training order.
    hyp_wgt : list
        Vote weight (alpha) of each weak learner.
    num_iterations : int
        Number of boosting rounds run by each ``train`` call.
    """

    def __init__(self,num_iterations):
        self.hyp = []
        self.hyp_wgt = []
        self.num_iterations = num_iterations

    def train(self,df):
        """Run ``num_iterations`` boosting rounds on ``df``.

        Each round fits a stump on all rows using the current sample
        weights, so weights, predictions and labels stay aligned row by row.
        The original drew a weighted bootstrap sample but kept using weights
        indexed by the original rows, which paired each weight with an
        unrelated resampled row.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain a 'y' column with labels in {+1, -1}.

        Returns
        -------
        numpy.ndarray
            Normalised sample weights after the last round (float64).

        Notes
        -----
        Prints the weighted error of every round. Learners are appended to
        ``hyp`` / ``hyp_wgt``, so calling ``train`` twice accumulates them.
        """
        X = df.drop(columns=['y'])
        y = df['y'].to_numpy()

        sampN = len(y)
        samp_wgt = np.ones(sampN) / sampN
        tiny = np.finfo(float).eps

        for _ in range(self.num_iterations):
            h = DecisionTreeClassifier(max_depth=1)
            h.fit(X, y, sample_weight=samp_wgt)
            pred = h.predict(X)

            # Weighted training error of this round.
            eps = samp_wgt.dot(pred != y)
            print(eps)

            # Keep eps strictly inside (0, 1) so alpha stays finite when a
            # stump is perfect (or perfectly wrong).
            eps = min(max(eps, tiny), 1 - tiny)
            alpha = (np.log(1 - eps) - np.log(eps)) / 2

            # Misclassified rows (y * pred == -1) gain weight, the rest lose it.
            samp_wgt = samp_wgt * np.exp(-alpha * y * pred)
            samp_wgt = samp_wgt / samp_wgt.sum()

            self.hyp.append(h)
            self.hyp_wgt.append(alpha)
        return samp_wgt

    def test(self,X):
        """Predict labels for ``X`` by the alpha-weighted vote of all learners.

        Parameters
        ----------
        X : two-dimensional array-like with a ``shape`` attribute
            Feature matrix with the same columns used in ``train``.

        Returns
        -------
        numpy.ndarray
            Sign of the weighted vote per row: +1, -1, or 0 on an exact tie
            (or when no learner has been trained).
        """
        sampN, _ = X.shape
        score = np.zeros(sampN)

        for (h, alpha) in zip(self.hyp, self.hyp_wgt):
            score = score + alpha * h.predict(X)

        return np.sign(score)


In [None]:
# Feature matrix: every column of the encoded frame except the target.
# NOTE(review): `data2` is only assigned in a cell further down, so this
# cell fails on a fresh top-to-bottom run.
X=data2.drop(columns=['y'])

In [None]:
# Target column of the encoded frame.
y=data2['y']

In [None]:
# Booster configured for 20 rounds.
asd = AdaBoost(20)

In [None]:
# Train on the full encoded frame; train() returns the final sample weights.
sw=asd.train(data2)
# Cast to float64 so the weights can be passed on as numeric sample weights.
sw=sw.astype('float64')

In [None]:
# Predictions on the same rows the booster was trained on, so any score
# computed from them is a training-set score.
y_pred = asd.test(X)

In [None]:
# True labels for the rows predicted above.
y_act = data2['y']

In [None]:
# Sanity check: both sequences should have the same length.
len(y_act),len(y_pred)

In [None]:
# Percentage of rows where the boosted prediction equals the true label.
count = sum(1 for i in range(len(y_act)) if y_act[i] == y_pred[i])

print(100*count/len(y_act))

In [None]:
from sklearn.metrics import f1_score

In [None]:
# First value: F1 per class (average=None). Second value: class-weighted
# average F1, with each row additionally weighted by `sw`.
# NOTE(review): `sw` holds the sample weights returned by training; using
# them as metric weights is unusual - confirm this is intended.
f1_score(y_true=y_act,y_pred=y_pred,average=None),f1_score(y_act, y_pred, average='weighted',sample_weight=sw)

In [None]:
# Inspect column dtypes to see which columns still need encoding.
data.dtypes

In [None]:
# Snapshot of the raw column dtypes, used below to pick the columns to encode.
ss = data.dtypes
# Work on a copy: a plain `data2 = data` would alias the raw frame, so the
# encoding applied to data2 below would also overwrite `data`.
data2 = data.copy()

In [None]:
# Replace every object-typed column with the integer codes of its categories.
# Iterating (name, dtype) pairs avoids indexing the dtypes Series by integer
# position, which is deprecated for label-indexed Series.
for col_name, col_dtype in ss.items():
    if col_dtype == 'object':
        print(col_dtype, col_name)
        data2[col_name] = data2[col_name].astype('category').cat.codes
    

In [None]:
# Display the encoded frame.
data2

In [None]:
# Recode the target from category codes {0, 1} to {-1, +1}.
# The extra -1 -> -1 entry makes the cell safe to re-run: without it a second
# execution would map the existing -1 labels to NaN.
data2['y'] = data2['y'].map({0: -1, 1: 1, -1: -1})
data2