In [1]:
import numpy as np
from math import log
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

In [2]:
#bagging use all the feature where random forest use some.

In [3]:
def gini(rows):
    classes=count_class_freq(rows)
    impurity = 1
    for c in classes:
        prob_of_c = classes[c] / float(len(rows))
        impurity -= prob_of_c**2
    return impurity
def entropy(rows):
    classes=count_class_freq(rows)
    impurity = 0
    for c in classes:
        prob_of_c = classes[c] / float(len(rows))
        impurity -= prob_of_c* log(prob_of_c, 2)
    return impurity

In [4]:
def info_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

In [5]:
def count_class_freq(rows):
    #last column is the class
    classes={} #dictionary
    for row in rows:
        c=row[-1]
        if c not in classes:
            classes[c]=1
        else:
            classes[c]+=1
    return classes

In [6]:
'''
Slit numerical data into left and right branch based on given threshold th
'''
def split_rows(dt, col, th):
    lb=[]
    rb=[]
    for row in dt:
    #     print(row)
        v=row[col]
    #     print(v)
        if v<=th:
            lb.append(row)
        else:
            rb.append(row)
    return lb,rb

In [7]:
'''
For multi column data. Use random max features. Not all of them. [Random forest.]
'''
def find_best_split(rows, max_feature=-1):
    imp=gini(rows)
    ncol=len(rows[0])-1
    nrow=len(rows)
    best_col=0
    best_val=0
    best_ig=0

    
    columns=[i for i in range(ncol)]
    if max_feature>0:
        np.random.shuffle(columns)
        columns=columns[:max_feature]
    
    for ic in columns:
        for ir in range ( nrow ):
            th=rows[ir][ic]
            lb,rb=split_rows(rows, ic, th)
            ig=info_gain(lb, rb, imp)
    #         print(ig)
            if ig > best_ig:
                best_ig=ig
                best_col=ic
                best_val=th
    return best_col, best_val, best_ig

In [51]:
ncol=10
columns=[i for i in range(ncol)]
print(columns)
np.random.shuffle(columns)
print(columns)
columns=columns[:6]
print(columns)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0, 5, 2, 4, 7, 8, 1, 9, 3, 6]
[0, 5, 2, 4, 7, 8]


In [8]:
# ncol=10
# max_feature=6
# columns=[i for i in range(ncol)]

# if max_feature>0:
#     np.random.shuffle(columns)
#     columns=columns[:max_feature]
# print(columns)
# for ic in columns:
#     print(ic)

In [9]:
'''
Return the class that appeared max time
'''
def decide_class(rows):
    mc=count_class_freq(rows)
    ss= sorted(mc.items(), key=lambda kv: kv[1])
    return ss[-1][0]

In [10]:
'''
Recursively build the decision tree. Stop when all leaf node found or max depth.
'''
def make_tree(data, max_depth=-1, ndepth=1, max_feature=-1):
    best_col, best_val, best_ig=find_best_split(data, max_feature)
    if best_ig==0 or ndepth==max_depth: #leaf node.
        return decide_class(data)
    
    lb,rb=split_rows(data, best_col, best_val)
    lt=make_tree(lb, max_depth, ndepth+1)
    rt=make_tree(rb, max_depth, ndepth+1)
    return {'col': best_col, 'val':best_val, 'left':lt, 'right':rt}

In [11]:
def predict(tree, row):
    if not isinstance(tree, dict):
        return tree
    col=tree['col']
    val=tree['val']
    if row[col]<=val:
        return predict(tree['left'], row)
    else:
        return predict(tree['right'], row)

### Random Forest

In [12]:
# Random Forest Algorithm on Sonar Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt
import numpy as np

In [13]:
# Load a CSV file
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset

In [14]:
# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

In [15]:
seed(2)
filename = 'sonar.all-data.csv'
dataset = load_csv(filename)
for i in range(0, len(dataset[0])-1):
	str_column_to_float(dataset, i)
str_column_to_int(dataset, len(dataset[0])-1)

{'R': 0, 'M': 1}

In [43]:
X=np.array(dataset)
print(X.shape)

(208, 61)


In [44]:
X[0]

array([0.02  , 0.0371, 0.0428, 0.0207, 0.0954, 0.0986, 0.1539, 0.1601,
       0.3109, 0.2111, 0.1609, 0.1582, 0.2238, 0.0645, 0.066 , 0.2273,
       0.31  , 0.2999, 0.5078, 0.4797, 0.5783, 0.5071, 0.4328, 0.555 ,
       0.6711, 0.6415, 0.7104, 0.808 , 0.6791, 0.3857, 0.1307, 0.2604,
       0.5121, 0.7547, 0.8537, 0.8507, 0.6692, 0.6097, 0.4943, 0.2744,
       0.051 , 0.2834, 0.2825, 0.4256, 0.2641, 0.1386, 0.1051, 0.1343,
       0.0383, 0.0324, 0.0232, 0.0027, 0.0065, 0.0159, 0.0072, 0.0167,
       0.018 , 0.0084, 0.009 , 0.0032, 0.    ])

In [45]:
np.random.shuffle(X)
tst=X[-10:]
xtrain=X[:-10]
print(xtrain.shape)
print(tst.shape)

(198, 61)
(10, 61)


In [19]:
# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
	sample = list()
	n_sample = round(len(dataset) * ratio)
	while len(sample) < n_sample:
		index = randrange(len(dataset))
		sample.append(dataset[index])
	return sample

In [24]:
sample = subsample(xtrain, 0.6)
print(len(sample))

119


In [25]:
tree=make_tree(sample, max_feature=30)
print(tree)

{'col': 9, 'val': 0.1504, 'left': {'col': 1, 'val': 0.0615, 'left': {'col': 30, 'val': 0.3349, 'left': {'col': 55, 'val': 0.0049, 'left': 0.0, 'right': 1.0}, 'right': 0.0}, 'right': 1.0}, 'right': {'col': 36, 'val': 0.4773, 'left': {'col': 51, 'val': 0.0045, 'left': {'col': 1, 'val': 0.0119, 'left': 1.0, 'right': 0.0}, 'right': 1.0}, 'right': {'col': 21, 'val': 0.4856, 'left': 1.0, 'right': {'col': 15, 'val': 0.0422, 'left': 1.0, 'right': 0.0}}}}


In [46]:
trees=[]
ntree=5
for i in range(ntree):
    sample = subsample(xtrain, 0.6)
    tree=make_tree(sample, max_feature=40)
    trees.append(tree)

In [27]:
print(len(trees))

5


In [28]:
# trees

In [29]:
tst[0][60]

1.0

In [52]:
row=tst[0]
ps=[ predict(tree, row) for tree in trees]
print(ps)
res = max(set(ps), key = ps.count) 
print(res)

[1.0, 1.0, 0.0, 1.0, 0.0]
1.0


In [31]:
def bag_prediction(trees, row):
    ps=[ predict(tree, row) for tree in trees]
    return max(set(ps), key = ps.count) 

In [32]:
bag_prediction(trees, X[22])

1.0

In [33]:
prd=[]
for td in tst:
    p=bag_prediction(trees, td)
    prd.append(p)

In [34]:
print(prd)

[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]


In [35]:
yac=tst[:,-1]
print(yac)

[1. 0. 0. 0. 1. 0. 1. 1. 0. 1.]


In [36]:
accuracy_score(yac, prd)

0.9

### Scikitlearn exp

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [38]:
X, y = make_classification(n_samples=1000, n_features=4,  n_informative=2, n_redundant=0,  random_state=0, shuffle=False)

In [39]:
X.shape

(1000, 4)

In [40]:
clf = RandomForestClassifier(n_estimators=6, max_depth=2, max_features=3)
model=clf.fit(X, y)
print(model)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=6, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [41]:
# model.predict(X)