In [None]:
import pandas as pd
import numpy as np
import random
import math

In [None]:
# Registry of the category -> integer code mapping used for each encoded column, keyed by column name.
global_encoder_by_field = {}

def get_encoding(column):
    """Label-encode a column and remember the mapping that was used.

    The column is converted to pandas' ``category`` dtype; the mapping from each
    category to its position in ``cat.categories`` is stored in
    ``global_encoder_by_field`` under the column's name.

    Returns the integer category codes of the column.
    """
    as_category = column.astype('category')
    global_encoder_by_field[as_category.name] = {
        category: code
        for code, category in enumerate(as_category.cat.categories)
    }
    return as_category.cat.codes

In [None]:
# Load the raw data, keep the first 6500 rows and discard the columns that are not used.
filename = 'dating-full.csv'
dating = pd.read_csv(filename).head(6500)
cols_to_delete = ['race','race_o','field']
dating = dating.drop(columns=cols_to_delete)

# Replace the gender values by integer codes (the mapping is recorded by get_encoding).
dating[['gender']] = dating[['gender']].apply(get_encoding)

partner_cols = ['pref_o_attractive','pref_o_sincere','pref_o_intelligence','pref_o_funny','pref_o_ambitious','pref_o_shared_interests']
participant_cols = ['attractive_important', 'sincere_important', 'intelligence_important', 'funny_important', 'ambition_important', 'shared_interests_important']

# Per-row totals of the six partner scores and of the six participant scores.
total_partner = 0
total_participant = 0
for partner_col, participant_col in zip(partner_cols, participant_cols):
    total_partner = total_partner + dating[partner_col]
    total_participant = total_participant + dating[participant_col]

# Normalise every score by its row total so each group of six sums to 1 per row.
for partner_col, participant_col in zip(partner_cols, participant_cols):
    dating[partner_col] = dating[partner_col] / total_partner
    dating[participant_col] = dating[participant_col] / total_participant

# Column means of the normalised scores (only the last one of each group stays bound).
for participant_col in participant_cols:
    participant_mean = dating[participant_col].sum() / len(dating)
#     print ('Mean of ', participant_col, ':', round(participant_mean, 2))
for partner_col in partner_cols:
    partner_mean = dating[partner_col].sum() / len(dating)
#     print ('Mean of ', partner_col, ':', round(partner_mean, 2))

In [None]:
# Columns that are left untouched by the binning step.
non_binned_cols = ['gender', 'race', 'race_o', 'samerace', 'field', 'decision']
# Every other column is discretised into two bins labelled 0 and 1.
for column in [name for name in dating if name not in non_binned_cols]:
    dating[column] = pd.cut(dating[column], bins=2, labels=[0, 1])

In [None]:
def split_dataset(t_frac, random_state, dataset, test_path="testSet.csv", train_path="trainingSet.csv"):
    '''
    Randomly split ``dataset`` into a test set and a training set and write both to CSV.

    Parameters
    ----------
    t_frac : fraction of the rows sampled (without replacement) into the test set.
    random_state : seed passed to ``DataFrame.sample`` so the split is reproducible.
    dataset : DataFrame to split.
    test_path : destination of the test set CSV (default ``"testSet.csv"``).
    train_path : destination of the training set CSV (default ``"trainingSet.csv"``).

    Returns nothing; the two files are the only output. Both are written
    without the index column.
    '''
    testset = dataset.sample(frac=t_frac, random_state=random_state)
    # The training set is everything that was not sampled. ``drop`` removes rows
    # by index label, so this relies on ``dataset`` having unique index labels.
    trainset = dataset.drop(testset.index)
    testset.to_csv(test_path, index=False)
    trainset.to_csv(train_path, index=False)

In [None]:
# Sample 20% of the rows (seed 47) as the test set; the split is written to testSet.csv / trainingSet.csv.
split_dataset(0.2, 47, dating)

In [None]:
class Node:
    """Node of a binary decision tree.

    ``attr`` is the attribute this node splits on and ``predicted_label`` is the
    label stored at the node. ``left`` and ``right`` hold the child nodes and
    start out as ``None``.
    """

    def __init__(self, attr, predicted_label):
        # Children are attached later by the code that builds the tree.
        self.left = self.right = None
        self.attr = attr
        self.predicted_label = predicted_label

    def PrintTree(self):
        """Print ``attr`` and ``predicted_label`` of every node in-order (left subtree, node, right subtree)."""
        left_child, right_child = self.left, self.right
        if left_child:
            left_child.PrintTree()
        print (self.attr, " ", self.predicted_label)
        if right_child:
            right_child.PrintTree()

In [None]:
# Number of class labels; find_gini iterates over the labels 0 .. LABELS_LEN - 1.
LABELS_LEN = 2
# Depth at which create_decision_tree stops splitting and returns a leaf.
MAX_DEPTH = 8

In [None]:
def find_gini(count_array, total_count, n_labels=None):
    """Gini impurity ``1 - sum(p_label ** 2)`` over the labels ``0 .. n_labels - 1``.

    Parameters
    ----------
    count_array : mapping from label to count (a ``value_counts`` Series or a
        dict); labels that are absent count as zero. It is only read, never
        modified.
    total_count : number of examples the counts are divided by.
    n_labels : number of labels to consider; defaults to the module-level
        ``LABELS_LEN``.

    Returns 1 when ``total_count`` is not positive (empty branch), otherwise
    the impurity.
    """
    if total_count <= 0:
        return 1
    if n_labels is None:
        n_labels = LABELS_LEN

    gini = 1
    for label in range(n_labels):
        # Read the count without inserting a zero entry into the caller's object.
        label_count = count_array[label] if label in count_array else 0
        prob = label_count / (1.0 * total_count)
        gini -= prob ** 2
    return gini

def calculate_gini(labels):
    """Gini impurity of a label column, computed from its value counts via ``find_gini``."""
    label_counts = labels.value_counts()
    n_examples = len(labels)
    return find_gini(label_counts, n_examples)

In [None]:
def get_features_labels(dataset):
    """Split ``dataset`` into its feature columns and its ``'decision'`` label column.

    Returns ``(features, labels)`` where ``features`` is ``dataset`` without the
    ``'decision'`` column and ``labels`` is that column.
    """
    labels = dataset['decision']
    features = dataset.drop('decision', axis=1)
    return features, labels

In [None]:
def predict_label(labels):
    """Return the most frequent label and the fraction of ``labels`` that carry it.

    The second value is a fraction between 0 and 1, not a percentage.
    """
    label_counts = labels.value_counts()
    majority_label = label_counts.idxmax()
    majority_share = label_counts.max() / (1.0 * len(labels))
    return majority_label, majority_share

In [None]:
def count_branch(attr, attr_value, dataframe):
    """Count the ``'decision'`` labels of the rows where ``attr`` equals ``attr_value``.

    Returns ``(count_array, total)``: the value counts of ``'decision'`` within
    that branch and the number of rows they add up to.
    """
    in_branch = dataframe[attr] == attr_value
    count_array = dataframe.loc[in_branch, "decision"].value_counts()
    return count_array, count_array.sum()

In [None]:
def calculate_gini_gain(attr, dataframe, gini_sample):
    """Gini gain of splitting ``dataframe`` on the binary attribute ``attr``.

    The rows with ``attr == 0`` form the left branch and those with
    ``attr == 1`` the right branch. The gain is ``gini_sample`` minus the
    size-weighted average impurity of the two branches.
    """
    (left_counts, n_left), (right_counts, n_right) = (
        count_branch(attr, branch_value, dataframe) for branch_value in (0, 1)
    )

    left_impurity = find_gini(left_counts, n_left)
    right_impurity = find_gini(right_counts, n_right)

    weighted_impurity = (left_impurity*n_left + right_impurity*n_right)/(n_left+n_right)
    return gini_sample - weighted_impurity

In [None]:
def get_inference(root, test_features, test_labels):
    """Accuracy, in percent, of the decision tree ``root`` on a labelled data set.

    Parameters
    ----------
    root : root node of the tree, as accepted by ``get_label_decision_tree``.
    test_features : DataFrame whose rows are classified one by one.
    test_labels : true labels, in the same row order as ``test_features``.

    Raises ``ZeroDivisionError`` when ``test_labels`` is empty.
    """
    # Compare by position: indexing a Series with ``[i]`` is label-based and
    # picks the wrong row (or raises KeyError) when the index is not 0..n-1.
    actual_labels = np.asarray(test_labels)
    correct_points = 0
    for i in range(len(test_features)):
        predicted_label = get_label_decision_tree(root, test_features.iloc[i])
        if actual_labels[i] == predicted_label:
            correct_points += 1

    accuracy = correct_points*100.0/len(test_labels)
    return accuracy

In [None]:
def get_label_decision_tree(root, test_point):
    """Classify ``test_point`` by walking the tree down from ``root``.

    At each internal node the walk goes left when the point's value for the
    node's attribute is 0 and a left child exists; otherwise it goes right when
    a right child exists. The walk ends at a node whose ``attr`` is ``None`` or
    at a node without a usable child, and that node's ``predicted_label`` is
    returned.
    """
    node = root
    while node.attr is not None:
        if test_point[node.attr] == 0 and node.left is not None:
            node = node.left
        elif node.right is not None:
            node = node.right
        else:
            # No child to descend into: fall back to this node's own label.
            break
    return node.predicted_label

In [None]:
# Reload the two CSV splits and separate the 'decision' label column from the feature columns.
trainingSet = pd.read_csv("trainingSet.csv")
testSet = pd.read_csv("testSet.csv")
training_features, training_labels = get_features_labels(trainingSet)
test_features, test_labels = get_features_labels(testSet)

In [None]:
def create_decision_tree(trainingSet, depth, is_random_forest, excluded_features):
    """Recursively grow a binary decision tree using Gini gain.

    Parameters
    ----------
    trainingSet : DataFrame with a ``'decision'`` label column and binary (0/1)
        attribute columns.
    depth : depth of the node being built (the root is built with 0).
    is_random_forest : when true, only a random sample of ``sqrt(#candidates)``
        attributes is considered at each node.
    excluded_features : column names that must not be used as split attribute.
        Callers pass ``{"decision"}`` so the label column is never split on.
        The collection is not modified.

    Returns the ``Node`` at the root of the (sub)tree. A leaf is a ``Node``
    whose ``attr`` is ``None``. A leaf is produced when ``MAX_DEPTH`` is
    reached, when all examples share one label, when at most 50 examples are
    left, or when no candidate attribute yields a usable gain.
    """
    predicted_label, confidence = predict_label(trainingSet['decision'])
    # predict_label reports confidence as a fraction in [0, 1], so a pure node is 1.0.
    if depth >= MAX_DEPTH or confidence >= 1.0:
        return Node(None, predicted_label)

    if len(trainingSet) <= 50:
        return Node(None, predicted_label)

    gini_sample = calculate_gini(trainingSet['decision'])
    columns = trainingSet.columns.difference(excluded_features)
    if is_random_forest:
        num_samples = int(math.sqrt(len(columns)))
        # random.sample needs a sequence; sets are rejected from Python 3.11 on.
        columns = random.sample(list(columns), num_samples)

    max_gini_gain = float('-inf')
    max_split_attr = None
    for attr in columns:
        gini_gain = calculate_gini_gain(attr, trainingSet, gini_sample)
        # A NaN gain (attribute with no 0/1 rows) never compares greater, so it is skipped.
        if gini_gain > max_gini_gain:
            max_gini_gain = gini_gain
            max_split_attr = attr

    # No candidate attribute, or none with a usable gain: stop here with a leaf.
    if max_split_attr is None:
        return Node(None, predicted_label)

    left_trainingSet = trainingSet[trainingSet[max_split_attr]==0]
    right_trainingSet = trainingSet[trainingSet[max_split_attr]==1]

    node = Node(max_split_attr, predicted_label)

    # The chosen attribute is off-limits below this node only; building a new
    # set keeps the caller's collection untouched.
    child_excluded = set(excluded_features) | {max_split_attr}
    if len(left_trainingSet) > 0:
        node.left = create_decision_tree(left_trainingSet, depth+1, is_random_forest, child_excluded)
    if len(right_trainingSet) > 0:
        node.right = create_decision_tree(right_trainingSet, depth+1, is_random_forest, child_excluded)
    return node

In [None]:
# Grow a single decision tree on the training set (is_random_forest=False: every attribute is a split candidate).
root = create_decision_tree(trainingSet, 0, False, {"decision"})

In [None]:
# Accuracy (in percent) of the single tree on the test set.
get_inference(root, test_features, test_labels)

In [None]:
# Accuracy (in percent) of the single tree on the training set it was grown from.
get_inference(root, training_features, training_labels)

In [None]:
# In-order dump of the tree: one line per node with its split attribute and stored label.
root.PrintTree()

In [None]:
# Scratch check: comparing a list with a NumPy array is element-wise, so the sum
# counts the positions where both hold the same value.
a=[1,2,3]
b=np.array([3,2,5])
np.sum(a==b)

In [None]:
def do_inference_bagging(baggedTrees, test_features, test_labels):
    """Classify every row of ``test_features`` by majority vote over ``baggedTrees``.

    Each tree votes through ``get_label_decision_tree``; the label with the most
    votes wins and ties go to the label that was voted first. ``test_labels``
    is compared element-wise against the list of predictions.

    Returns ``(predicted_labels, accuracy)`` with the accuracy in percent.
    """
    predicted_labels = []
    for row_number in range(len(test_features)):
        test_point = test_features.iloc[row_number]
        votes = [get_label_decision_tree(tree, test_point) for tree in baggedTrees]
        predicted_labels.append(max(votes, key = votes.count))

    n_correct = np.sum(predicted_labels == test_labels)
    accuracy = 100*n_correct/(1.0*len(test_labels))
    return predicted_labels, accuracy

In [None]:
def create_bagged_trees(trainingSet, num_trees=30):
    """Train an ensemble of decision trees on bootstrap samples of ``trainingSet``.

    Parameters
    ----------
    trainingSet : DataFrame containing the ``'decision'`` label column.
    num_trees : number of trees to grow (default 30).

    Each tree is grown on a sample of the same size as ``trainingSet`` drawn
    with replacement, with every attribute available at each split.
    Returns the list of tree roots.
    """
    baggedTrees = []
    for _ in range(num_trees):
        sampledSet = trainingSet.sample(frac=1, replace=True)
        baggedTrees.append(create_decision_tree(sampledSet, 0, False, {"decision"}))
    return baggedTrees

In [None]:
def bagging(trainingSet, testSet):
    """Train bagged trees on ``trainingSet`` and evaluate them on ``testSet``.

    Both arguments are DataFrames containing the ``'decision'`` label column.
    The evaluation uses the ``testSet`` argument itself rather than any
    notebook-level variables.

    Returns ``(predictions, accuracy)`` as produced by ``do_inference_bagging``.
    """
    baggedTrees = create_bagged_trees(trainingSet)
    eval_features, eval_labels = get_features_labels(testSet)
    predictions, accuracy = do_inference_bagging(baggedTrees, eval_features, eval_labels)
    return predictions, accuracy

In [None]:
# Fit the bagged ensemble of decision trees on the training set.
baggedTrees = create_bagged_trees(trainingSet)

In [None]:
# Majority-vote accuracy (in percent) of the bagged trees, first on the test set, then on the training set.
test_predictions, test_accuracy = do_inference_bagging(baggedTrees, test_features, test_labels)
print(test_accuracy)
train_predictions, train_accuracy = do_inference_bagging(baggedTrees, training_features, training_labels)
print(train_accuracy)

In [None]:
# Inspect the column names of the training set.
trainingSet.columns

In [None]:
# col=trainingSet.columns
# col = col
# # print(col)
# r={"decision","age1"}
# col=col.difference(r)
# num = int(math.sqrt(len(col)))
# col=random.sample(set(col), num)
# print (col)

In [None]:
# def adhoc_recursion(l, side, depth):
#     if depth == 3:
#         print(l)
#         return
#     if side:
#         l.append(1)
#     else:
#         l.append(2)
#     adhoc_recursion(l, True, depth+1)
#     del l[-1]
#     adhoc_recursion(l, False, depth+1)
#     del l[-1]
    
# l=[]
# side = True
# adhoc_recursion(l,side, 1)

In [None]:
def create_random_forest(trainingSet, num_trees=30):
    """Train a random forest on bootstrap samples of ``trainingSet``.

    Parameters
    ----------
    trainingSet : DataFrame containing the ``'decision'`` label column.
    num_trees : number of trees to grow (default 30).

    Each tree is grown on a sample of the same size as ``trainingSet`` drawn
    with replacement, and ``create_decision_tree`` is called with
    ``is_random_forest=True`` so that it considers only a random subset of the
    attributes at each split. Returns the list of tree roots.
    """
    rfTrees = []
    for _ in range(num_trees):
        sampledSet = trainingSet.sample(frac=1, replace=True)
        rfTrees.append(create_decision_tree(sampledSet, 0, True, {"decision"}))
    return rfTrees

In [None]:
# Fit the random forest on the training set.
rfTrees =create_random_forest(trainingSet)

In [None]:
# Evaluate the random forest on the test set with the same majority-vote routine used for bagging.
# Note: this rebinds test_predictions / test_accuracy from the bagging evaluation.
test_predictions, test_accuracy=do_inference_bagging(rfTrees, test_features, test_labels)

In [None]:
# Random-forest accuracy on the test set, in percent.
test_accuracy

In [None]:
# Evaluate the random forest on the training set (rebinds train_predictions / train_accuracy).
train_predictions, train_accuracy=do_inference_bagging(rfTrees, training_features, training_labels)

In [None]:
# Random-forest accuracy on the training set, in percent.
train_accuracy