In [432]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Get the data

In [439]:
loans = pd.read_csv("./lending-club-data.csv")

In [441]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'
loans = loans[features + [target]]

In [442]:
for feature in loans.columns:
    print(feature, loans[feature].unique())

('grade', array(['B', 'C', 'A', 'E', 'F', 'D', 'G'], dtype=object))
('term', array([' 36 months', ' 60 months'], dtype=object))
('home_ownership', array(['RENT', 'OWN', 'MORTGAGE', 'OTHER'], dtype=object))
('emp_length', array(['10+ years', '< 1 year', '3 years', '9 years', '4 years',
       '5 years', '1 year', '6 years', '2 years', '7 years', '8 years',
       'n/a'], dtype=object))
('safe_loans', array([ 1, -1]))


### Categorical transformation

In [443]:
loans_cat = pd.get_dummies(loans)

In [444]:
loans_cat.head(5)

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,...,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,emp_length_n/a
0,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,-1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


### Data balancing

In [446]:
loans_cat.groupby(["safe_loans"]).count()

Unnamed: 0_level_0,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,...,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,emp_length_n/a
safe_loans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,23150,23150,23150,23150,23150,23150,23150,23150,23150,23150,...,23150,23150,23150,23150,23150,23150,23150,23150,23150,23150
1,99457,99457,99457,99457,99457,99457,99457,99457,99457,99457,...,99457,99457,99457,99457,99457,99457,99457,99457,99457,99457


In [447]:
unsafe = loans_cat[loans_cat.safe_loans == -1]
safe = loans_cat[loans_cat.safe_loans == 1].sample(n = len(unsafe))

In [448]:
data = pd.concat([unsafe,safe]).sample(frac=1).reset_index().drop("index",axis=1)

In [449]:
data.head()

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,...,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,emp_length_n/a
0,1,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,-1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,-1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


### Create decision tree

In [306]:
class Node:
    def __init__(self, split_feature):
        self.children = [None, None]
        self.split_feature = split_feature
        self.isLeaf = None
        self.prediction = None
        return
    
    def makeLeaf(self, data, target):
        self.isLeaf = True 
        if len(data[data[target] == 1]) > len(data[data[target] == -1]):
            self.prediction = 1
        else:
            self.prediction = -1
        return

In [392]:
class DecisionTree:
    def __init__(self, data, features, target, max_depth=10, groups = [0,1], classes = [-1,1]):
        self.groups = groups
        self.classes = classes
        self.target = target
        self.max_depth = max_depth
        self.root = self.__getTree(data, features, 0)
        return

    def printTreeLevelOrder(self):
        """
        Use the level order tree traversal to print the tree
        """
        root = self.root
        queue = [root, ""]
        levels = [[]]
        while len(queue):
            root = queue[0]
            queue.pop(0)

            if root == "":
                if len(queue):
                    queue.append("")
                levels.append([])
                continue
            levels[-1].append(root.split_feature)
            if root.children[0]:
                queue.append(root.children[0])
            if root.children[1]:
                queue.append(root.children[1])
        for level in levels:
            print(level)

    def __getTree(self, data, features, depth):
        remaining_features = features[:]
        best_feature = self.getFeatureToSplit(data, remaining_features)
        remaining_features.remove(best_feature)
        node = Node(best_feature)

        #1. No more features left
        if not len(remaining_features):
            node.makeLeaf(data,target)
            return node
        #2. Only one class left in the data
        #3. Check for depth
        if not len(data[data[self.target]==1]) or not len(data[data[self.target]==-1]) or depth >= self.max_depth:
            node.makeLeaf(data,self.target)
            return node

        #pre-order
        node.children[0] = self.__getTree(data[data[best_feature]==0], remaining_features, depth+1)
        node.children[1] = self.__getTree(data[data[best_feature]==1], remaining_features, depth+1)
        return node

    def getFeatureToSplit(self, data, features):
        min_error = 999999999
        min_error_feature = None
        for feature in features:
            error = 0.0
            for group in self.groups:
                error += computeError(data[data[feature] == group], self.target)
            if error < min_error:
                min_error = error
                min_error_feature = feature
        return min_error_feature

    def computeError(self, data):
        return min(len(data[data[self.target]==1]),len(data[data[self.target]==-1]))
    
    def predict(self, x, node=None):
        """
        input: x : dataframe of a sample
        output: predict_class
        """
        if not node:
            node = self.root
        if node.isLeaf:
            return node.prediction
        split_feature_value = x[node.split_feature]
        if split_feature_value == 0:
            return self.predict(x, node.children[0])
        else:
            return self.predict(x, node.children[1])

### Train decision tree

In [393]:
X_train, X_test, Y_train, Y_test = train_test_split(data.drop("safe_loans",axis=1), data["safe_loans"],test_size=.1,random_state=42)
target = "safe_loans"
features = list(X_train.columns)
#features.remove("safe_loans")
X_train[target] = Y_train

In [404]:
tree = DecisionTree(X_train, features, target, 3)

### Predict

In [405]:
predictions = []
for i, row in X_test.iterrows():
    predictions.append(tree.predict(row))

In [400]:
accuracy_score(predictions, Y_test)

0.6058315334773218

### HP tuning

In [None]:
tree = DecisionTree(X_train, features, target)

In [403]:
accuracy_score(predictions, Y_test)

0.6157667386609071

### Evaluate your model

In [413]:
from sklearn import tree

In [414]:
clf = tree.DecisionTreeClassifier()

In [419]:
clf.fit(X_train.drop("safe_loans",axis=1), Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [420]:
accuracy_score(clf.predict(X_test), Y_test)

0.6123110151187905

In [428]:
clf = RandomForestClassifier(n_estimators=1000,random_state=0)

In [429]:
clf.fit(X_train.drop("safe_loans",axis=1), Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [430]:
accuracy_score(clf.predict(X_test), Y_test)

0.6110151187904967