In [1]:
import pandas
from banker import run
from bankers.randomforestbanker import RandomForestBanker
from bankers.logisticbanker import LogisticBanker
from bankers.kneighborsbanker import KNeighborsBanker
from bankers.randombanker import RandomBanker
from bankers.yesbanker import YesBanker

In [2]:
import matplotlib.pyplot as plt

# Column names for the credit data file, in file order. The file is read with
# explicit names, so this order must match the column order on disk.
features = ['checking account balance', 'duration', 'credit history',
            'purpose', 'amount', 'savings', 'employment', 'installment',
            'marital status', 'other debtors', 'residence time',
            'property', 'age', 'other installments', 'housing', 'credits',
            'job', 'persons', 'phone', 'foreign']
target = 'repaid'
df = pandas.read_csv('../../data/credit/german.data', sep=' ',
                     names=features + [target])

# Columns kept as plain numbers (each listed once).
numerical_features = ['duration', 'age', 'residence time', 'installment',
                      'amount', 'persons', 'credits']

# NOTE: despite its name, this list holds every feature that is NOT in
# `numerical_features`, i.e. the columns that get one-hot encoded below.
quantitative_features = [name for name in features
                         if name not in numerical_features]

# One-hot encode the non-numerical columns; drop_first=True drops the first
# level of each encoded column.
X = pandas.get_dummies(df, columns=quantitative_features, drop_first=True)

# Every column of the encoded frame except the target is a model input.
encoded_features = [column for column in X.columns if column != target]

## Test function
def test_decision_maker(X_test, y_test, interest_rate, decision_maker):
    action = decision_maker.get_best_action(X_test)
    loss = X_test['amount']
    gain = X_test['amount']*((1 + interest_rate)**(X_test['duration']) - 1)
    utility = sum(gain[((action == 1) & (y_test == 1))]) - sum(loss[((action == 1) & (y_test == 2))])
    return utility

In [3]:
def model_check(X, decision_maker, interest_rate, n_tests=100):
    """Estimate a decision maker's average utility over repeated random splits.

    Each trial draws a fresh 80/20 train/test split of ``X``, refits
    ``decision_maker`` on the training part and scores it on the held-out
    part with ``test_decision_maker``. The splits are not seeded, so results
    vary from run to run.

    Relies on the module-level names ``encoded_features``, ``target`` and
    ``test_decision_maker``.

    Parameters
    ----------
    X : pandas.DataFrame
        Encoded data containing the ``encoded_features`` columns and the
        ``target`` column.
    decision_maker : object
        Must expose ``fit(X_train, y_train)`` and whatever
        ``test_decision_maker`` calls on it.
    interest_rate : float
        Passed through to ``test_decision_maker``.
    n_tests : int, optional
        Number of random splits to average over (default 100).

    Returns
    -------
    float
        Mean utility over the ``n_tests`` trials (also printed).

    Raises
    ------
    ValueError
        If ``n_tests`` is smaller than 1.
    """
    from sklearn.model_selection import train_test_split

    if n_tests < 1:
        raise ValueError("n_tests must be a positive integer")

    total_utility = 0
    for trial in range(n_tests):
        # Progress as a percentage; "\r" rewrites the same output line.
        print(100 * trial // n_tests, end="%\r")
        X_train, X_test, y_train, y_test = train_test_split(
            X[encoded_features], X[target], test_size=0.2)
        decision_maker.fit(X_train, y_train)
        total_utility += test_decision_maker(X_test, y_test, interest_rate,
                                             decision_maker)

    mean_utility = total_utility / n_tests
    print(mean_utility)
    return mean_utility

In [4]:
interest_rate = 0.005


def _report(heading, banker_cls, **banker_kwargs):
    """Print ``heading``, then evaluate a freshly built banker with model_check.

    The banker is constructed after the heading is printed, using the
    module-level ``interest_rate`` plus any extra keyword arguments.
    """
    print(heading)
    model_check(X,
                banker_cls(interest_rate=interest_rate, **banker_kwargs),
                interest_rate=interest_rate)


# Baselines and the logistic model take no extra hyper-parameters.
_report("RandomBanker: ", RandomBanker)

_report("YesBanker: ", YesBanker)

_report("LogisticBanker: ", LogisticBanker)

# Sweep the number of neighbours.
for k in (1, 5, 15, 25, 35):
    _report(f"KNeighborsBanker with k={k}:", KNeighborsBanker, k=k)

# Sweep the forest size: 25, 50, ..., 150 estimators.
for n in range(25, 151, 25):
    _report(f"RandomForestBanker with estimators={n}:", RandomForestBanker,
            n_estimators=n)

RandomBanker: 
-93907.17059919944
YesBanker: 
-178966.71787845046
LogisticBanker: 
4438.293373158159
KNeighborsBanker with k=1:
-77746.61194827217
KNeighborsBanker with k=5:
-9452.153891927639
KNeighborsBanker with k=15:
2153.596787354184
KNeighborsBanker with k=25:
1943.2497813399598
KNeighborsBanker with k=35:
1723.0386889898793
RandomForestBanker with estimators=25:
4825.266649094837
RandomForestBanker with estimators=50:
5795.888514158417
RandomForestBanker with estimators=75:
5531.171435339596
RandomForestBanker with estimators=100:
5273.536161495547
RandomForestBanker with estimators=125:
5754.232970240754
RandomForestBanker with estimators=150:
5919.517108775044


In [5]:
# Second evaluation: call `run`, imported from the `banker` module in the
# first cell. Original note: n_estimators = 100 and k = 15 by default.
# NOTE(review): those defaults are not visible in this notebook — confirm
# them against the banker classes / banker.py.
run()

RandomForestBanker (594.6725331071639, 1240.7540590972114)
LogisticBanker (501.32402229703456, 2796.1105655138745)
KNeighborsBanker (226.19090214905282, 595.6521694875348)
RandomBanker (-7056.4158900215525, 11852.850343165503)
YesBanker (-17924.25071484847, 13642.810814722221)


We get the best results with `RandomForestBanker` under both evaluation procedures (`TestLending.py` and `banker.py`). Note that `KNeighborsBanker` standardizes the variables.