# Estimated Loss Calculation using Logistic Regression
#### First step, we load the data

In [60]:
import pandas as pd
import numpy as np

In [61]:
df = pd.read_csv('Loan_Data_CSV.csv')
loan_status = df['total_debt_outstanding'].values
outputs = df['default'].values
features = df.drop(['customer_id', 'default'], axis=1).values
weights = np.zeros(features.shape[1])
bias = 0

#### Standardizing features transforms them to a common scale using: 
$$ x_{standardized} = \frac{x - \mu}{\sigma} $$

In [62]:
def rescale(data):
    return (data - data.mean(axis=0)) / data.std(axis=0)

features = rescale(features)

#### Then, we define our `prediction` and `cost function`

In [63]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predictions(weights, bias, inputs):
    return sigmoid(np.dot(inputs, weights) + bias)

In [64]:
def cost_function(weights, inputs, outputs):
    predictions = sigmoid(np.dot(inputs, weights))
    print(predictions)
    return -np.mean(outputs * np.log(predictions) + (1 - outputs) * np.log(1 - predictions))

#### Using the derivative of the `cost function` above, we can calculate the gradient of each weights and bias, hence using gradient descent to calculate the best params for our logistic regression

In [65]:
def gradient_descent(weights, bias, inputs, outputs, learning_rate, epochs):
    for epoch in range(epochs):
        predict = predictions(weights, bias, inputs)
        
        errors = predict - outputs
        dj_dw = np.dot(inputs.T, errors) / len(inputs)
        dj_db = np.mean(errors)

        weights -= learning_rate * dj_dw
        bias -= learning_rate * dj_db
        if epoch % 10000 == 0:
            print(cost_function(weights, inputs, outputs))

    return weights, bias

weights, bias = gradient_descent(weights, bias, features, outputs, 0.001, 50000)

[0.49989323 0.50022532 0.49987791 ... 0.4999864  0.49987716 0.49980898]
0.6929184720911541
[0.18427588 0.96589284 0.20955549 ... 0.56351078 0.25394885 0.19901989]
0.3837800127141726
[0.12649755 0.98814214 0.1627092  ... 0.6068239  0.22775393 0.17981914]
0.36551958408053176
[0.09860228 0.99406124 0.13810331 ... 0.63559076 0.21474521 0.17111878]
0.3600347237683505
[0.08133544 0.99650149 0.12146896 ... 0.65717624 0.20544796 0.16497969]
0.35827032228239125


#### After obtaining the weights and bias, we calculate the accuracy of our logistic regression

In [69]:
def accuracy(weights, bias, inputs, outputs):
    predict = predictions(weights, bias, inputs)
    predict = np.round(predict)
    return np.mean(1 - (predict - outputs))*100

print(f"Accuracy is {accuracy(weights, bias, features, outputs)}%")

Accuracy is 99.59%


#### The accuracy is 99.6%, which is quite good
Now, based on our calculation, we can calculate the estimated loss

In [67]:
def calculate_estimated_loss(weights, bias, inputs):
    predicts = predictions(weights, bias, inputs)
    # Assume that the recovery rate is 10% of the total debt outstanding
    estimated_loss = predicts * 0.9 * loan_status
    return np.round(np.sum(estimated_loss))
    
print(calculate_estimated_loss(weights, bias, features))

33358621.0
