# Data Science for Business - Logit on German Credit

## Initialize notebook
Load required packages. Set up workspace, e.g., set theme for plotting and initialize the random number generator.

In [2]:
import warnings
# NOTE(review): this silences every warning for the whole notebook, which would
# also hide any warnings raised while fitting the model below -- consider
# narrowing the filter to specific categories.
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, auc, RocCurveDisplay
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix

import statsmodels.api as sm
import statsmodels.formula.api as smf


In [3]:
# Plot styling: use matplotlib's bundled 'fivethirtyeight' style sheet.
plt.style.use('fivethirtyeight')
# Seed NumPy's legacy global random number generator for reproducibility.
np.random.seed(42)

## Load data

Load the training data from the CSV file, drop the leftover index column (`Unnamed: 0`), and rescale `Credit_amount` to thousands.

In [4]:
# Build the working frame in a single chain:
#   1. read the raw CSV,
#   2. discard the 'Unnamed: 0' column (an index column written out with the file),
#   3. express Credit_amount in thousands so its coefficient is easier to read.
data = (
    pd.read_csv('german_credit.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(Credit_amount=lambda frame: frame["Credit_amount"] / 1000)
)

In [5]:
# Preview the first rows (up to 20) to sanity-check the columns and values.
data.head(20)

Unnamed: 0,Age,Sex,Job,Housing,Saving_accounts,Checking_account,Credit_amount,Duration,Purpose,Credit_risk
0,67,male,2,own,,little,1.169,6,radio/TV,0
1,22,female,2,own,little,moderate,5.951,48,radio/TV,1
2,49,male,1,own,little,,2.096,12,education,0
3,45,male,2,free,little,little,7.882,42,furniture/equipment,0
4,53,male,2,free,little,little,4.87,24,car,1
5,35,male,1,free,,,9.055,36,education,0
6,53,male,2,own,quite rich,,2.835,24,furniture/equipment,0
7,35,male,3,rent,little,moderate,6.948,36,car,0
8,61,male,1,own,rich,,3.059,12,radio/TV,0
9,28,male,3,own,little,moderate,5.234,30,car,1


In [6]:
# Number of rows per category of the Sex column.
data["Sex"].value_counts()

Sex
male      656
female    298
Name: count, dtype: int64

In [7]:
# Number of rows per category of the Housing column.
data["Housing"].value_counts()

Housing
own     680
rent    169
free    105
Name: count, dtype: int64

In [8]:
# Summary statistics for the numeric columns (Credit_amount is already in thousands here).
data.describe()

Unnamed: 0,Age,Job,Credit_amount,Duration,Credit_risk
count,954.0,954.0,954.0,954.0,954.0
mean,35.501048,1.909853,3.279112,20.780922,0.302935
std,11.379668,0.649681,2.853315,12.046483,0.459768
min,19.0,0.0,0.25,4.0,0.0
25%,27.0,2.0,1.36025,12.0,0.0
50%,33.0,2.0,2.3025,18.0,0.0
75%,42.0,2.0,3.97525,24.0,1.0
max,75.0,3.0,18.424,72.0,1.0


In [9]:
# Fit a logistic regression of Credit_risk on applicant attributes.
# In the formula, `Credit_amount*Duration` expands to both main effects plus
# their interaction term (Credit_amount:Duration).
# Specification and maximum-likelihood fit are chained, so `model_logit`
# refers directly to the fitted results object.
model_logit = smf.logit(
    formula='Credit_risk ~ Age + Sex + Housing + Credit_amount*Duration',
    data=data,
).fit()

Optimization terminated successfully.
         Current function value: 0.576899
         Iterations 5


In [10]:
# Print the plain-text regression table (coefficients, standard errors, z-statistics, p-values).
print(model_logit.summary())

                           Logit Regression Results                           
Dep. Variable:            Credit_risk   No. Observations:                  954
Model:                          Logit   Df Residuals:                      946
Method:                           MLE   Df Model:                            7
Date:                Fri, 28 Feb 2025   Pseudo R-squ.:                 0.05940
Time:                        10:46:27   Log-Likelihood:                -550.36
converged:                       True   LL-Null:                       -585.12
Covariance Type:            nonrobust   LLR p-value:                 1.854e-12
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0.6675      0.461     -1.448      0.148      -1.571       0.236
Sex[T.male]               -0.3995      0.162     -2.470      0.014      -0.716      -0.083
Hous

In [None]:
# Score one hypothetical applicant with the fitted logit model.
# Applicant profile:
#   Age           = 32
#   Sex           = female
#   Housing       = own
#   Credit_amount = 2    (in thousands -- the training column was divided by 1000)
#   Duration      = 12
# The values are wrapped in a single-row DataFrame because the formula-based
# model looks up its predictors by column name.
record = pd.DataFrame(
    {'Age': 32, 'Sex': "female", 'Housing': "own", 'Credit_amount': 2, 'Duration': 12},
    index=[0],
)

# Predicted probability that Credit_risk equals 1 for this applicant.
model_logit.predict(record)

0    0.256041
dtype: float64