# German Credit Risk Data

Data from https://www.kaggle.com/mpwolke/cusersmarildownloadsgermancsv

- account_balance
    - 1 -> no account
    - 2 -> none
    - 3 -> below 200 dm
    - 4 -> above 200 dm
- payment_status
    - 0 -> delayed
    - 1 -> other credits
    - 2 -> paid up
    - 3 -> no problem with current credits
    - 4 -> previous credit paid
- purpose
    - 0 -> other
    - 1 -> new car
    - 2 -> used car
    - 3 -> furniture
    - 4 -> radio/TV
    - 5 -> appliances
    - 6 -> repair
    - 7 -> vacation
    - 8 -> retraining
    - 9 ->  business
- value_savings_stocks
    - 0 -> none
    - 1 -> bellow 100 dm
    - 2 -> \[100, 500\)
    - 3 -> \[500, 1000\)
    - 4 -> above 1000
- length_of_current_employment
    - 1 -> unemployed
    - 2 -> < 1 year
    - 3 -> [1, 4)
    - 4 -> [4, 7)
    - 5 -> above 7
- instalment_percent
    - 1 -> above 35%
    - 2 ->  (25\%, 35\%)
    - 3 ->  [20\%, 25\%)
    - 4 -> below 20\%
- sex_marital_status
    - 1 -> male, divorces
    - 2 -> male, single
    - 3 -> male, married/widowed
    - 4 -> female
- duration_in_current_address    
    - 1 -> < 1 year
    - 2 -> [1, 4)
    - 3 -> [4, 7)
    - 4 -> above 7 
- most_valuable_available_asset
    - 1 -> none 
    - 2 -> car
    - 3 -> life insurance
    - 4 -> real estate
- type_of_apartment
    - 1 -> free 
    - 2 -> rented
    - 3 -> owned
- no_of_credits_at_this_bank
    - 1 -> 1 
    - 2 -> 2 or 3
    - 3 -> 4 or 5
    - 4 -> above 6
- occupation
    - 1 -> unemployed, unskilled
    - 2 -> unskilled Permanent Resident
    - 3 -> skilled
    - 4 -> executive
- no_of_dependents
    - 1 -> 3 or more
    - 2 -> less than 3
- foreign_worker
    - 1 -> no
    - 2 -> yes

In [1]:
import pandas as pd

In [91]:
orderred_categorical_columns = ['account_balance', 'duration_of_credit_monthly', 'value_savings_stocks', 'length_of_current_employment', 'instalment_percent', 
                                'duration_in_current_address', 'no_of_credits_at_this_bank', 'no_of_dependents']
unorderred_categorical_columns = ['payment_status_of_previous_credit', 'purpose', 'sex_marital_status', 'most_valuable_available_asset', 'type_of_apartment', 'occupation']

In [98]:
german = (
    pd.read_csv('raw-data/german.csv', sep=';')
        .rename(str.lower, axis='columns')
        .rename(columns= {'instalment_per_cent': 'instalment_percent'})
        .assign(telephone = lambda df: df.telephone == 2)
        .assign(foreign_worker = lambda df: df.foreign_worker == 2)
        .apply(lambda col:  col.name in unorderred_categorical_columns if col.astype('category') else col, axis=0)
)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [80]:
german.head()

Unnamed: 0,creditability,account_balance,duration_of_credit_monthly,payment_status_of_previous_credit,purpose,credit_amount,value_savings_stocks,length_of_current_employment,instalment_percent,sex_marital_status,...,duration_in_current_address,most_valuable_available_asset,age_years,concurrent_credits,type_of_apartment,no_of_credits_at_this_bank,occupation,no_of_dependents,telephone,foreign_worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,False,False
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,False,False
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,False,False
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,False,True
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,False,True


In [83]:
german.dtypes

creditability                        category
account_balance                      category
duration_of_credit_monthly           category
payment_status_of_previous_credit    category
purpose                              category
credit_amount                        category
value_savings_stocks                 category
length_of_current_employment         category
instalment_percent                   category
sex_marital_status                   category
guarantors                           category
duration_in_current_address          category
most_valuable_available_asset        category
age_years                            category
concurrent_credits                   category
type_of_apartment                    category
no_of_credits_at_this_bank           category
occupation                           category
no_of_dependents                     category
telephone                            category
foreign_worker                       category
dtype: object

In [14]:
german.describe()

Unnamed: 0,creditability,account_balance,duration_of_credit_monthly,payment_status_of_previous_credit,purpose,credit_amount,value_savings_stocks,length_of_current_employment,instalment_per_cent,sex_marital_status,...,duration_in_current_address,most_valuable_available_asset,age_years,concurrent_credits,type_of_apartment,no_of_credits_at_this_bank,occupation,no_of_dependents,telephone,foreign_worker
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.7,2.577,20.903,2.545,2.828,3271.248,2.105,3.384,2.973,2.682,...,2.845,2.358,35.542,2.675,1.928,1.407,2.904,1.155,1.404,1.037
std,0.458487,1.257638,12.058814,1.08312,2.744439,2822.75176,1.580023,1.208306,1.118715,0.70808,...,1.103718,1.050209,11.35267,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856
min,0.0,1.0,4.0,0.0,0.0,250.0,1.0,1.0,1.0,1.0,...,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,1.0,12.0,2.0,1.0,1365.5,1.0,3.0,2.0,2.0,...,2.0,1.0,27.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
50%,1.0,2.0,18.0,2.0,2.0,2319.5,1.0,3.0,3.0,3.0,...,3.0,2.0,33.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
75%,1.0,4.0,24.0,4.0,3.0,3972.25,3.0,5.0,4.0,3.0,...,4.0,3.0,42.0,3.0,2.0,2.0,3.0,1.0,2.0,1.0
max,1.0,4.0,72.0,4.0,10.0,18424.0,5.0,5.0,4.0,4.0,...,4.0,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0


In [49]:
german.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   creditability                      1000 non-null   int64
 1   account_balance                    1000 non-null   int64
 2   duration_of_credit_monthly         1000 non-null   int64
 3   payment_status_of_previous_credit  1000 non-null   int64
 4   purpose                            1000 non-null   int64
 5   credit_amount                      1000 non-null   int64
 6   value_savings_stocks               1000 non-null   int64
 7   length_of_current_employment       1000 non-null   int64
 8   instalment_per_cent                1000 non-null   int64
 9   sex_marital_status                 1000 non-null   int64
 10  guarantors                         1000 non-null   int64
 11  duration_in_current_address        1000 non-null   int64
 12  most_valuable_availab

In [77]:
german.instalment_per_cent.value_counts().sort_index() / 1000

1    0.136
2    0.231
3    0.157
4    0.476
Name: instalment_per_cent, dtype: float64

In [76]:
german.foreign_worker.name

'foreign_worker'

In [92]:
'foreign_worker' in unorderred_categorical_columns

False