In [1]:
import os

import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Read recipe inputs
german_df = pd.read_csv("german_credit.csv")
german_df = german_df.drop(columns=['col_0'])
german_df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [2]:
cols = ['under_25', 'sex', 'job', 'housing', 'savings', 'checking', 'credit_amt', 'duration', 'purpose', 'outcomes']
german_df['under_25'] = german_df['Age'].apply(lambda x : True if x < 25 else False)
german_df['sex'] = german_df['Sex'].apply(lambda x : x.capitalize())
german_df['job'] = german_df['Job'].apply(lambda x : 'None' if x == 0 else '1 Job' if x == 1 else 'Multiple Jobs')
german_df['housing'] = german_df['Housing'].apply(lambda x : x.capitalize())
german_df['savings'] = german_df['Saving accounts'].apply(lambda x : x.capitalize() if x == x else 'N/A')
german_df['checking'] = german_df['Checking account'].apply(lambda x : x.capitalize() if x == x else 'N/A')
credit_amts = {0:'Low', 1:'Moderate', 2:'High', 3:'Very High'}
german_df['credit_amt'] = np.digitize(german_df['Credit amount'], [1000, 3000, 10000])
german_df['credit_amt'] = german_df['credit_amt'].apply(lambda x : credit_amts[x])
duration_lengths = {0:'Very Short', 1:'Short', 2:'Moderate', 3:'Long', 4:'Very Long'}
german_df['duration'] = np.digitize(german_df['Duration'], [6, 12, 24, 48])
german_df['duration'] = german_df['duration'].apply(lambda x : duration_lengths[x])
german_df['purpose'] = german_df['Purpose'].apply(lambda x : '/'.join([a.capitalize() if a != 'TV' else a for a in x.split('/')]))
german_df['outcomes'] = german_df['Risk'].apply(lambda x : 1 if x == 'bad' else 0) # take outcome = 1 to mean bad
german_prep_df = german_df[cols]
german_prep_df

Unnamed: 0,under_25,sex,job,housing,savings,checking,credit_amt,duration,purpose,outcomes
0,False,Male,Multiple Jobs,Own,,Little,Moderate,Short,Radio/TV,0
1,True,Female,Multiple Jobs,Own,Little,Moderate,High,Very Long,Radio/TV,1
2,False,Male,1 Job,Own,Little,,Moderate,Moderate,Education,0
3,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Furniture/Equipment,0
4,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Car,1
...,...,...,...,...,...,...,...,...,...,...
995,False,Female,1 Job,Own,Little,,Moderate,Moderate,Furniture/Equipment,0
996,False,Male,Multiple Jobs,Own,Little,Little,High,Long,Car,0
997,False,Male,Multiple Jobs,Own,Little,,Low,Moderate,Radio/TV,0
998,True,Male,Multiple Jobs,Free,Little,Little,Moderate,Long,Radio/TV,1


In [3]:
print("Summary of all columns of the dataset:")
for col in german_prep_df.columns:
    print("\n")
    print(german_prep_df.loc[:,col].value_counts())
german_prep_df.head(10)

Summary of all columns of the dataset:


False    851
True     149
Name: under_25, dtype: int64


Male      690
Female    310
Name: sex, dtype: int64


Multiple Jobs    778
1 Job            200
None              22
Name: job, dtype: int64


Own     713
Rent    179
Free    108
Name: housing, dtype: int64


Little        603
N/A           183
Moderate      103
Quite rich     63
Rich           48
Name: savings, dtype: int64


N/A         394
Little      274
Moderate    269
Rich         63
Name: checking, dtype: int64


Moderate     504
High         340
Low          116
Very High     40
Name: credit_amt, dtype: int64


Moderate      406
Long          350
Short         173
Very Long      64
Very Short      7
Name: duration, dtype: int64


Car                    337
Radio/TV               280
Furniture/Equipment    181
Business                97
Education               59
Repairs                 22
Domestic appliances     12
Vacation/Others         12
Name: purpose, dtype: int64


0    700
1

Unnamed: 0,under_25,sex,job,housing,savings,checking,credit_amt,duration,purpose,outcomes
0,False,Male,Multiple Jobs,Own,,Little,Moderate,Short,Radio/TV,0
1,True,Female,Multiple Jobs,Own,Little,Moderate,High,Very Long,Radio/TV,1
2,False,Male,1 Job,Own,Little,,Moderate,Moderate,Education,0
3,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Furniture/Equipment,0
4,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Car,1
5,False,Male,1 Job,Free,,,High,Long,Education,0
6,False,Male,Multiple Jobs,Own,Quite rich,,Moderate,Long,Furniture/Equipment,0
7,False,Male,Multiple Jobs,Rent,Little,Moderate,High,Long,Car,0
8,False,Male,1 Job,Own,Rich,,High,Moderate,Radio/TV,0
9,False,Male,Multiple Jobs,Own,Little,Moderate,High,Long,Car,1


### Build Models on Whole Dataset

Select only covariates and outcomes to prepare for simulation.

One-Hot encode categorical features.

In [4]:
german_model_df = german_prep_df

In [5]:
for cat_col in german_model_df.select_dtypes(include=['object', 'bool']).columns:
    german_model_df[cat_col] = german_model_df[cat_col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
X_num = german_model_df.select_dtypes(exclude=['category'])
X_cat = german_model_df.select_dtypes(include=['category'])

In [7]:
# one-hot encoding of categorical features
X_encoded = pd.get_dummies(X_cat)
frames = [X_encoded, X_num]
german_model_df = pd.concat(frames, axis=1)
german_model_df

Unnamed: 0,under_25_False,under_25_True,sex_Female,sex_Male,job_1 Job,job_Multiple Jobs,job_None,housing_Free,housing_Own,housing_Rent,...,duration_Very Short,purpose_Business,purpose_Car,purpose_Domestic appliances,purpose_Education,purpose_Furniture/Equipment,purpose_Radio/TV,purpose_Repairs,purpose_Vacation/Others,outcomes
0,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,1,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
2,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
996,1,0,0,1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
997,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
998,0,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [8]:
extra_cols = ['under_25_False', 'sex_Female', 'job_None', 'housing_Free', 'savings_N/A', 'checking_N/A', 'credit_amt_Low', 'duration_Short']
german_model_df = german_model_df.drop(columns=extra_cols)
german_model_df

Unnamed: 0,under_25_True,sex_Male,job_1 Job,job_Multiple Jobs,housing_Own,housing_Rent,savings_Little,savings_Moderate,savings_Quite rich,savings_Rich,...,duration_Very Short,purpose_Business,purpose_Car,purpose_Domestic appliances,purpose_Education,purpose_Furniture/Equipment,purpose_Radio/TV,purpose_Repairs,purpose_Vacation/Others,outcomes
0,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0,1,1,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
996,0,1,0,1,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
997,0,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
998,1,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [9]:
X = german_model_df.drop(columns=['outcomes'])
y = german_model_df['outcomes']

In [10]:
# Build Logistic Regression
lr = LogisticRegression(random_state=0)
lr.fit(X, y)
proba = lr.predict_proba(X)[:,1]
german_model_df['proba_lr'] = proba
german_prep_df['proba_lr'] = proba
german_prep_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,under_25,sex,job,housing,savings,checking,credit_amt,duration,purpose,outcomes,proba_lr
0,False,Male,Multiple Jobs,Own,,Little,Moderate,Short,Radio/TV,0,0.114175
1,True,Female,Multiple Jobs,Own,Little,Moderate,High,Very Long,Radio/TV,1,0.612611
2,False,Male,1 Job,Own,Little,,Moderate,Moderate,Education,0,0.154985
3,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Furniture/Equipment,0,0.545431
4,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Car,1,0.574045
...,...,...,...,...,...,...,...,...,...,...,...
995,False,Female,1 Job,Own,Little,,Moderate,Moderate,Furniture/Equipment,0,0.136923
996,False,Male,Multiple Jobs,Own,Little,Little,High,Long,Car,0,0.517829
997,False,Male,Multiple Jobs,Own,Little,,Low,Moderate,Radio/TV,0,0.135555
998,True,Male,Multiple Jobs,Free,Little,Little,Moderate,Long,Radio/TV,1,0.550995


In [11]:
# Check accuracy of Logistic Regression
german_model_df['test_outcomes'] = german_prep_df['proba_lr'].apply(lambda x : 1 if x > 0.5 else 0)
german_model_df['check'] = german_model_df.apply(lambda x : 1 if x['outcomes'] == x['test_outcomes'] else 0, axis=1)
print(np.sum(german_model_df['check']) / len(german_model_df))
german_model_df = german_model_df.drop(columns=['test_outcomes', 'check'])

0.747


In [12]:
# Build Random Forest
rf = RandomForestClassifier(random_state=0)
rf.fit(X, y)
proba = rf.predict_proba(X)[:,1]
german_model_df['proba_rf'] = proba
german_prep_df['proba_rf'] = proba
german_prep_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,under_25,sex,job,housing,savings,checking,credit_amt,duration,purpose,outcomes,proba_lr,proba_rf
0,False,Male,Multiple Jobs,Own,,Little,Moderate,Short,Radio/TV,0,0.114175,0.046250
1,True,Female,Multiple Jobs,Own,Little,Moderate,High,Very Long,Radio/TV,1,0.612611,0.803333
2,False,Male,1 Job,Own,Little,,Moderate,Moderate,Education,0,0.154985,0.092917
3,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Furniture/Equipment,0,0.545431,0.141350
4,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Car,1,0.574045,0.213719
...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,Female,1 Job,Own,Little,,Moderate,Moderate,Furniture/Equipment,0,0.136923,0.122083
996,False,Male,Multiple Jobs,Own,Little,Little,High,Long,Car,0,0.517829,0.687929
997,False,Male,Multiple Jobs,Own,Little,,Low,Moderate,Radio/TV,0,0.135555,0.010000
998,True,Male,Multiple Jobs,Free,Little,Little,Moderate,Long,Radio/TV,1,0.550995,0.829405


In [13]:
# Check accuracy of Random Forest
german_model_df['test_outcomes'] = german_prep_df['proba_rf'].apply(lambda x : 1 if x > 0.5 else 0)
german_model_df['check'] = german_model_df.apply(lambda x : 1 if x['outcomes'] == x['test_outcomes'] else 0, axis=1)
print(np.sum(german_model_df['check']) / len(german_model_df))
german_model_df = german_model_df.drop(columns=['test_outcomes', 'check'])

0.933


In [14]:
# Write recipe outputs
os.mkdir("datasets")
german_prep_df.to_csv("datasets/german_credit_prep.csv")

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'datasets'