In [None]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Read recipe inputs
# Load the raw credit data and discard the 'col_0' column in one step.
german_df = pd.read_csv("german_credit.csv").drop(columns=['col_0'])
german_df

In [None]:
# Derive the model features from the raw columns.  Each derived column is
# added to german_df; `cols` names the subset that is kept in german_prep_df.
cols = ['under_25', 'sex', 'job', 'housing', 'savings', 'checking', 'credit_amt', 'duration', 'purpose', 'outcomes']

# Labels for the np.digitize bin indices used below.  With the default
# right=False, index i means bins[i-1] <= value < bins[i], so every label
# covers a half-open interval [lower, upper).
credit_amts = {0:'Low', 1:'Moderate', 2:'High', 3:'Very High'}
duration_lengths = {0:'Very Short', 1:'Short', 2:'Moderate', 3:'Long', 4:'Very Long'}


def _format_purpose(purpose):
    """Capitalize each '/'-separated part of a purpose, leaving 'TV' unchanged."""
    return '/'.join(part if part == 'TV' else part.capitalize() for part in purpose.split('/'))


german_df['under_25'] = german_df['Age'] < 25
german_df['sex'] = german_df['Sex'].str.capitalize()
# 0 -> 'None', 1 -> '1 Job', any other value -> 'Multiple Jobs'.
# NOTE(review): confirm that the raw 'Job' code really is a job count.
german_df['job'] = german_df['Job'].map({0: 'None', 1: '1 Job'}).fillna('Multiple Jobs')
german_df['housing'] = german_df['Housing'].str.capitalize()
# Missing account information is kept as its own 'N/A' level.
german_df['savings'] = german_df['Saving accounts'].str.capitalize().fillna('N/A')
german_df['checking'] = german_df['Checking account'].str.capitalize().fillna('N/A')
credit_bins = np.digitize(german_df['Credit amount'], [1000, 3000, 10000])
german_df['credit_amt'] = pd.Series(credit_bins, index=german_df.index).map(credit_amts)
duration_bins = np.digitize(german_df['Duration'], [6, 12, 24, 48])
german_df['duration'] = pd.Series(duration_bins, index=german_df.index).map(duration_lengths)
german_df['purpose'] = german_df['Purpose'].apply(_format_purpose)
german_df['outcomes'] = (german_df['Risk'] == 'bad').astype('int64') # take outcome = 1 to mean bad

# Take an explicit copy: later cells add columns to german_prep_df, and writing
# to a bare column selection of german_df triggers SettingWithCopyWarning.
german_prep_df = german_df[cols].copy()
german_prep_df

In [None]:
# Print the value frequencies of every prepared column, then preview the
# first ten rows.
print("Summary of all columns of the dataset:")
for _, column_values in german_prep_df.items():
    print("\n")
    print(column_values.value_counts())
german_prep_df.head(10)

### Build Models on Whole Dataset

Select only covariates and outcomes to prepare for simulation.

One-Hot encode categorical features.

In [None]:
# Work on an independent copy: plain assignment would only alias
# german_prep_df, so the in-place category conversion applied to
# german_model_df afterwards would also change german_prep_df's dtypes.
german_model_df = german_prep_df.copy()

In [None]:
# Convert every object- or bool-typed column to the pandas 'category' dtype,
# replacing the columns of german_model_df in place.
categorical_names = german_model_df.select_dtypes(include=['object', 'bool']).columns
for name in categorical_names:
    german_model_df[name] = german_model_df[name].astype('category')

In [None]:
# Partition the columns by dtype: the categorical columns and everything else.
X_cat = german_model_df.select_dtypes(include='category')
X_num = german_model_df.select_dtypes(exclude='category')

In [None]:
# One-hot encode the categorical columns, then place the non-categorical
# columns back alongside the resulting indicator columns.
X_encoded = pd.get_dummies(X_cat)
frames = [X_encoded, X_num]
german_model_df = pd.concat(frames, axis='columns')
german_model_df

In [None]:
# One indicator column per encoded feature is removed below.
# Note that no `purpose_*` column is listed, so that feature keeps all levels.
extra_cols = [
    'under_25_False',
    'sex_Female',
    'job_None',
    'housing_Free',
    'savings_N/A',
    'checking_N/A',
    'credit_amt_Low',
    'duration_Short',
]
german_model_df = german_model_df.drop(extra_cols, axis='columns')
german_model_df

In [None]:
# Separate the target column ('outcomes') from the feature matrix.
y = german_model_df['outcomes']
X = german_model_df.drop(columns='outcomes')

In [None]:
# Build Logistic Regression
# The model is fit on X, y and then scores those same rows.  Column 1 of
# predict_proba is kept and stored as 'proba_lr' on both frames.
lr = LogisticRegression(random_state=0).fit(X, y)
proba = lr.predict_proba(X)[:, 1]
for frame in (german_model_df, german_prep_df):
    frame['proba_lr'] = proba
german_prep_df

In [None]:
# Check accuracy of Logistic Regression
# A row is predicted as 1 when its stored probability exceeds 0.5; the printed
# value is the share of rows where that prediction equals 'outcomes'.
# The comparison is vectorised and adds no temporary columns, so
# german_model_df is left untouched and the cell can be re-run safely.
predicted_lr = (german_prep_df['proba_lr'] > 0.5).astype('int64')
accuracy_lr = (predicted_lr == german_model_df['outcomes']).mean()
print(accuracy_lr)

In [None]:
# Build Random Forest
# The model is fit on X, y and then scores those same rows.  Column 1 of
# predict_proba is kept and stored as 'proba_rf' on both frames.
rf = RandomForestClassifier(random_state=0).fit(X, y)
proba = rf.predict_proba(X)[:, 1]
for frame in (german_model_df, german_prep_df):
    frame['proba_rf'] = proba
german_prep_df

In [None]:
# Check accuracy of Random Forest
# A row is predicted as 1 when its stored probability exceeds 0.5; the printed
# value is the share of rows where that prediction equals 'outcomes'.
# The comparison is vectorised and adds no temporary columns, so
# german_model_df is left untouched and the cell can be re-run safely.
predicted_rf = (german_prep_df['proba_rf'] > 0.5).astype('int64')
accuracy_rf = (predicted_rf == german_model_df['outcomes']).mean()
print(accuracy_rf)

In [None]:
# Write recipe outputs
# pandas DataFrames are written with `to_csv`; they have no `write_csv` method.
# index=False leaves the row index out so no extra unnamed column is written.
# NOTE(review): this writes german_df (raw plus derived columns, without
# proba_lr / proba_rf); confirm german_prep_df was not the intended output.
german_df.to_csv("datasets/german_credit_prep.csv", index=False)