In [18]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [19]:
# Step 2: Load Dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

# Display column names
print("Columns:", df.columns.tolist())
df.head()


Columns: ['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings', 'employment_duration', 'installment_rate', 'personal_status_sex', 'other_debtors', 'present_residence', 'property', 'age', 'other_installment_plans', 'housing', 'number_credits', 'job', 'people_liable', 'telephone', 'foreign_worker', 'credit_risk']


Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,... < 100 DM,6,critical account/other credits existing,domestic appliances,1169,unknown/no savings account,... >= 7 years,4,male : single,none,...,real estate,67,none,own,2,skilled employee/official,1,yes,yes,1
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,none,...,real estate,22,none,own,1,skilled employee/official,1,no,yes,0
2,no checking account,12,critical account/other credits existing,retraining,2096,... < 100 DM,4 <= ... < 7 years,2,male : single,none,...,real estate,49,none,own,1,unskilled - resident,2,no,yes,1
3,... < 100 DM,42,existing credits paid back duly till now,radio/television,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,guarantor,...,building society savings agreement/life insurance,45,none,for free,1,skilled employee/official,2,no,yes,1
4,... < 100 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,none,...,unknown/no property,53,none,for free,2,skilled employee/official,2,no,yes,0


In [20]:
# Step 3: Set Target Column
target_column = 'credit_risk'  # this is correct based on your screenshot

# Encode target if it's not numeric
le_target = LabelEncoder()
df[target_column] = le_target.fit_transform(df[target_column])  # good -> 1, bad -> 0


In [21]:
# Step 4: Encode Categorical Features
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object' and col != target_column:
        df[col] = le.fit_transform(df[col])


In [22]:
# Step 5: Split Data
X = df.drop(target_column, axis=1)
y = df[target_column]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [23]:
# Step 6: Train Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.785
              precision    recall  f1-score   support

           0       0.71      0.46      0.56        59
           1       0.80      0.92      0.86       141

    accuracy                           0.79       200
   macro avg       0.76      0.69      0.71       200
weighted avg       0.78      0.79      0.77       200



In [26]:
# Step 7: Simulate Real-Time Prediction
def predict_credit_risk(sample_input):
    # Make sure the input is a 2D array
    input_scaled = scaler.transform(np.array([sample_input]))

    # Predict
    prediction = model.predict(input_scaled)[0]

    # Decode label
    result = le_target.inverse_transform([prediction])[0]  # returns 'good' or 'bad'

    # No .upper() on integer!
    if result == 'good':
        return f"Prediction: GOOD – Loan Approved ✅"
    else:
        return f"Prediction: BAD – Loan Denied ❌"
sample_input = X.iloc[0].tolist()
print(predict_credit_risk(sample_input))


Prediction: BAD – Loan Denied ❌




In [28]:
# Show the actual values from one customer
sample_input = X.iloc[0]
print("📋 Sample customer data:\n")
print(sample_input)



📋 Sample customer data:

status                        0
duration                      6
credit_history                1
purpose                       3
amount                     1169
savings                       4
employment_duration           1
installment_rate              4
personal_status_sex           3
other_debtors                 2
present_residence             4
property                      2
age                          67
other_installment_plans       1
housing                       1
number_credits                2
job                           1
people_liable                 1
telephone                     1
foreign_worker                1
Name: 0, dtype: int64
