In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from google.colab import files


In [None]:
# Ask for the workbook through Colab's upload widget. The returned mapping is
# iterated by key, and the first key is used as the workbook's file name.
uploaded = files.upload()
filename = next(iter(uploaded.keys()))

# Loan-level records are read from the "Loan Data" sheet.
df = pd.read_excel(io=filename, sheet_name="Loan Data")

In [None]:
# Print a heading, then the frame's structural summary. `df.info()` writes its
# report straight to stdout, so it is not wrapped in print().
print("Dataset Info:")
df.info()

In [None]:
# Preview the top of the table. The heading is built from the same constant as
# the slice so the label cannot drift from the number of rows actually shown
# (it previously announced 5 rows while displaying 10).
N_PREVIEW_ROWS = 10
print(f"\nFirst {N_PREVIEW_ROWS} Rows:")
df.head(N_PREVIEW_ROWS)

In [None]:
# Per-column count of missing cells, taken before any imputation is applied.
print("\nMissing Values:")
df.isna().sum()

In [None]:
# Clean the Data (Scope 1)
# Split the columns by dtype so each group can get its own imputation rule.
# NOTE(review): every numeric column is selected, which presumably includes
# identifier/target columns such as default_flag -- confirm that imputing
# those is intended.
numeric_cols = df.select_dtypes(include="number").columns
categorical_cols = df.select_dtypes(include="object").columns

In [None]:
# Impute numeric columns with their median.
# The filled Series is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series (chained assignment), which pandas 2.x warns about and which leaves
# `df` unchanged once Copy-on-Write is active.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Impute categorical columns with their most frequent value.
# The result is assigned back rather than using `fillna(..., inplace=True)` on
# `df[col]`, which is chained assignment and does not reliably update `df`.
for col in categorical_cols:
    modes = df[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to impute with, and
        # indexing the empty result would raise.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Handle outliers (cap at 1.5*IQR)
# For each column: compute the IQR fences, plot in-range vs. outlying rows as
# they are BEFORE capping, then clip the column to the fences.
cols = ['loan_amount', 'annual_income', 'property_value']

# One subplot per column. squeeze=False keeps `axes` two-dimensional so the
# loop also works when `cols` holds a single column.
fig, axes = plt.subplots(1, len(cols), figsize=(5 * len(cols), 4),
                         sharey=False, squeeze=False)

for ax, col in zip(axes[0], cols):

    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    mask_out = (df[col] < lower_bound) | (df[col] > upper_bound)
    mask_in  = ~mask_out

    ax.scatter(df.index[mask_in],  df.loc[mask_in,  col], s=12, label='In‑range')
    ax.scatter(df.index[mask_out], df.loc[mask_out, col], s=12, color='red', label='Outlier')
    ax.set_title(col.replace('_', ' ').title())
    ax.set_xlabel('Row index')
    ax.set_ylabel(col)
    ax.legend()

    # Apply the cap announced in the heading; previously the fences were
    # computed and plotted but never applied to the data.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

plt.tight_layout()
plt.show()


In [None]:
# Re-count missing cells per column to check the result of the imputation.
print("\nAfter Cleaning - Missing Values:")
print(df.isna().sum())

In [None]:
# Descriptive statistics for the key numeric fields (headings use ANSI bold).
numeric_summary_cols = ['loan_amount', 'loan_term_months', 'schufa_score', 'annual_income']
print("\n\033[1mSummary Statistics (Numerical):\033[0m")
print(df[numeric_summary_cols].describe())

# Frequency table for each categorical field of interest.
print("\n\033[1mSummary Statistics (Categorical):\033[0m")
for cat_col in ('property_ownership', 'state'):
    print(df[cat_col].value_counts())

In [None]:
# Payment history summary
# The payment records are read from the "Payment History" sheet of the same
# uploaded workbook.
payment_data = pd.read_excel(filename, sheet_name="Payment History")
payment_cols = ['amount_due', 'amount_paid', 'days_late']

print("\n\033[1mPayment History Summary:\033[0m")
print(payment_data.loc[:, payment_cols].describe())

In [None]:
# Distribution of credit scores: histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(6, 3))
sns.histplot(df["schufa_score"], kde=True, ax=ax)
ax.set_title("SCHUFA Score Distribution")
ax.set_xlabel("Score")
fig.tight_layout()
plt.show()

In [None]:
# Define Risk Categories (Scope 2)
# Step 1: seed the category from Schufa score terciles. Labels are given in
# ascending score order, so the lowest-scoring third starts as 'High' risk.
df['risk_category'] = pd.qcut(df['schufa_score'], q=3, labels=['High', 'Medium', 'Low'])

dti = df['debt_to_income']
prior_defaults = df['previous_defaults']

# Step 2: override to 'High' for top-quartile debt-to-income or any prior default.
is_high = (dti > dti.quantile(0.75)) | (prior_defaults > 0)
df.loc[is_high, 'risk_category'] = 'High'

# Step 3: override to 'Low' for bottom-quartile debt-to-income with no prior default.
is_low = (dti < dti.quantile(0.25)) & (prior_defaults == 0)
df.loc[is_low, 'risk_category'] = 'Low'

print("\nRisk Category Distribution:")
print(df['risk_category'].value_counts())

In [None]:
# Distribution of applicants across risk buckets:
# one bar per bucket, bar height = number of rows in that bucket.
risk_order = ["Low", "Medium", "High"]
plt.figure(figsize=(6, 3))
sns.countplot(data=df, x="risk_category", order=risk_order, width=0.5)

In [None]:
# ── Histogram: Schufa Score Distribution
# Drawn into the left slot of a 1x2 grid.
plt.subplot(1, 2, 1)

# One overlaid histogram (with KDE) per risk bucket, default colours,
# slightly transparent so overlapping bars stay visible.
risk_levels = ('Low', 'Medium', 'High')
for level in risk_levels:
    in_level = df['risk_category'] == level
    sns.histplot(df.loc[in_level, 'schufa_score'], label=level, kde=True, alpha=0.7)

plt.title('Schufa Score Distribution – Risk Profile', fontsize=12, pad=12)
plt.xlabel('Schufa Score', labelpad=8)

# Anchor the legend just past the right edge of the axes (x > 1).
legend_options = dict(
    title='Risk Category',
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    frameon=True,
)
plt.legend(**legend_options)

# Re-fit the layout so the external legend is not clipped.
plt.tight_layout()


In [None]:

# Loan amount against debt-to-income: colour encodes the risk bucket,
# marker size encodes the default flag.
ax = sns.scatterplot(
    data=df,
    x='loan_amount',
    y='debt_to_income',
    hue='risk_category',
    size='default_flag',
    alpha=0.85,
)

# Title and axis labels
ax.set_title('Loan Amount vs. Debt‑to‑Income – Risk Profile', fontsize=12, pad=12)
ax.set_xlabel('Loan Amount', labelpad=8)
ax.set_ylabel('Debt‑to‑Income Ratio', labelpad=8)

# Rebuild the combined hue/size legend just outside the right edge of the axes.
legend_handles, legend_labels = ax.get_legend_handles_labels()
ax.legend(
    legend_handles,
    legend_labels,
    title='Risk Category',
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    frameon=True,
)

# Re-fit the layout so the external legend is not clipped.
plt.tight_layout()



In [None]:
# Pairwise correlation of the model features and the default flag.
# `features` is declared here because this cell runs before the modelling cell
# that declares the same list; without it a fresh kernel (Restart & Run All)
# fails with NameError.
features = ['schufa_score', 'loan_amount', 'income_to_loan_ratio', 'debt_to_income',
            'previous_defaults', 'late_payments_30', 'late_payments_90']

plt.figure(figsize=(8,6))
corr = df[features + ['default_flag']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title("Feature vs. Target Correlation (Raw Units)")
plt.show()



In [None]:
# Prepare Data for Modeling (Scope 3)
# Numeric predictors passed to the model as-is (before scaling).
features = [
    'schufa_score',
    'loan_amount',
    'income_to_loan_ratio',
    'debt_to_income',
    'previous_defaults',
    'late_payments_30',
    'late_payments_90',
]

# Categorical predictors to be one-hot encoded. This rebinds the name used
# earlier for the dtype-based column selection.
categorical_cols = [
    'property_ownership',
    'state',
]



In [None]:
# Encode categorical variables: keep only the model inputs and expand each
# categorical column into one indicator column per level.
model_input_cols = features + categorical_cols
df_encoded = pd.get_dummies(df[model_input_cols], columns=categorical_cols)

# Predictors and binary target (1 = default, 0 = no default)
X = df_encoded
y = df['default_flag']

In [None]:
# Split data: hold out 30% of the rows for evaluation; the fixed seed keeps
# the partition reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Scale features: the scaler's statistics are learned from the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Build and Train Logistic Regression Model (Scope 3)
# Fixed seed; iteration limit set to 1000.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Evaluate Model Performance (Scope 3)
# Hard class predictions feed accuracy; the second predict_proba column
# (index 1) feeds ROC-AUC.
y_pred = model.predict(X_test_scaled)
positive_proba = model.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, positive_proba)

In [None]:
# Report both hold-out metrics rounded to two decimals.
print("\nModel Performance:")
print("Accuracy: {:.2f}".format(accuracy))
print("ROC-AUC: {:.2f}".format(auc))

In [None]:
# ROC curve on the hold-out split, with the chance diagonal for reference.
proba = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, _ = roc_curve(y_test, proba)
auc = roc_auc_score(y_test, proba)

fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(fpr, tpr, linewidth=2, label=f"AUC = {auc:0.3f}")
ax.plot([0, 1], [0, 1], "k--", linewidth=1)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC – Logistic Model")
ax.legend()
fig.tight_layout()

# Save the figure to disk before showing it.
fig.savefig("roc_curve.png")
plt.show()

In [None]:
# Summarize Insights (Scope 4)
# Pair every encoded column with its fitted coefficient, then order by signed
# value from most positive to most negative.
coefficient_table = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0],
})
feature_importance = coefficient_table.sort_values(by='Coefficient', ascending=False)

In [None]:
# Final text summary.
# NOTE(review): the first bullet is a fixed sentence, not derived from the
# fitted coefficients -- confirm it still matches the table printed below.
risk_counts = df['risk_category'].value_counts().to_dict()

print("\nKey Insights:")
print("- Schufa score and debt-to-income ratio are strong predictors of default.")
print(f"- Risk Category Distribution: {risk_counts}")
print("\nFeature Importance:")
print(feature_importance)

This code loads a saved loan risk prediction model and its related files. It takes information about a loan applicant (like credit score, loan amount, and other financial details), prepares this data correctly, and then uses the model to estimate how likely it is that the applicant will default on the loan (fail to pay it back).

The code also creates a small web service (using Flask) that lets other programs send applicant data and get back a risk score in real-time. This way, the loan system can quickly decide if a person is a good or risky borrower.

In [None]:
import os
import pickle  # previously missing: every pickle call in this cell raised NameError

base_path = os.getcwd()  # current working directory

# Persist the fitted artefacts first. Nothing earlier in the notebook writes
# these files, so loading them on a fresh runtime failed with FileNotFoundError.
artefacts = {
    "model.pkl": model,
    "scaler.pkl": scaler,
    "columns.pkl": list(X.columns),  # training column order used to align new inputs
}
for artefact_name, artefact in artefacts.items():
    with open(os.path.join(base_path, artefact_name), "wb") as f:
        pickle.dump(artefact, f)

# Reload from disk so the objects used below are exactly what was saved.
# pickle.load can execute arbitrary code; this is acceptable here only because
# the files were written by this very cell.
with open(os.path.join(base_path, "model.pkl"), "rb") as f:
    model = pickle.load(f)
with open(os.path.join(base_path, "scaler.pkl"), "rb") as f:
    scaler = pickle.load(f)
with open(os.path.join(base_path, "columns.pkl"), "rb") as f:
    columns = pickle.load(f)




In [None]:
# Fixed status message naming the three artefact files; this line only prints
# and does not itself check that the files exist on disk.
print("Saved model.pkl, scaler.pkl, columns.pkl")

In [None]:
def predict_single(sample: dict) -> dict:
    """Return the default probability and predicted class for one applicant.

    Parameters
    ----------
    sample : dict
        One raw applicant record. It must contain the columns listed in
        ``categorical_cols``; any training column absent after encoding is
        filled with 0.

    Returns
    -------
    dict
        ``{"probability": <float rounded to 4 decimals>, "class": <0 or 1>}``
        where class is 1 when the probability is at least 0.5.
    """
    df_s = pd.DataFrame([sample])
    df_enc = pd.get_dummies(df_s, columns=categorical_cols)

    # Align to the training layout: indicator columns not seen in training are
    # dropped, training columns missing from this record are zero-filled.
    df_align = df_enc.reindex(columns=X.columns, fill_value=0).astype(float)

    # The scaler was fitted on the full encoded matrix (numeric and indicator
    # columns alike), so it has to be applied to every aligned column. Scaling
    # only a numeric subset raised a feature-count error and relied on a
    # `numeric_features` name that the notebook never defines.
    scaled = scaler.transform(df_align)

    proba = model.predict_proba(scaled)[0, 1]
    return {"probability": round(float(proba), 4), "class": int(proba >= 0.5)}




In [None]:
# Example applicant record: the seven numeric model inputs followed by the two
# categorical ones.
test_applicant = dict(
    schufa_score=700,
    loan_amount=15000,
    income_to_loan_ratio=2.3,
    debt_to_income=0.35,
    previous_defaults=0,
    late_payments_30=1,
    late_payments_90=0,
    property_ownership="mortgage",
    state="BY",
)


In [None]:
from google.colab import files

# Hand each pickled artefact to the browser for download, one file at a time.
artefact_files = ("model.pkl", "scaler.pkl", "columns.pkl")
for artefact in artefact_files:
    files.download(artefact)


In [None]:
%%writefile app.py
from flask import Flask, request, jsonify
import pickle, pandas as pd

# --- Load artefacts ---------------------------------------------------------
with open("model.pkl", "rb") as f:   model   = pickle.load(f)
with open("scaler.pkl", "rb") as f:  scaler  = pickle.load(f)
with open("columns.pkl", "rb") as f: columns = pickle.load(f)

numeric_features = [
    "schufa_score","loan_amount","income_to_loan_ratio",
    "debt_to_income","previous_defaults",
    "late_payments_30","late_payments_90"
]
categorical_cols = ["property_ownership","state"]

app = Flask(__name__)

@app.route("/")
def home():
    return "Default‑probability Logistic Regression API is running!"

@app.route("/predict", methods=["POST"])
def predict():
    data = request.get_json()
    df  = pd.DataFrame([data])
    df  = pd.get_dummies(df, columns=categorical_cols)
    df  = df.reindex(columns=columns, fill_value=0)
    df[numeric_features] = scaler.transform(df[numeric_features])
    proba = model.predict_proba(df)[0,1]
    return jsonify({"probability": round(float(proba),4),
                    "class": int(proba>=0.5)})

if __name__ == "__main__":
    app.run(debug=True)


In [None]:
# Colab-only helper import; download() hands app.py (the file produced by the
# %%writefile cell) to the browser.
from google.colab import files
files.download('app.py')
