In [6]:
pip install pandas numpy scikit-learn matplotlib seaborn streamlit joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# First, try to import streamlit
try:
    import streamlit as st
except ImportError:
    # If not found, install it
    import sys
    !{sys.executable} -m pip install streamlit
    import streamlit as st

# Now you can use streamlit
st.write("Hello Streamlit in Jupyter!")



In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
import joblib
import streamlit as st

# Load the dataset
df = pd.read_csv('german_credit_data.csv', index_col=0)

# Display basic info
print(df.info())
print(df.head())
print(df.describe())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
dtypes: int64(4), object(5)
memory usage: 78.1+ KB
None
   Age     Sex  Job Housing Saving accounts Checking account  Credit amount  \
0   67    male    2     own             NaN           little           1169   
1   22  female    2     own          little         moderate           5951   
2   49    male    1     own          little              NaN           2096   
3   45    male    2   

In [9]:
# Create a target variable (we'll assume 'Credit amount' > median is higher risk)
# In a real scenario, we would have an actual target variable
median_credit = df['Credit amount'].median()
df['Risk'] = np.where(df['Credit amount'] > median_credit, 1, 0)  # 1=Higher risk, 0=Lower risk

# Drop the original 'Credit amount' as we've used it to create our target
df.drop('Credit amount', axis=1, inplace=True)

# Handle missing values
df['Saving accounts'].fillna('none', inplace=True)
df['Checking account'].fillna('none', inplace=True)

# Feature engineering
df['Age_Group'] = pd.cut(df['Age'], bins=[18, 25, 35, 45, 60, 100], 
                        labels=['18-25', '26-35', '36-45', '46-60', '60+'])
df['Duration_Group'] = pd.cut(df['Duration'], bins=[0, 12, 24, 36, 100], 
                            labels=['0-1y', '1-2y', '2-3y', '3y+'])

# Separate features and target
X = df.drop('Risk', axis=1)
y = df['Risk']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define preprocessing for numeric and categorical features
numeric_features = ['Age', 'Duration']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Sex', 'Job', 'Housing', 'Saving accounts', 
                       'Checking account', 'Purpose', 'Age_Group', 'Duration_Group']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Visualizations
def plot_distributions(df):
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    
    sns.countplot(x='Risk', data=df, ax=axes[0, 0])
    axes[0, 0].set_title('Target Distribution')
    
    sns.histplot(df['Age'], bins=20, kde=True, ax=axes[0, 1])
    axes[0, 1].set_title('Age Distribution')
    
    sns.countplot(x='Purpose', data=df, ax=axes[1, 0])
    axes[1, 0].tick_params(axis='x', rotation=45)
    axes[1, 0].set_title('Loan Purpose Distribution')
    
    sns.boxplot(x='Risk', y='Duration', data=df, ax=axes[1, 1])
    axes[1, 1].set_title('Loan Duration by Risk')
    
    plt.tight_layout()
    return fig

plot_distributions(df)
plt.show()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Saving accounts'].fillna('none', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Checking account'].fillna('none', inplace=True)
  plt.show()


In [10]:
# Define models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

# Hyperparameter grids
param_grids = {
    'Random Forest': {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5]
    },
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l2']
    }
}

# Train and evaluate models
best_models = {}
results = []

for model_name in models:
    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', models[model_name])
    ])
    
    # Grid search
    grid_search = GridSearchCV(pipeline, param_grids[model_name], 
                              cv=5, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Store best model
    best_models[model_name] = grid_search.best_estimator_
    
    # Evaluate
    y_pred = best_models[model_name].predict(X_test)
    y_proba = best_models[model_name].predict_proba(X_test)[:, 1]
    
    # Store results
    results.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba),
        'Best Params': grid_search.best_params_
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)

# Save best model
joblib.dump(best_models['Random Forest'], 'credit_risk_model.pkl')

                 Model  Accuracy  Precision  Recall        F1  ROC AUC  \
0        Random Forest     0.775   0.772277    0.78  0.776119  0.84105   
1  Logistic Regression     0.755   0.747573    0.77  0.758621  0.83805   

                                         Best Params  
0  {'classifier__max_depth': 10, 'classifier__min...  
1  {'classifier__C': 0.1, 'classifier__penalty': ...  


['credit_risk_model.pkl']

In [11]:
# Feature importance for Random Forest
best_model = best_models['Random Forest']

# Get feature names after one-hot encoding
preprocessor = best_model.named_steps['preprocessor']
feature_names = numeric_features.copy()

# Add one-hot encoded feature names
ohe_features = best_model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
feature_names.extend(ohe_features)

# Get feature importances
if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
    importances = best_model.named_steps['classifier'].feature_importances_
    feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_importance = feature_importance.sort_values('Importance', ascending=False).head(20)
    
    # Plot feature importance
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Top 20 Important Features')
    plt.tight_layout()
    plt.show()

# Confusion matrix
def plot_confusion_matrix(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Low Risk', 'High Risk'], 
                yticklabels=['Low Risk', 'High Risk'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    return plt

plot_confusion_matrix(y_test, best_model.predict(X_test))
plt.show()

# ROC Curve
def plot_roc_curve(y_true, y_proba):
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    roc_auc = roc_auc_score(y_true, y_proba)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, 
             label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    return plt

plot_roc_curve(y_test, best_model.predict_proba(X_test)[:, 1])
plt.show()

  plt.show()
  plt.show()
  plt.show()
