**Project: Credit Risk Analysis**

The goal of this project is to build a machine learning model that predicts the credit risk of borrowers from their financial information. The project uses the Statlog (German Credit Data) dataset from the UCI Machine Learning Repository, which describes 1,000 loan applicants, each labeled as a good or bad credit risk.

# Import the data

In [1]:
import pandas as pd

# Load the dataset
# The file is space-separated and has no header row, so pandas labels the
# columns with integers until descriptive names are assigned.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
df = pd.read_csv(DATA_URL, sep=' ', header=None)

In [2]:
# Preview the first five rows to sanity-check how the file was parsed
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


# Clean and preprocess the data
This section removes rows with missing values, converts the categorical variables to numeric codes, and standardizes the features.

In [3]:
# information about codes: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Set the column names
# Twenty applicant attributes, in file order, followed by the target column.
columns = [
    'checking_account',
    'duration',
    'credit_history',
    'purpose',
    'credit_amount',
    'savings_account',
    'employment_duration',
    'installment_rate',
    'personal_status',
    'other_debtors',
    'residence_duration',
    'property',
    'age',
    'other_installment_plans',
    'housing',
    'existing_credits',
    'job',
    'dependents',
    'telephone',
    'foreign_worker',
    'credit_risk',  # target
]
df.columns = columns

In [4]:
# Remove missing values
# Any row containing at least one NaN is discarded; the result is a new frame
# bound to the same name.
df = df.dropna(axis='index', how='any')

In [5]:
# Convert categorical variables to numerical variables
# Each non-numeric column gets its own fitted encoder, stored in `encoders`
# under the column name. Refitting one shared encoder would discard every
# mapping except the last column's, making it impossible to decode the values
# or to encode new applicants consistently.
# Columns are selected by "not numeric" rather than `dtype == 'object'` so the
# string columns are still detected if pandas infers a dedicated string dtype.
# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# this is usually tolerable for tree models — confirm before reusing these
# features with linear or distance-based models.
categorical_columns = df.select_dtypes(exclude='number').columns
encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le

In [6]:
# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(['credit_risk'], axis=1))
y = df['credit_risk'].values

# Train-test split

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train random forest ML model

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest classifier
# The forest is randomized (bootstrap samples, feature sub-sampling), so the
# seed is fixed to make the fitted model and its scores repeatable on a
# Restart & Run All. 42 matches the seed used for the train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate performance


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the performance of the model on the test set
# Precision, recall and F1 are computed with scikit-learn's default positive
# class (pos_label=1).
# NOTE(review): the dataset description appears to code 1 = good and 2 = bad,
# in which case these scores describe the "good" class — confirm that is the
# class of interest.
y_pred = model.predict(X_test)
scorers = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-Score:', f1_score),
)
for label, scorer in scorers:
    print(label, scorer(y_test, y_pred))

Accuracy: 0.765
Precision: 0.7865853658536586
Recall: 0.9148936170212766
F1-Score: 0.8459016393442624


# Hyperparameter tuning

In [11]:
from sklearn.model_selection import GridSearchCV

# Fine-tune the model using hyperparameter tuning
# 3 x 3 grid, scored by F1 with 5-fold cross-validation on the training split.
params = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20]}
# Seed the estimator so every candidate is compared under the same randomness
# and the selected hyperparameters do not change from run to run.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)
print('Best hyperparameters:', grid_search.best_params_)

Best hyperparameters: {'max_depth': 10, 'n_estimators': 50}


In [28]:
# Define the tuned hyperparameters
# Read them by name: indexing list(best_params_.values()) by position depends
# on the dictionary's key order and silently assigns the wrong values if the
# parameter grid is ever extended or renamed.
best_params = grid_search.best_params_
max_depth = best_params['max_depth']
n_estimators = best_params['n_estimators']

# Create a RandomForestClassifier object with the tuned hyperparameters
# (seeded so the reported scores are reproducible)
model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

# Retrain the model using the train set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy of the retrained model (computed once, reused below)
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 use scikit-learn's default positive class (pos_label=1)
print('Accuracy:', accuracy)
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-Score:', f1_score(y_test, y_pred))

Accuracy: 0.785
Precision: 0.7951807228915663
Recall: 0.9361702127659575
F1-Score: 0.8599348534201954


# Save model

In [29]:
import pickle

# Serialize the trained classifier to disk in binary pickle format.
# Only `model` is written; the fitted scaler is not part of this file.
MODEL_PATH = 'credit_risk_model.pkl'
with open(MODEL_PATH, mode='wb') as model_file:
    pickle.dump(model, model_file)