## Credit Risk Probability Model for Alternative Data

### An End-to-End Implementation for Building, Deploying, and Automating a Credit Risk Model

In [None]:
# Import necessary libraries
import sys
import os


# Make the project's `src` directory importable from this notebook.
# All paths are derived from the current working directory, which is expected
# to be the notebooks folder sitting one level below the project root.
notebook_dir = os.getcwd()  # Current working directory (notebooks folder)
project_dir = os.path.abspath(os.path.join(notebook_dir, '..'))  # Project root, one level up
src_dir = os.path.join(project_dir, 'src')  # Folder holding the project modules
# Guard the append: re-running this cell would otherwise add a duplicate
# `sys.path` entry each time.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import pandas as pd
from data_loader import DataLoader
from data_eda import Dataprocessor
from data_processing import build_full_pipeline
from proxy_target_engineer import ProxyTargetEngineer

## Data Loading

In [None]:
# Load the raw dataset through the project's DataLoader helper.
# The path is relative, so it resolves against the notebook's working directory.
file_path = "../data/raw/data.csv"
data = DataLoader()
# NOTE(review): load_data() presumably returns a pandas DataFrame — confirm in data_loader.py.
df = data.load_data(file_path)
# Trailing expression: the notebook renders the result of head() as the cell output.
df.head()

## Data Overview


In [None]:
# Wrap the loaded data in the project's EDA helper; the same `data_eda` object
# is reused by the EDA cells that follow.
data_eda = Dataprocessor(df)
# Print whatever overview_data() returns.
# NOTE(review): presumably a summary of the dataset — confirm in data_eda.py.
data_overview = data_eda.overview_data()
print(data_overview)

## Distribution of Numerical Features


In [None]:
# Visualize the distribution of the numerical features via the EDA helper.
# NOTE(review): if plot_numerical_distributions() only draws figures and returns
# None, the print below just emits "None" — confirm its return value.
data_dist_num = data_eda.plot_numerical_distributions()
print(data_dist_num)

## Distribution of Categorical Features

In [None]:
# Visualize the distribution of the categorical features via the EDA helper.
# NOTE(review): if plot_categorical_distributions() only draws figures and
# returns None, the print below just emits "None" — confirm its return value.
data_dist_cat = data_eda.plot_categorical_distributions()
print(data_dist_cat)

## Correlation Analysis

In [None]:
# Visualize the correlation analysis as a heatmap via the EDA helper.
# NOTE(review): whether correlation_heatmap() returns the correlation values or
# None is not visible here; the print only helps in the former case — confirm.
data_corr = data_eda.correlation_heatmap()
print(data_corr)

## Missing Values

In [None]:
# Print the missing-value summary produced by the EDA helper.
# NOTE(review): the summary's exact layout (counts, percentages, ...) is defined
# in data_eda.py and is not visible here.
data_missing = data_eda.missing_value_summary()
print(data_missing)

## Outlier Detection using Boxplots

In [None]:
# Visualize outlier values via the EDA helper.
# NOTE(review): if plot_outliers() only draws figures and returns None, the
# print below just emits "None" — confirm its return value.
data_outliers = data_eda.plot_outliers()
print(data_outliers)

## Proxy Target Engineering

In [None]:
# Column names the proxy-target engineer is configured with.
proxy_columns = {
    "customer_id_col": "CustomerId",
    "timestamp_col": "TransactionStartTime",
    "amount_col": "Amount",
}
proxy = ProxyTargetEngineer(**proxy_columns)

# Two steps: derive the labelled table from the transactions, then merge it
# back onto the main frame. The merged frame is where the `is_high_risk`
# target is read from afterwards.
# NOTE(review): `rfm_labeled` is presumably a per-customer RFM table — confirm
# in proxy_target_engineer.py.
rfm_labeled = proxy.engineer_target(df)
df_labeled = proxy.merge_with_main(df, rfm_labeled)

## Define target and raw features

In [None]:
# --- Define target and raw features ---
# Target: the `is_high_risk` column of the labelled frame.
y = df_labeled['is_high_risk']
# Raw features: a copy of the whole labelled frame.
# NOTE(review): this copy still contains `is_high_risk`; confirm the pipeline
# does not pass the target through as a feature.
X_raw = df_labeled.copy()

## Build pipeline to transform data

In [None]:
# Feature groups and column names handed to the project's pipeline builder.
pipeline_config = {
    "numeric_features": ["transaction_count", "total_amount", "avg_amount", "std_amount"],
    "categorical_features": ["month", "hour"],
    "timestamp_col": "TransactionStartTime",
    "amount_col": "Amount",
    "customer_id_col": "CustomerId",
}
pipeline = build_full_pipeline(**pipeline_config)

# Fit the pipeline on the raw frame and transform it in a single call.
X_processed = pipeline.fit_transform(X_raw)

In [None]:
# Print the transformed feature output for a quick visual check.
print(X_processed)

## Save Data

In [None]:
# --- Save processed data ---
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists before writing (it may be absent, e.g. on a fresh checkout).
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

# The target column goes to data_targeted.csv and the transformed features to
# data_labeled.csv. Both are written without an index, so the two files line up
# by row position only.
# NOTE(review): `y` comes from df_labeled while `X_processed` comes out of the
# pipeline — confirm both have the same row count and order.
y.to_csv("../data/processed/data_targeted.csv", index=False)
pd.DataFrame(X_processed).to_csv("../data/processed/data_labeled.csv", index=False)