# Pipeline Overview

1. **Data Ingestion**: Load the dataset.
2. **Data Preprocessing**: Split the data into training and testing sets.
3. **Weight of Evidence (WoE) and Information Value (IV) Calculation**.
4. **Model Development**: This includes WoE calculation, feature selection using IV, and modeling with Logistic Regression.
5. **Scorecard Management Report**: Generate reports for both the training and testing phases.

In [None]:
# Impute missing values: median for the interest rate, constant 0 for the
# employment length. Every other column is passed through unchanged.
imputed_columns = ['loan_int_rate', 'person_emp_length']
ct = ColumnTransformer(
    [
        ('loan_int_rate_imputer', SimpleImputer(strategy='median'), ['loan_int_rate']),
        ('person_emp_length_imputer', SimpleImputer(strategy='constant', fill_value=0), ['person_emp_length']),
    ],
    remainder="passthrough"
)

# ColumnTransformer emits the transformed columns first (in transformer order)
# followed by the passthrough columns, so the output has to be labelled in that
# order; labelling it with X_train.columns directly would attach the wrong
# names to the data. The frame is then reordered back to the original layout.
original_columns = list(X_train.columns)
output_columns = imputed_columns + [col for col in original_columns if col not in imputed_columns]
X_train = pd.DataFrame(
    ct.fit_transform(X_train),
    columns=output_columns,
    index=X_train.index,  # keep the row labels so X_train stays aligned with y_train
)[original_columns]

In [1]:
# src/config/config.py
# This script is used to manage configuration for the pipeline such as filepaths, parameters, and schemas

class ConfigurationManager:
    """Manage pipeline configuration such as file paths, parameters, and schemas."""

    def __init__(self):
        # Placeholder: no configuration is loaded yet.
        pass


In [2]:
# src/data/data_ingestion.py
import pandas as pd
import polars as pl
from abc import ABC
from abc import abstractmethod
from pathlib import Path

class DataIngestionStrategy(ABC):
    """Abstract base class for data-ingestion strategies."""

    @abstractmethod
    def ingest_data(self, paths: list):
        """Load the files listed in ``paths``; subclasses choose the return type."""
        ...

class PandasDataIngestionStrategy(DataIngestionStrategy):
    """Load CSV/Parquet files with pandas and stack them row-wise."""

    def ingest_data(self, paths: list) -> pd.DataFrame:
        """
        Read every file in ``paths`` and concatenate the results along the rows.

        Args:
            paths (list): File paths; each must end in ``.csv`` or ``.parquet``.
        Returns:
            pd.DataFrame: All files stacked vertically, or an empty DataFrame
                when ``paths`` is empty.
        Raises:
            ValueError: If a path has a suffix other than ``.csv`` or ``.parquet``.
        """
        frames = []
        for path in paths:
            path = Path(path)
            if path.suffix == ".csv":
                frames.append(pd.read_csv(path))
            elif path.suffix == ".parquet":
                frames.append(pd.read_parquet(path))
            else:
                # Fail loudly: otherwise the previous file's frame would be
                # appended a second time (or an unbound variable would be hit).
                raise ValueError(f"Unsupported file type '{path.suffix}' for path: {path}")
        if not frames:
            return pd.DataFrame({})
        # Concatenate once instead of re-copying the accumulated frame per file.
        return pd.concat(frames, axis=0)

class PolarsDataIngestionStrategy(DataIngestionStrategy):
    """Scan CSV/Parquet files with polars and stack them vertically."""

    def ingest_data(self, paths: list) -> pl.LazyFrame:
        """
        Scan every file in ``paths`` and concatenate the scans vertically.

        Args:
            paths (list): File paths; each must end in ``.csv`` or ``.parquet``.
        Returns:
            pl.LazyFrame: The vertically concatenated scans. ``None`` is
                returned when ``paths`` is empty.
        Raises:
            ValueError: If a path has a suffix other than ``.csv`` or ``.parquet``.
        """
        frames = []
        for path in paths:
            path = Path(path)
            if path.suffix == ".csv":
                frames.append(pl.scan_csv(path))
            elif path.suffix == ".parquet":
                frames.append(pl.scan_parquet(path))
            else:
                # Fail loudly: otherwise the previous file's frame would be
                # appended a second time (or an unbound variable would be hit).
                raise ValueError(f"Unsupported file type '{path.suffix}' for path: {path}")
        if not frames:
            return None
        if len(frames) == 1:
            # A single input needs no concatenation.
            return frames[0]
        return pl.concat(frames, how="vertical")

class DataIngestion:
    """Entry point that delegates file loading to an ingestion strategy."""

    @staticmethod
    def ingest_data(paths: list, strategy: "DataIngestionStrategy"):
        """
        Load the files at ``paths`` using ``strategy``.

        Args:
            paths (list): File paths handed to the strategy unchanged.
            strategy (DataIngestionStrategy): Object whose ``ingest_data(paths)`` does the loading.
        Returns:
            Whatever ``strategy.ingest_data(paths)`` returns. If the strategy
            raises, the error is logged with its traceback and ``None`` is
            returned, so callers should check for ``None`` before using the result.
        """
        # Imported here so the error path cannot fail on a missing module-level import.
        import logging

        try:
            return strategy.ingest_data(paths)
        except Exception:
            # Best-effort contract: report the failure (with traceback) and return None.
            logging.exception("Data ingestion failed for paths: %s", paths)
            return None

In [3]:
# Load the raw credit-risk dataset with the pandas strategy
df = DataIngestion.ingest_data(
    paths=[
        "../data/raw/credit_risk_dataset.csv"
    ],
    strategy=PandasDataIngestionStrategy()
)
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info()
display(df.describe())

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


None

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [4]:
# src/data/data_preprocessing.py
import pandas as pd
from typing import Tuple
from typing import Union
from sklearn.model_selection import train_test_split

class DataPreprocessing:
    """Preprocessing helpers for the pipeline."""

    @staticmethod
    def split_data(df: pd.DataFrame, target_variable: str, test_size: float, random_state: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Split data into train and test sets, stratified on the target variable.

        Args:
            df (pd.DataFrame): Pandas DataFrame containing the features and the target.
            target_variable (str): Name of the target column.
            test_size (float): Proportion of the dataset to use for testing.
            random_state (int): Seed for the shuffled split.
        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
                ``(X_train, X_test, y_train, y_test)`` in that order.
        """
        X, y = df.drop(columns=[target_variable]), df[target_variable]
        # Use the caller-supplied seed rather than a hard-coded one.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=test_size, shuffle=True, random_state=random_state
        )
        return (X_train, X_test, y_train, y_test)

In [6]:
# Reload the raw dataset with the pandas strategy
raw_paths = ["../data/raw/credit_risk_dataset.csv"]
df = DataIngestion.ingest_data(paths=raw_paths, strategy=PandasDataIngestionStrategy())

# Split settings
target_variable, test_size, random_state = 'loan_status', 0.2, 42

# Split the data between train and test
split_kwargs = dict(
    df=df,
    target_variable=target_variable,
    test_size=test_size,
    random_state=random_state,
)
X_train, X_test, y_train, y_test = DataPreprocessing.split_data(**split_kwargs)

# Report the shape of each partition
print(f'Train features: {X_train.shape}, Train Target: {y_train.shape}')
print(f'Test features: {X_test.shape}, Test Target: {y_test.shape}')

Train features: (26064, 11), Train Target: (26064,)
Test features: (6517, 11), Test Target: (6517,)


In [None]:
# src/models/train.py