In [1]:

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import joblib
from datetime import datetime

In [2]:
# Read the cleaned snapshot produced by the earlier cleaning stage
cleaned_csv_path = '../data/processed/cleaned_data_20240621_143909.csv'
df = pd.read_csv(cleaned_csv_path)

In [3]:

# Remove exact duplicate rows, then discard the leftover CSV index column
# ('Unnamed: 0') when it is present; errors='ignore' makes the drop a no-op otherwise.
df = (
    df
    .drop_duplicates()
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [4]:
# Columns routed through the numeric branch (median imputation + scaling)
numeric_features = [
    'loan_amount',
    'number_of_defaults',
    'outstanding_balance',
    'interest_rate',
    'age',
    'remaining_term',
    'salary',
]

# Columns routed through the categorical branch (mode imputation + one-hot)
# NOTE(review): 'loan_status' may duplicate the information in 'target' (possible
# label leakage) - confirm before training on it.
# NOTE(review): 'disbursement_date' is one-hot encoded as a raw value here, which
# presumably yields one column per distinct date - confirm this is intended.
categorical_features = [
    'gender',
    'disbursement_date',
    'currency',
    'country',
    'is_employed',
    'job',
    'location',
    'marital_status',
    'loan_status',
]


In [5]:
# Binning functions
def bin_age(age):
    """Bucket ages into labelled ranges and return the labels as strings.

    Parameters
    ----------
    age : array-like (e.g. a pandas Series)
        Ages to bucket.

    Returns
    -------
    Same container kind that ``pd.cut`` yields for the input, cast to ``str``.
    Values outside every bin (negative or missing) come out as the string 'nan'.
    """
    bins = [0, 18, 30, 40, 50, float('inf')]
    labels = ['0-18', '19-30', '31-40', '41-50', '51+']
    # Intervals are closed on their upper edge so that each label is literally
    # true: 18 -> '0-18', 30 -> '19-30', 50 -> '41-50'. With right=False those
    # boundary ages were pushed into the next bucket up, contradicting the label.
    # include_lowest=True keeps age 0 inside the first bucket.
    return pd.cut(
        age, bins=bins, labels=labels, right=True, include_lowest=True
    ).astype(str)

def bin_salary(salary):
    """Bucket salaries into four labelled bands and return the labels as strings.

    Parameters
    ----------
    salary : array-like (e.g. a pandas Series)
        Salaries to bucket.

    Returns
    -------
    Same container kind that ``pd.cut`` yields for the input, cast to ``str``.
    Values outside every bin (negative or missing) come out as the string 'nan'.
    """
    # Inner cut points are fixed constants - presumably quantiles of the source
    # data; TODO confirm their origin. The top band is open-ended (like bin_age)
    # so that salaries of 10000 and above are labelled 'High' instead of falling
    # outside every bin and turning into 'nan'.
    salary_bins = [0, 2273.93, 2665.44, 3146.58, float('inf')]
    salary_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
    # right=False: each band includes its lower edge and excludes its upper edge.
    return pd.cut(salary, bins=salary_bins, labels=salary_labels, right=False).astype(str)


In [6]:
# Column-wise adapter so bin_age can be used inside a FunctionTransformer
def age_bin_transformer(X):
    """Apply ``bin_age`` to every column of ``X`` and return the labels as an array.

    ``X`` is wrapped in a DataFrame first, so both DataFrames and 2-D arrays
    are accepted; the result is the ``.values`` array of the binned frame.
    """
    frame = pd.DataFrame(X)
    binned = frame.apply(bin_age)
    return binned.values

# Column-wise adapter so bin_salary can be used inside a FunctionTransformer
def salary_bin_transformer(X):
    """Apply ``bin_salary`` to every column of ``X`` and return the labels as an array.

    ``X`` is wrapped in a DataFrame first, so both DataFrames and 2-D arrays
    are accepted; the result is the ``.values`` array of the binned frame.
    """
    frame = pd.DataFrame(X)
    binned = frame.apply(bin_salary)
    return binned.values


In [7]:
# Numeric branch: fill missing values with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode into a dense array; levels unseen during fit are ignored
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [8]:
# Salary binning adapter.
# NOTE(review): this re-declares salary_bin_transformer with the same behaviour
# as the definition in the In [6] cell; the later definition is the one in effect.
def salary_bin_transformer(X):
    """Run ``bin_salary`` over each column of ``X``; return the labels as an array."""
    as_frame = pd.DataFrame(X)
    return as_frame.apply(bin_salary).values

In [9]:
# First version of the column-wise preprocessing: the binned age/salary labels
# are emitted as raw strings (no encoding step after the binning).
# NOTE(review): `preprocessor` is rebound in the In [13] cell, where the binned
# columns are additionally one-hot encoded.
draft_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('age_bin', FunctionTransformer(age_bin_transformer), ['age']),
    ('salary_bin', FunctionTransformer(salary_bin_transformer), ['salary']),
]
preprocessor = ColumnTransformer(
    transformers=draft_transformers,
    remainder='passthrough',
)

In [10]:
# Wrap the preprocessor in a single-step Pipeline so it can be fitted and
# persisted as one object
pipeline_steps = [('preprocessor', preprocessor)]
pipeline = Pipeline(steps=pipeline_steps)

In [11]:
# Features are every column except 'target'; the label is the 'target' column
X, y = df.drop(columns=['target']), df['target']

In [12]:
# Preview the first rows of the feature frame (the target column has been removed)
X.head()

Unnamed: 0,gender,disbursement_date,currency,country,is_employed,job,location,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,marital_status,loan_status
0,female,2022-10-29,USD,Zimbabwe,True,Teacher,Beitbridge,39000.0,0,48653.011473,0.22,37,47,3230.038869,married,Did not default
1,other,2020-06-06,USD,Zimbabwe,True,Teacher,Harare,27000.0,2,28752.062237,0.2,43,62,3194.139103,single,Did not default
2,other,2023-09-29,USD,Zimbabwe,True,Nurse,Gweru,35000.0,1,44797.554126,0.22,43,57,3330.826656,married,Did not default
3,female,2022-06-22,USD,Zimbabwe,True,Doctor,Rusape,24000.0,0,35681.496413,0.23,47,42,2246.79702,divorced,Did not default
4,male,2023-02-08,USD,Zimbabwe,True,Nurse,Chipinge,19000.0,0,34156.055882,0.2,42,45,2310.858441,married,Did not default


In [13]:
# Binned branches: turn the raw value into a range label, then one-hot encode it
age_bin_pipeline = Pipeline(steps=[
    ('binning', FunctionTransformer(age_bin_transformer)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
salary_bin_pipeline = Pipeline(steps=[
    ('binning', FunctionTransformer(salary_bin_transformer)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Assemble all branches. 'age' and 'salary' feed both the numeric branch and
# their binned branch; columns not named in any branch are passed through as-is.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('age_bin', age_bin_pipeline, ['age']),
        ('salary_bin', salary_bin_pipeline, ['salary']),
    ],
    remainder='passthrough',
)

In [14]:
# Single-step pipeline around the current preprocessor
pipeline_steps = [('preprocessor', preprocessor)]
pipeline = Pipeline(steps=pipeline_steps)

# Feature matrix = everything but 'target'; label vector = 'target'
X, y = df.drop(columns=['target']), df['target']

In [15]:
# Split first, so the held-out rows cannot influence anything the pipeline
# learns. random_state is fixed, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training rows only: imputation values, scaling statistics and the
# one-hot category sets are all learned here. Fitting on the full frame (as was
# done before) leaks test-set information into the fitted pipeline.
# Categories that only occur in X_test are handled by handle_unknown='ignore'.
pipeline.fit(X_train)

# Transformed view of the full frame, produced with the train-fitted pipeline.
# Note that X_train / X_test themselves stay untransformed.
X_transformed = pipeline.transform(X)


In [16]:
# Persist the train/test splits. These are the frames returned by
# train_test_split on X and y, i.e. they have not been passed through the pipeline.
split_outputs = [
    ('../data/train/X_train.csv', X_train),
    ('../data/test/X_test.csv', X_test),
    ('../data/train/y_train.csv', y_train),
    ('../data/test/y_test.csv', y_test),
]
for csv_path, split in split_outputs:
    # Wrapping in DataFrame lets the Series targets be written the same way
    pd.DataFrame(split).to_csv(csv_path, index=False)

In [17]:
# Stamp the artefact with the current local time so that runs do not overwrite each other
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
pipeline_path = f'../pipelines/data_processing_pipeline_{timestamp}.pkl'

# NOTE(review): the FunctionTransformer steps reference functions defined in this
# notebook. Pickle stores functions by name, so whoever loads this file presumably
# needs age_bin_transformer / salary_bin_transformer (and bin_age / bin_salary)
# available under the same module path - confirm in the consuming code.
joblib.dump(pipeline, pipeline_path)

['../pipelines/data_processing_pipeline_20240710_170133.pkl']

: 