In [44]:

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import joblib
from datetime import datetime

In [45]:
# Read the cleaned dataset from the processed-data folder
CLEANED_DATA_PATH = '../data/processed/cleaned_data_20240621_143909.csv'
df = pd.read_csv(CLEANED_DATA_PATH)

In [46]:

# Remove the leftover 'Unnamed: 0' column first, then drop duplicate rows.
# That column is presumably a saved row index (unique per row -- TODO confirm);
# while it is present no two rows compare equal, so de-duplicating before
# removing it would be a no-op. errors='ignore' keeps the drop harmless when
# the column does not exist.
df = (
    df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .drop_duplicates()
)

In [47]:
# Column groups: numeric columns and categorical columns
numeric_features = [
    'loan_amount',
    'number_of_defaults',
    'outstanding_balance',
    'interest_rate',
    'age',
    'remaining_term',
    'salary',
]
categorical_features = [
    'gender',
    'disbursement_date',
    'currency',
    'country',
    'is_employed',
    'job',
    'location',
    'marital_status',
    'loan_status',
]


In [48]:
# Binning functions
def bin_age(age):
    """Bucket ages into labelled ranges and return the labels as strings.

    Buckets are closed on the right so every label matches its boundaries:
    [0, 18] -> '0-18', (18, 30] -> '19-30', (30, 40] -> '31-40',
    (40, 50] -> '41-50', above 50 -> '51+'.

    Parameters
    ----------
    age : array-like of numbers (for example a pandas Series)
        Ages to bucket.

    Returns
    -------
    The result of ``pd.cut`` cast with ``astype(str)``. Values that fall
    outside every bucket (negative or missing) carry whatever string
    ``astype(str)`` produces for a missing category.
    """
    bins = [0, 18, 30, 40, 50, float('inf')]
    labels = ['0-18', '19-30', '31-40', '41-50', '51+']
    # right=True puts 18 in '0-18', 30 in '19-30', 50 in '41-50', as the labels
    # promise; include_lowest=True keeps age 0 inside the first bucket.
    return pd.cut(
        age, bins=bins, labels=labels, right=True, include_lowest=True
    ).astype(str)

def bin_salary(salary):
    """Bucket salaries into four labelled bands and return the labels as strings.

    Bands are closed on the left: [0, 2273.93) -> 'Low',
    [2273.93, 2665.44) -> 'Medium-Low', [2665.44, 3146.58) -> 'Medium-High',
    3146.58 and above -> 'High'.

    Parameters
    ----------
    salary : array-like of numbers (for example a pandas Series)
        Salaries to bucket.

    Returns
    -------
    The result of ``pd.cut`` cast with ``astype(str)``. Values that fall
    outside every band (negative or missing) carry whatever string
    ``astype(str)`` produces for a missing category.
    """
    # The top edge is open-ended so salaries of 10000 or more still land in
    # 'High' instead of falling outside every band.
    salary_bins = [0, 2273.93, 2665.44, 3146.58, float('inf')]
    salary_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
    return pd.cut(
        salary, bins=salary_bins, labels=salary_labels, right=False
    ).astype(str)


In [49]:
# Column-wise age binning in the shape FunctionTransformer expects
def age_bin_transformer(X):
    """Apply ``bin_age`` to each column of ``X`` and return the result as an array."""
    frame = pd.DataFrame(X)
    binned = frame.apply(bin_age)
    return binned.values

# Column-wise salary binning in the shape FunctionTransformer expects
def salary_bin_transformer(X):
    """Apply ``bin_salary`` to each column of ``X`` and return the result as an array."""
    frame = pd.DataFrame(X)
    binned = frame.apply(bin_salary)
    return binned.values


In [50]:
# Numeric columns: fill missing values with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen at fit time are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [51]:
# Column-wise salary binning in the shape FunctionTransformer expects
# NOTE(review): this repeats the salary_bin_transformer definition from an
# earlier cell; this later definition is the one that stays bound.
def salary_bin_transformer(X):
    """Apply ``bin_salary`` to each column of ``X`` and return the result as an array."""
    salary_frame = pd.DataFrame(X)
    return salary_frame.apply(bin_salary).values

In [52]:
# Numeric columns: median imputation followed by standard scaling
# NOTE(review): this rebinds numeric_transformer with the same steps an
# earlier cell already defined.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ],
)

In [53]:
# Categorical columns: most-frequent imputation followed by one-hot encoding
# NOTE(review): this rebinds categorical_transformer with the same steps an
# earlier cell already defined.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ],
)

In [54]:
# Assemble the per-column transformers into one ColumnTransformer
# NOTE(review): the two binning branches emit string labels that are not
# encoded here; a later cell rebinds `preprocessor` with one-hot encoding on
# those branches.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('age_bin', FunctionTransformer(age_bin_transformer), ['age']),
    ('salary_bin', FunctionTransformer(salary_bin_transformer), ['salary']),
])

In [55]:
# Categorical columns: most-frequent imputation followed by one-hot encoding
# NOTE(review): categorical_transformer is rebound here again with the same
# steps as in the earlier cells.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [56]:
# Same transformer set as before, now passing unlisted columns through as-is
# NOTE(review): the binning branches still emit un-encoded string labels; a
# later cell rebinds `preprocessor` with one-hot encoding on those branches.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('age_bin', FunctionTransformer(age_bin_transformer), ['age']),
        ('salary_bin', FunctionTransformer(salary_bin_transformer), ['salary']),
    ],
    remainder='passthrough',
)

In [57]:
# Wrap the preprocessor in a single-step pipeline
pipeline = Pipeline([('preprocessor', preprocessor)])

In [58]:
# The label is the 'target' column; the features are every other column
y = df['target']
X = df.drop(columns=['target'])

In [59]:
# Display the feature column names left after removing 'target'
X.columns

Index(['gender', 'disbursement_date', 'currency', 'country', 'is_employed',
       'job', 'location', 'loan_amount', 'number_of_defaults',
       'outstanding_balance', 'interest_rate', 'age', 'remaining_term',
       'salary', 'marital_status', 'loan_status'],
      dtype='object')

In [60]:
# Final preprocessor: the numeric and categorical branches plus one-hot encoded
# age and salary buckets; columns not listed are passed through unchanged
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        (
            'age_bin',
            Pipeline(steps=[
                ('binning', FunctionTransformer(age_bin_transformer)),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
            ]),
            ['age'],
        ),
        (
            'salary_bin',
            Pipeline(steps=[
                ('binning', FunctionTransformer(salary_bin_transformer)),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
            ]),
            ['salary'],
        ),
    ],
    remainder='passthrough',
)

In [61]:
# Wrap the preprocessor in a single-step pipeline
pipeline = Pipeline([('preprocessor', preprocessor)])

# The label is the 'target' column; the features are every other column
y = df['target']
X = df.drop(columns=['target'])

In [62]:
# Split the raw rows first, so that everything the pipeline learns (imputation
# values, scaling statistics, one-hot categories) comes from the training rows
# only. Fitting on the full dataset before splitting would leak test-set
# information into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then apply the fitted pipeline to both splits
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)


In [63]:
# Save the preprocessed train/test splits as CSV files
def _as_frame(features):
    """Wrap a feature matrix in a DataFrame, densifying SciPy sparse input.

    A ColumnTransformer containing one-hot encoders may return a SciPy sparse
    matrix. Passing that straight to ``pd.DataFrame`` does not build a numeric
    table, so sparse input is converted with ``toarray()`` first.
    """
    if hasattr(features, 'toarray'):
        features = features.toarray()
    return pd.DataFrame(features)


_as_frame(X_train).to_csv('../data/train/X_train.csv', index=False)
_as_frame(X_test).to_csv('../data/test/X_test.csv', index=False)
pd.DataFrame(y_train).to_csv('../data/train/y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('../data/test/y_test.csv', index=False)

In [64]:
# Persist the pipeline under a timestamped file name
# NOTE(review): the pipeline wraps functions defined in this notebook
# (age_bin_transformer, salary_bin_transformer). Pickle presumably stores them
# by name, so whoever loads this file needs the same functions importable --
# confirm with the consumers of the .pkl.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
pipeline_path = f'../pipelines/data_processing_pipeline_{timestamp}.pkl'
joblib.dump(pipeline, pipeline_path)

['../pipelines/data_processing_pipeline_20240704_190620.pkl']