# RBI Task Refactored

In [None]:
!pip install catboost shap python-pptx

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from catboost import CatBoostClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from pptx import Presentation
from pptx.util import Inches


In [None]:
class DataPreprocessor:
    """Clean raw customer data and label-encode its categorical columns."""

    # Columns that arrive as formatted text and are parsed into numbers.
    _NUMERIC_TEXT_COLUMNS = (
        'CURRENT_BALANCE_EUR', 'CUST_INCOME', 'CURRENT_ADDRESS_DATE',
        'CURRENT_JOB_DATE', 'CURRENT_WITH_BANK_DATE',
    )

    def __init__(self):
        # Fitted LabelEncoder per categorical column, populated by ``encode``.
        self.label_encoders = {}

    @staticmethod
    def clean_value(x):
        """Normalise a formatted numeric string so it can be parsed.

        Every character except digits and commas is removed and the comma is
        turned into a decimal point. A leading minus sign is preserved so
        that negative amounts keep their sign.

        Args:
            x: Raw cell value. Non-string values are returned unchanged.

        Returns:
            The cleaned string (possibly empty), or ``x`` itself when it is
            not a string.
        """
        if not isinstance(x, str):
            return x
        text = x.strip()
        digits = re.sub(r"[^0-9,]+", "", text).replace(',', '.')
        # Only a *leading* minus is treated as a sign; dashes inside the value
        # (e.g. date separators) are still discarded with the other symbols.
        if digits and text.startswith(('-', '\u2212')):
            return '-' + digits
        return digits

    def preprocess(self, data):
        """Build the target column, drop helper columns and parse numbers.

        The frame is modified in place and also returned.

        Args:
            data: DataFrame holding the raw records. ``Mortgage_YN`` is
                optional, so unlabeled data can be processed as well.

        Returns:
            The same DataFrame object after cleaning.
        """
        if 'Mortgage_YN' in data.columns:
            data['target'] = data['Mortgage_YN'].map({"Y": 1, "N": 0})
            data.drop(columns=['Mortgage_YN'], inplace=True)
        if 'Cocunut' in data.columns:
            data.drop(columns=['Cocunut'], inplace=True)

        if 'AGE_AT_ORIGINATION' in data.columns:
            # Rows with target == 1 take their age from AGE_AT_ORIGINATION.
            # Without a target column there is nothing to select on, so the
            # overwrite is skipped instead of failing with a KeyError.
            if 'target' in data.columns:
                has_mortgage = data['target'] == 1
                data.loc[has_mortgage, 'AGE'] = data.loc[has_mortgage, 'AGE_AT_ORIGINATION']
            data.drop(columns=['AGE_AT_ORIGINATION'], inplace=True)

        for col in self._NUMERIC_TEXT_COLUMNS:
            if col in data.columns:
                cleaned = data[col].apply(self.clean_value)
                # Anything that still is not a number becomes NaN.
                data[col] = pd.to_numeric(cleaned, errors='coerce')
        return data

    def encode(self, train, test):
        """Label-encode every non-numeric column of ``train`` and ``test``.

        Each encoder is fitted on the string values of both frames together,
        so every category present in either frame receives a code. The fitted
        encoders are kept in ``self.label_encoders``. Both frames are modified
        in place.

        Args:
            train: Training features.
            test: Test features; must contain every non-numeric column of
                ``train``.

        Returns:
            Tuple ``(train, test)`` with the categorical columns replaced by
            integer codes.
        """
        cat_cols = train.select_dtypes(exclude=np.number).columns
        for col in cat_cols:
            train_values = train[col].astype(str)
            test_values = test[col].astype(str)
            le = LabelEncoder()
            le.fit(list(train_values) + list(test_values))
            train[col] = le.transform(train_values)
            test[col] = le.transform(test_values)
            self.label_encoders[col] = le
        return train, test


In [None]:
# Read the retail data set and clean a copy of it, leaving `raw` untouched.
preprocessor = DataPreprocessor()
raw = pd.read_csv("Retail data.csv", delimiter=';')
data = preprocessor.preprocess(raw.copy())

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Separate the label from the features for both partitions.
y_train, y_test = train_df['target'], test_df['target']
X_train = train_df.drop(columns='target')
X_test = test_df.drop(columns='target')

# Replace the categorical feature columns with integer codes.
X_train, X_test = preprocessor.encode(X_train, X_test)


In [None]:
class MortgageModel:
    """Tune, train and evaluate a CatBoost classifier.

    The data is read from the module-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; they must exist before ``tune_hyperparameters``,
    ``fit`` or ``evaluate`` is called.
    """

    # Search-space values that are cast to int before being handed to CatBoost.
    _INT_PARAMS = frozenset({'iterations', 'depth', 'l2_leaf_reg', 'border_count'})

    def __init__(self):
        # Fitted CatBoostClassifier, set by ``fit``.
        self.model = None
        # Best parameter dict returned by hyperopt, set by ``tune_hyperparameters``.
        self.best_params = None

    @classmethod
    def _cast_params(cls, params):
        """Return a copy of ``params`` with the integer-valued entries cast to int."""
        return {name: int(value) if name in cls._INT_PARAMS else value
                for name, value in params.items()}

    @classmethod
    def _build_classifier(cls, params):
        """Create an unfitted Logloss CatBoostClassifier from a parameter dict."""
        return CatBoostClassifier(**cls._cast_params(params),
                                  loss_function='Logloss', verbose=False)

    def _objective(self, params):
        """Hyperopt objective: one minus the mean 5-fold cross-validation score.

        Args:
            params: Parameter sample drawn from the search space.

        Returns:
            Dict with the ``loss`` to minimise and the hyperopt ``status``.
        """
        clf = self._build_classifier(params)
        # No ``scoring`` is given, so the estimator's default scorer is used.
        score = cross_val_score(clf, X_train, y_train, cv=5).mean()
        return {'loss': 1 - score, 'status': STATUS_OK}

    def tune_hyperparameters(self):
        """Search the CatBoost parameter space with TPE for 50 evaluations.

        Returns:
            The best parameter dict found, also stored in ``self.best_params``.
        """
        space = {
            'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
            'iterations': hp.quniform('iterations', 50, 300, 1),
            'depth': hp.quniform('depth', 2, 8, 1),
            'l2_leaf_reg': hp.quniform('l2_leaf_reg', 1, 10, 1),
            'border_count': hp.quniform('border_count', 5, 255, 5),
        }
        trials = Trials()
        self.best_params = fmin(fn=self._objective, space=space, algo=tpe.suggest,
                                max_evals=50, trials=trials)
        return self.best_params

    def fit(self):
        """Train the final model with the tuned parameters and persist it.

        The fitted model is written to ``catboost_model.pkl``.

        Raises:
            RuntimeError: If no tuned parameters are available yet.
        """
        if self.best_params is None:
            raise RuntimeError(
                "No tuned parameters available; call tune_hyperparameters() "
                "or set best_params before fit()."
            )
        self.model = self._build_classifier(self.best_params)
        self.model.fit(X_train, y_train)
        joblib.dump(self.model, 'catboost_model.pkl')

    def evaluate(self):
        """Report test-set metrics and plot the confusion matrix.

        Prints the ROC AUC and the classification report, then shows the
        confusion matrix as a heat map.

        Returns:
            Second column of ``predict_proba`` for the rows of ``X_test``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.model is None:
            raise RuntimeError("Model is not fitted; call fit() before evaluate().")
        y_pred = self.model.predict(X_test)
        y_proba = self.model.predict_proba(X_test)[:, 1]
        print("ROC AUC:", roc_auc_score(y_test, y_proba))
        print(classification_report(y_test, y_pred))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d')
        plt.show()
        return y_proba


In [None]:
# Run the full modelling pipeline; the steps depend on each other's state,
# so the order matters.
model = MortgageModel()
# Stores the best parameters found on model.best_params.
model.tune_hyperparameters()
# Trains with model.best_params and writes the model to 'catboost_model.pkl'.
model.fit()
# Prints the metrics, shows the confusion matrix and returns the
# second predict_proba column for X_test.
y_proba = model.evaluate()


In [None]:
# SHAP explanations
explainer = shap.TreeExplainer(model.model)
shap_values = explainer.shap_values(X_test)
# summary_plot displays the figure by default, which leaves nothing for
# savefig to write. Keep the figure open, save it, and only then display it.
shap.summary_plot(shap_values, X_test, show=False)
plt.savefig('summary_plot.png', bbox_inches='tight')
plt.show()

# Put the saved plot on a single slide and write the deck to disk.
prs = Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[5])
slide.shapes.title.text = 'SHAP Summary'
left = Inches(1)
top = Inches(1)
slide.shapes.add_picture('summary_plot.png', left, top, width=Inches(6))
prs.save('my_presentation.pptx')


In [None]:
# Prediction on new data
new_data = pd.read_csv("Potential Customers.csv", delimiter=';')
new_data = preprocessor.preprocess(new_data)
for col, le in preprocessor.label_encoders.items():
    if col in new_data.columns:
        # LabelEncoder.transform raises on categories it was not fitted on.
        # Look the codes up through classes_ instead (same codes for known
        # categories) and give unseen categories the sentinel code -1.
        codes = {label: code for code, label in enumerate(le.classes_)}
        new_data[col] = new_data[col].astype(str).map(codes).fillna(-1).astype(int)
# The model was trained on features only, so a 'target' column created by
# preprocess must not be passed to predict.
features = new_data.drop(columns=['target'], errors='ignore')
new_pred = model.model.predict(features)
new_data['Model_Mortgage Prediction'] = new_pred
new_data.to_csv('predictions.csv', index=False)
