### Importing required libraries

In [27]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import (
    OrdinalEncoder,
    OneHotEncoder,
    LabelEncoder,
    MinMaxScaler,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import time

In [28]:
# Read the raw credit dataset into a DataFrame; later cells split `data` into
# features and the "Approved_Flag" target.
# NOTE(review): if the CSV carries a leftover "Unnamed: 0" index column, it is
# not dropped here and would flow into the feature matrix — confirm.
data = pd.read_csv(filepath_or_buffer="credit_data.csv")

In [29]:
# Separate the predictors from the "Approved_Flag" target, then hold out 20% of
# the rows for evaluation. The fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on the target — confirm that the
# class balance of the hold-out set is acceptable.
features = data.drop(columns=["Approved_Flag"])
target = data["Approved_Flag"]

train_input, test_input, train_target, test_target = train_test_split(
    features, target, test_size=0.2, random_state=42
)

### Model Pipeline

In [30]:
# Nominal (unordered) categorical columns, keyed by the transformer name used
# inside the ColumnTransformer. The names are kept verbatim because they are
# how the fitted transformers are addressed later on.
ohe_columns = {
    "ohe_martial_status": "MARITALSTATUS",
    "ohe_gender": "GENDER",
    "ohe_last_prod_enq2": "last_prod_enq2",
    "ohe_first_prod_enq2": "first_prod_enq2",
}

# One independent OneHotEncoder per nominal column: the first level is dropped
# and categories unseen during fit are ignored at transform time.
ohe_transformers = [
    (
        name,
        OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"),
        [column],
    )
    for name, column in ohe_columns.items()
]

# EDUCATION is encoded against the fixed integer categories 0..6.
# NOTE(review): assumes the column already holds integer codes — confirm.
# The unknown-category policy is set at construction time so that values outside
# 0..6 map to -1 during transform, with no need to patch the fitted encoder later.
ordinal_transformers = [
    (
        "oe_education",
        OrdinalEncoder(
            categories=[np.arange(7)],
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        ),
        ["EDUCATION"],
    ),
]

# Encode the categorical columns and pass every other column through unchanged.
trf1 = ColumnTransformer(
    transformers=ohe_transformers + ordinal_transformers, remainder="passthrough"
)

# Display the configured (still unfitted) transformer.
trf1

In [31]:
# Min-max scale the columns at positions 0..50 of this transformer's input.
# NOTE(review): with remainder="drop" (the ColumnTransformer default, spelled
# out here) any column at position 51 or beyond is discarded — confirm that 51
# matches the width of the array this step receives.
trf2 = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), slice(0, 51))],
    remainder="drop",
)

In [32]:
# Encode the categorical target as integer class codes.
# The encoder learns its classes from the training labels only; the same mapping
# is then applied to the test labels.
# Note: this cell overwrites train_target/test_target, so re-running it without
# re-running the split would refit `le` on the already-encoded integers.
le = LabelEncoder()
train_target = le.fit_transform(train_target)
test_target = le.transform(test_target)

In [33]:
# Multi-class XGBoost classifier.
# - num_class is derived from the fitted label encoder instead of being
#   hard-coded, so it stays in sync with the target's classes.
# - learning_rate is the scikit-learn-API name for xgboost's "eta".
# - random_state pins the model seed for reproducible re-runs.
clf = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(le.classes_),
    learning_rate=0.15,
    gamma=0.2,
    max_depth=5,
    random_state=42,
)

# Full pipeline: categorical encoding -> scaling -> classifier.
pipe = Pipeline(steps=[("trf1", trf1), ("trf2", trf2), ("clf", clf)])
pipe.fit(train_input, train_target)

# Display the fitted steps.
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_martial_status',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['MARITALSTATUS']),
                                 ('ohe_gender',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['GENDER']),
                                 ('ohe_last_prod_enq2',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['last_prod_enq2']),
                 

### Checking for Underfitting (Training-Set Accuracy)

In [34]:
# Predict on the rows the pipeline was fitted on; the score computed from these
# predictions is the training-set accuracy.
train_pred = pipe.predict(X=train_input)

In [35]:
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

In [36]:
# Fraction of training rows whose predicted class matches the encoded label.
accuracy_score(y_true=train_target, y_pred=train_pred)

0.8211345873822472

### Model Accuracy on the Test Set

In [37]:
# Predict the encoded class for every held-out row and show the resulting array.
test_pred = pipe.predict(X=test_input)
test_pred

array([1, 3, 1, ..., 2, 1, 3], dtype=int32)

In [38]:
# Fraction of held-out rows whose predicted class matches the encoded label.
accuracy_score(y_true=test_target, y_pred=test_pred)

0.7760608581956496

In [39]:
# Weighted-average precision/recall/F-score on the held-out rows; the F-score is
# the third element of the returned tuple.
weighted_scores = precision_recall_fscore_support(
    y_true=test_target, y_pred=test_pred, average='weighted'
)
overall_f1_score = weighted_scores[2]
print(f"Overall F1 Score: {overall_f1_score:.2f}")

Overall F1 Score: 0.76


### Prediction on Unseen Data

In [50]:
# Load the unseen data from an Excel file
unseen_data = pd.read_excel("Unseen_Dataset.xlsx")

# Allow EDUCATION values outside the fitted categories at prediction time by
# mapping them to -1. The fitted encoder is looked up by its transformer name
# rather than by position, so the lookup keeps working if the transformer list
# is reordered. Note that this mutates the already-fitted pipeline in place.
ordinal_encoder = pipe.named_steps["trf1"].named_transformers_["oe_education"]
ordinal_encoder.set_params(handle_unknown="use_encoded_value", unknown_value=-1)

# Use the trained pipeline to predict the target variable for the unseen data.
# The model was fitted on LabelEncoder codes, so these are integer class codes;
# `le.inverse_transform(predicted_labels)` recovers the original label values.
predicted_labels = pipe.predict(unseen_data)

# Add the predicted labels as a new column to the unseen data DataFrame
unseen_data["Predicted_Label"] = predicted_labels

# Save the updated DataFrame with the predicted labels to a new Excel file
unseen_data.to_excel("unseen_data_with_predictions.xlsx", index=False)

# Print a header, then display how many rows were assigned to each class.
print("Predictions for unseen data:")
unseen_data["Predicted_Label"].value_counts()

Predictions for unseen data:


In [49]:
# Distinct class codes that appear among the predictions, in order of first appearance.
pd.unique(unseen_data["Predicted_Label"])

array([1, 0, 2, 3], dtype=int32)