<a href="https://colab.research.google.com/github/nikkijha97/Project/blob/main/code_credit%20risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import logging
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import pickle


In [2]:
# Read the raw client records from the CSV file (path is relative to the
# notebook's working directory).
data = pd.read_csv(filepath_or_buffer="default of credit card clients.csv")

In [3]:
# Extract the label column as a NumPy array and echo it as the cell output.
target = data.loc[:, 'default payment next month'].values
target

array([1, 1, 0, ..., 1, 1, 1])

In [4]:
# Remove the label column so `data` no longer contains it, then display the
# remaining frame.
data = data.drop(columns=['default payment next month'])
data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,1,20000,2,2,1,24,2,2,-1,-1,...,689,0,0,0,0,689,0,0,0,0
1,2,120000,2,2,2,26,-1,2,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,3,90000,2,2,2,34,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,4,50000,2,2,1,37,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,5,50000,1,2,1,57,-1,0,-1,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,208365,88004,31237,15980,8500,20000,5003,3047,5000,1000
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,3502,8979,5190,0,1837,3526,8998,129,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,2758,20878,20582,19357,0,0,22000,4200,2000,3100
29998,29999,80000,1,3,1,41,1,-1,0,0,...,76304,52774,11855,48944,85900,3409,1178,1926,52964,1804


In [5]:
# Derive indicator features from the repayment-status columns (PAY_0, PAY_2, ...).
#
# For each status column:
#   * "<col>_no_card_use" is 1 where the status equals -2, else 0
#   * "<col>_payed_off"   is 1 where the status equals -1, else 0
#   * the status column itself has every negative value replaced by 0
#
# The status columns are selected by name (exactly "PAY_" followed by digits,
# which excludes PAY_AMT* and the derived flag columns) instead of by position,
# so a change in column layout cannot silently redirect the loop to the wrong
# columns.
status_columns = data.columns[data.columns.str.fullmatch(r"PAY_\d+")]

for variable in status_columns:
    no_use_flag = variable + "_no_card_use"
    paid_off_flag = variable + "_payed_off"

    # Re-running the cell must not recompute the flags: the status column has
    # already been floored at 0 by then, so both flags would become all zeros.
    if no_use_flag in data.columns and paid_off_flag in data.columns:
        continue

    status = data[variable]
    data[no_use_flag] = np.where(status == -2, 1, 0)
    data[paid_off_flag] = np.where(status == -1, 1, 0)
    data[variable] = np.where(status < 0, 0, status)

In [6]:
# Columns that get one-hot encoded; every other column is passed through.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

# Two-stage preprocessing pipeline:
#   'categorical' - one-hot encode the categorical columns (dense output;
#                   categories not seen during fit are ignored at transform
#                   time) and forward all remaining columns unchanged;
#   'scaler'      - standardise every resulting column, encoded ones included.
# NOTE(review): remainder='passthrough' forwards every non-categorical column
# of the input frame, which includes ID when present - confirm that a row
# identifier is meant to reach the model.
preprocessor = Pipeline(steps=[
    (
        'categorical',
        ColumnTransformer(
            remainder='passthrough',
            transformers=[
                (
                    'cat',
                    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
                    categorical_features,
                ),
            ],
        ),
    ),
    ('scaler', StandardScaler()),
])
categorical_features

['SEX', 'EDUCATION', 'MARRIAGE']

In [7]:
# Fit the encoder and the scaler on the complete feature frame (all rows).
preprocessor.fit(X=data)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [8]:
# Run every row through the fitted preprocessing steps.
pre_data = preprocessor.transform(X=data)

In [9]:
def optimize_classifier(train_data, train_target):
    """
    Tune a gradient-boosting classifier with a cross-validated grid search.

    Candidates are scored by ROC AUC over 10 folds, using 4 parallel jobs.
    Only ``max_features`` has more than one candidate value; every other
    hyper-parameter in the grid is pinned to a single value.

    Parameters
    ----------
    train_data
        Feature matrix handed to ``GridSearchCV.fit``.
    train_target
        Labels aligned with ``train_data``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search.
    """
    search_space = {
        'learning_rate': [0.05],
        'n_estimators': [250],
        'max_depth': [5],
        'subsample': [0.6],
        'max_features': ['sqrt', 'log2'],
    }
    # Fixed seed so the row subsampling inside boosting is repeatable.
    base_model = GradientBoostingClassifier(random_state=1000)

    search = GridSearchCV(
        estimator=base_model,
        param_grid=search_space,
        scoring='roc_auc',
        cv=10,
        n_jobs=4,
        verbose=1,
    )
    # fit() returns the search object itself, so the result can be read off
    # directly.
    return search.fit(train_data, train_target).best_estimator_


In [10]:
# Hold out 20% of the rows for testing.
#
# The split is performed on the raw feature frame rather than on a matrix that
# was preprocessed with statistics from all rows. Fitting the encoder/scaler on
# the full data set lets information from the held-out rows leak into the
# training features (and into the preprocessor that is later saved alongside
# the model). With the same random_state and row count the partition of rows
# is the same as when splitting the preprocessed matrix.
raw_train, raw_test, train_target, test_target = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=1000,
    shuffle=True)

# Refit the preprocessing on the training rows only, then apply it to each
# partition. train_data / test_data remain preprocessed matrices, so the
# downstream cells can use them unchanged.
preprocessor.fit(raw_train)
train_data = preprocessor.transform(raw_train)
test_data = preprocessor.transform(raw_test)

In [11]:

# Run the grid search on the training partition and keep the winning model.
best_estimator = optimize_classifier(train_data, train_target)

# Bundle the already-fitted preprocessor and classifier into one pipeline
# object; nothing is refitted here.
# NOTE(review): the "<col>_no_card_use" / "<col>_payed_off" columns are built
# outside this pipeline, so inputs presumably must already contain them
# before predict is called - confirm against the consumer of the pickle.
fitted_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_estimator),
])

Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [12]:
# Persist the combined preprocessing + classifier pipeline to disk.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage
# collection.
# NOTE: pickle can execute arbitrary code on load - only unpickle this file
# from a trusted location.
filename = "fitted_pipeline.pkl"
with open(filename, 'wb') as model_file:
    pickle.dump(fitted_pipeline, model_file)