In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import warnings
warnings.simplefilter(action= 'ignore')

In [2]:
# Import tools to explore dataset * Note more may be needed later.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# URL variable for the dataset to use in pd.read
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"

# Load the dataset and skip the first row header since we don't need that.
df = pd.read_excel(url, skiprows=[0])

# Display the first 9 rows
df.head(9)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
5,6,50000,1,1,2,37,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
6,7,500000,1,1,2,29,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
7,8,100000,2,2,2,23,0,-1,-1,0,...,221,-159,567,380,601,0,581,1687,1542,0
8,9,140000,2,3,1,28,0,0,2,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0


In [3]:
# column names normalization

df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
last_column = df.columns[-1]
df = df[[last_column] + list(df.columns[:-1])]

In [5]:


df.rename(columns={'pay_0': 'sept_repayment', 'pay_2': 'aug_repayment','pay_3': 
                   'july_repayment', 'pay_4': 'june_repayment','pay_5': 'may_repayment',
                   'pay_6': 'april_repayment'}, inplace=True)

In [6]:

df.rename(columns={'bill_amt1': 'sept_billAmt', 'bill_amt2': 'aug_billAmt','bill_amt3': 
                   'july_billAmt', 'bill_amt4': 'june_billAmt','bill_amt5': 'may_billAmt',
                   'bill_amt6': 'april_billAmt'}, inplace=True)

In [7]:
df.rename(columns={'pay_amt1': 'sept_AmtPaid', 'pay_amt2': 'aug_AmtPaid','pay_amt3': 
                   'july_AmtPaid', 'pay_amt4': 'june_AmtPaid','pay_amt5': 'may_AmtPaid',
                   'pay_amt6': 'april_AmtPaid'}, inplace=True)

In [8]:
df['education'] = df['education'].replace({5: 4, 6: 4,0: 4})

In [9]:
df['education'].value_counts()

education
2    14030
1    10585
3     4917
4      468
Name: count, dtype: int64

In [10]:
# Mapping  -2, -1, and 0 to -1 to indicate "paid on time" and mapping all the low instances into 3
df[['sept_repayment', 'aug_repayment', 'july_repayment', 'june_repayment', 'may_repayment', 'april_repayment']] = df[['sept_repayment', 'aug_repayment', 'july_repayment', 'june_repayment', 'may_repayment', 'april_repayment']].replace({-2: -1, 0: -1,8:3,7:3,6:3,5:3,4:3})


In [11]:
df['april_repayment'].value_counts()

april_repayment
-1    26921
 2     2766
 3      313
Name: count, dtype: int64

In [12]:
df['marriage'] = df['marriage'].replace({0: 3})

In [13]:

df.drop('id', axis=1, inplace=True)



In [14]:
df.duplicated().sum()

36

In [15]:
df.drop_duplicates(inplace=True)

In [16]:
df.reset_index(drop=True, inplace=True)

In [17]:
len(df)

29964

In [18]:
X = df.drop(['default_payment_next_month'],axis = 1)
y = df['default_payment_next_month']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [20]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)

In [21]:
x_train_scaled= scaler.transform(X_train)
x_test_scaled = scaler.transform(X_test)

In [22]:
from imblearn.under_sampling import TomekLinks,NearMiss
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.combine import SMOTETomek

def sampler_function(data_x, data_y, sampler = 0, random_state = 42):
    
    if sampler == 0:
        sampler = RandomOverSampler(random_state = random_state)
    elif sampler == 1:
        sampler = TomekLinks()
    elif sampler == 2:
        sampler = SMOTE()
    elif sampler == 3:
        sampler = SMOTETomek()
    else: 
        sampler = NearMiss()
    X_transformed, y_transformed = sampler.fit_resample(data_x, data_y)
    
    print('Original dataset shape:', data_y.shape)
    print('Resample dataset shape:', y_transformed.shape)
    
    return X_transformed, y_transformed

In [23]:
X_train_res, y_train_res = sampler_function(x_train_scaled, y_train,sampler =2)

Original dataset shape: (20974,)
Resample dataset shape: (32704,)


In [24]:
from sklearn.ensemble import GradientBoostingClassifier

gb_resampled = GradientBoostingClassifier()

gb_resampled.fit(X_train_res, y_train_res)

y_pred_gb_resampled = gb_resampled.predict(x_test_scaled)

print(classification_report(y_test,y_pred_gb_resampled))

              precision    recall  f1-score   support

           0       0.86      0.87      0.86      6982
           1       0.53      0.52      0.52      2008

    accuracy                           0.79      8990
   macro avg       0.70      0.69      0.69      8990
weighted avg       0.79      0.79      0.79      8990



In [27]:
from sklearn.model_selection import cross_val_score

In [28]:
cv_scores_gb_resampled = cross_val_score(gb_resampled, X_train_res, y_train_res, cv=5)
print("Cross-validation accuracy: {:.2f} %".format(cv_scores_gb_resampled.mean()*100))

Cross-validation accuracy: 77.88 %


In [35]:
pwd

'/Users/sot/Documents/gitPractical/SDS-CP014-credit-card-default-pred/web-app/Patrick_APP'

In [29]:
import joblib

In [30]:
final_model = gb_resampled

In [31]:
final_scaler = scaler

In [36]:
joblib.dump(final_model, 'gb_credit_model.pk1')

['gb_credit_model.pk1']

In [37]:
joblib.dump(final_scaler, 'gb_credit_scaler.pk1')

['gb_credit_scaler.pk1']

In [39]:
loaded_model = joblib.load('gb_credit_model.pk1')

In [40]:
loaded_model

In [41]:
loaded_scaler = joblib.load('gb_credit_scaler.pk1')

In [42]:
y_train.head(2)

27056    1
18468    1
Name: default_payment_next_month, dtype: int64

In [43]:
X_train.head(1)

Unnamed: 0,limit_bal,sex,education,marriage,age,sept_repayment,aug_repayment,july_repayment,june_repayment,may_repayment,...,july_billAmt,june_billAmt,may_billAmt,april_billAmt,sept_AmtPaid,aug_AmtPaid,july_AmtPaid,june_AmtPaid,may_AmtPaid,april_AmtPaid
27056,80000,1,2,1,39,2,2,2,2,2,...,55132,56629,57405,58742,1200,5000,3000,2300,2400,0


In [44]:
data = X_train.iloc[0]

In [45]:
scaled_data = loaded_scaler.transform(data.values.reshape(1, -1))

In [46]:
prediction = loaded_model.predict(scaled_data)

In [47]:
prediction 

array([1])

In [2]:
!pip list



Package                   Version
------------------------- --------------
altair                    5.5.0
anyio                     4.7.0
appnope                   0.1.4
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.4
attrs                     24.2.0
babel                     2.16.0
beautifulsoup4            4.12.3
bleach                    6.2.0
blinker                   1.9.0
cachetools                5.5.0
certifi                   2024.8.30
cffi                      1.17.1
charset-normalizer        3.4.0
click                     8.1.7
comm                      0.2.2
contourpy                 1.3.0
cycler                    0.12.1
debugpy                   1.8.9
decorator                 5.1.1
defusedxml                0.7.1
exceptiongroup            1.2.2
executing                 2.1.0
fastjsonschema            2.21.1
fonttools                 4.55.2
fqdn            