In [497]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [498]:
customer=pd.read_csv("C:\\Users\\nguye\\OneDrive\\Desktop\\ML\\archive\\customer_data.csv")
customer=customer.sort_values('id',axis=0,ascending=True)
customer.fillna(customer['fea_2'].mean(),inplace=True)
customer['label'].value_counts()

0    900
1    225
Name: label, dtype: int64

customer’s card payment history :

id: customer id 

OVD_t1: number of times overdue type 1

OVD_t2: number of times overdue type 2

OVD_t3: number of times overdue type 3

OVD_sum: total overdue days //tổng số ngày quá hạn

pay_normal: number of times normal payment //số lần thanh toán thường

prod_code: credit product code //mã tín dụng

prod_limit: credit limit of product // hạn mức tín dụng

update_date: account update date //ngày cập nhật tài khoản

new_balance: current balance of product //số dư hiện tại

highest_balance: highest balance in history //số dư cao nhất trong lịch sử

report_date: date of recent payment //ngày thanh toán gần đây

customer_data.csv:

customer’s demographic data and category attributes which have been encoded.

Category features are fea1, fea3, fea5, fea6, fea7, fea9.

label is 1, the customer is in high credit risk

label is 0, the customer is in low credit risk


In [499]:
X=customer.drop('label',axis=1)
Y=customer['label']
np.random.seed(5)
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.3)


In [500]:
sc=StandardScaler()
Xtrain=sc.fit_transform(Xtrain)
Xtest=sc.transform(Xtest)
Xtrain
Xtest

array([[ 0.65759789, -0.34942734,  0.93493906, ...,  0.96149353,
        -0.63146772, -1.19393682],
       [ 0.6565058 , -0.34942734, -1.45626856, ...,  0.96149353,
         1.82168728,  1.11706051],
       [ 0.65754966, -0.34942734, -1.55320941, ...,  0.96149353,
        -0.11173012, -1.19393682],
       ...,
       [ 0.66222318,  1.09035965, -0.03446944, ...,  0.96149353,
        -0.62467812, -1.19393682],
       [ 0.66788095, -1.06932083, -0.13141029, ..., -1.36322355,
         1.12100592,  0.6594063 ],
       [-1.51151935, -0.34942734,  1.51658415, ..., -0.20086501,
        -0.70261934, -1.19393682]])

In [511]:

model=neighbors.KNeighborsClassifier(n_neighbors=16,p=1,weights="distance")
model.fit(Xtrain,Ytrain)
Ypredict=model.predict(Xtest)
100*accuracy_score(Ytest,Ypredict)

79.88165680473372

In [514]:
from sklearn.model_selection import GridSearchCV

k = np.arange(1, 20).tolist()
p = [1, 2]

param_grid = dict(n_neighbors = k, p = p)

clf = neighbors.KNeighborsClassifier()

grid_model = GridSearchCV(estimator = clf, param_grid = param_grid, cv = 5)
Ypredict = grid_model.fit(Xtrain, Ytrain)

best_score, best_params =  Ypredict.best_score_, Ypredict.best_params_
print(best_score)

0.799250181407724


In [503]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(Xtrain,Ytrain)
Ypredict=model.predict(Xtest)
100*accuracy_score(Ytest,Ypredict)

79.88165680473372

In [504]:
from  sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=200,random_state=50)
model.fit(Xtrain,Ytrain)
Ypredict=model.predict(Xtest)
100*accuracy_score(Ytest,Ypredict)

79.58579881656804

In [505]:
import xgboost as xgb
model=xgb.XGBClassifier(n_estimators=20,random_state=5)
model.fit(Xtrain,Ytrain)
Ypredict=model.predict(Xtest)
100*accuracy_score(Ytest,Ypredict)





77.2189349112426

In [506]:
from  sklearn.ensemble import GradientBoostingClassifier
model=GradientBoostingClassifier(learning_rate=0.3,n_estimators=100,random_state=20)
model.fit(Xtrain,Ytrain)
Ypredict=model.predict(Xtest)
100*accuracy_score(Ytest,Ypredict)

76.92307692307693

In [507]:
# from sklearn.neighbors import KNeighborsClassifier
# cls=KNeighborsClassifier()
# cls.fit(Xtrain,Ytrain)
# Ypredict=cls.predict(Xtest)
# 100*accuracy_score(Ytest,Ypredict)

In [508]:
confusion_matrix(Ytest,Ypredict)


array([[249,  21],
       [ 57,  11]], dtype=int64)

In [509]:
# Xset,Yset=Xtrain,Ytrain
# x1,x2=np.meshgrid(np.arange(start=Xset[:,0].min()-1,stop=Xset[:,0].max()+1,step=0.01),
#                   np.arange(start=Xset[:,1].min()-1,stop=Xset[:,1].max()+1,step=0.01))
# plt.contourf(x1,x2,cls.predict(np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
#         alpha=0.75,cmap=ListedColormap(('red','green')))
# plt.xlim(x1.min(),x1.max())
# plt.ylim(x2.min(),x2.max())
# for i,j in enumerate(np.uniqur(Yset)):
#     plt.scatter(Xset[Yset==j,0],Xset[Yset==j,1],
#     c=ListedColormap(('red','green'))(i),label=j)