In [97]:
#import libraries
import pandas as pd #data processing
import numpy as np #math

import seaborn as sns #data visualisation
from matplotlib import pyplot as plt #plotting
plt.style.use('ggplot')
from mpl_toolkits.mplot3d import Axes3D #make the 3D
from sklearn.model_selection import train_test_split #split the model

#data processing

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE #upsampling - SMOTE
from sklearn import preprocessing #data preprocessing - normalize/standardize

#models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier #random forest classifier
from sklearn.svm import SVC #svm

#evaluation
from sklearn.metrics import roc_auc_score #area under ROC
from sklearn.metrics import classification_report #per-class precision/recall/f1
import statsmodels.api as sm


# import warnings filter
from warnings import simplefilter
# Silence warnings for the rest of the notebook. No `category` argument is
# passed, so this ignores EVERY warning category, not only FutureWarning.
simplefilter(action='ignore')


In [2]:
 #i/o and data preparation
# Read the raw table: header = 1 takes the column names from the second row
# of the file, index_col = 0 uses the first column as the row index.
card = pd.read_csv("card.csv", header = 1, index_col = 0)

# Whole-dataset views of the two outcome classes. They are taken before the
# dtype conversion below, so their columns keep the dtypes read from the file.
default = card.loc[card['default payment next month'].eq(1)]
no_default = card.loc[card['default payment next month'].eq(0)]

#categorical variables: SEX, EDUCATION, MARRIAGE and the repayment-status
#columns PAY_0, PAY_2, ..., PAY_6 (the list goes from PAY_0 straight to PAY_2)
for column in ("SEX", "EDUCATION", "MARRIAGE",
               "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"):
    card[column] = card[column].astype('category')

#the target column is left as read from the file; its conversion is disabled:
#card["default payment next month"] = card["default payment next month"].astype('category')


In [3]:
#partition the customers by credit limit: "rich" are the rows with
#LIMIT_BAL strictly above 300000, "poor" is every row whose index label
#does not appear in rich
is_rich = card["LIMIT_BAL"] > 300000
rich = card.loc[is_rich]
poor = card.loc[~card.index.isin(rich.index)]

In [None]:
#frequency of default (1) vs. no default (0) within each credit-limit group
# Each group is paired with its label up front, so no whole-frame .equals()
# comparison is needed to recover the name inside the loop.
for label, group in (('rich', rich), ('poor', poor)):
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    defaults = group['default payment next month'].value_counts(sort = True)

    fig, ax = plt.subplots()
    defaults.plot(kind = 'bar', rot = 0, color = ['coral','b'], ax = ax)

    # the bar heights are exactly the class counts, so their sum is the group size
    total = defaults.sum()

    # annotate every bar with its share of the group, in percent
    for bar in ax.patches:
        # get_x pulls left or right; get_height pushes up or down
        ax.text(bar.get_x() + 0.1, bar.get_height(),
                str(round((bar.get_height()/total)*100, 2))+'%',
                fontsize=15, color='dimgrey')

    ax.set_title("Default for " + label + ' people, amount = ' + str(len(group)))
    ax.set_xlabel("Default")
    ax.set_ylabel("Frequency")
    plt.show()
    

In [None]:
#correlation matrix
# One lower-triangle Pearson heatmap per (credit-limit group, outcome) pair.
# The subsets are held in loop-local names so that the notebook-level
# `default` / `no_default` frames from the data-preparation cell are not
# overwritten by running this cell.
for label, group in (('rich', rich), ('poor', poor)):
    outcome = group['default payment next month']
    subsets = (("default", group[outcome == 1]),
               ("do not default", group[outcome == 0]))

    for default_status, subset in subsets:
        correlation_matrix = subset.corr(method = 'pearson')

        fig, ax = plt.subplots(figsize=(12,9))
        ax.set_title("Correlation matrix for " + label + " people who " + default_status)

        # hide the upper triangle (diagonal included): it mirrors the lower one.
        # The builtin bool is used because the np.bool alias was removed in
        # NumPy 1.24.
        mask = np.zeros_like(correlation_matrix, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

        sns.heatmap(correlation_matrix, vmax=1, vmin = -1, cmap = 'coolwarm',
                    annot = True, mask = mask, square = True, ax = ax)
        plt.show()
        

In [91]:
#do logistic regression
# For each credit-limit group: split the rows, oversample the training part
# with SMOTE, select features (RFECV, then a p-value filter), fit the final
# model and report its performance on the held-out test rows.
# NOTE(review): the random forest and SVM cells repeat this pipeline almost
# line for line; a shared function taking the estimator would remove the copies.

target_col = 'default payment next month'

for label, group in (('rich', rich), ('poor', poor)):
    features = group.drop([target_col], axis = 1)
    feature_names = features.columns

    #normalize the features
    # NOTE(review): preprocessing.normalize rescales each ROW (sample) to unit
    # norm; it does not put the columns on a common scale. If per-feature
    # scaling was intended, a scaler fitted on the training rows is needed.
    features = pd.DataFrame(data = preprocessing.normalize(features), columns = feature_names)

    #split the data
    # NOTE(review): test_size = 2/3 holds out two thirds of the rows for
    # testing, leaving one third for training -- confirm this is intended.
    train_features, test_features, train_target, test_target = train_test_split(
        features, group[target_col], test_size = 2/3, random_state = 123)

    #split off the validation set (20-80 split of the training rows)
    # NOTE(review): x_val / y_val are not used anywhere below.
    x_train, x_val, y_train, y_val = train_test_split(
        train_features, train_target, test_size = .2, random_state = 123)

    #do resampling -- upsampling using SMOTE
    #we will not be doing downsampling because the training data set will become quite small.
    # fit_resample is the supported method name; the old fit_sample alias has
    # been removed from imbalanced-learn.
    x_train_res, y_train_res = SMOTE(random_state = 123).fit_resample(x_train, y_train)

    # np.asarray makes the rebuild independent of whether the sampler returned
    # numpy arrays or pandas objects; both frames get a fresh 0..n-1 index.
    x_train = pd.DataFrame(data = np.asarray(x_train_res), columns = feature_names)
    y_train = pd.DataFrame(data = np.asarray(y_train_res).ravel(), columns = [target_col])

    #model: logistic regression
    logreg = LogisticRegression(random_state = 123, solver = 'lbfgs')

    #feature selection -- using Recursive Feature Elimination with Cross Validation
    rfecv = RFECV(estimator = logreg, step = 1, cv = StratifiedKFold(10),
                  scoring = 'accuracy') #this can be changed to determine which feature is best
    rfecv.fit(x_train, y_train.values.ravel())

    print(label)
    #print('Optimal number of features: {}'.format(rfecv.n_features_))

    # support_ is the boolean mask of the rank-1 features. The names are taken
    # from the columns the selector was fitted on, not from the column order
    # of `card`.
    selected_features = list(feature_names[rfecv.support_])

    #second filter: of the RFECV features, keep those whose Logit coefficient
    #has a p-value smaller than 0.05
    x = x_train[selected_features]
    logit_model = sm.Logit(y_train, x)
    result = logit_model.fit()

    # label-based selection (no positional indexing into the p-value Series)
    p_values = result.pvalues
    significant = p_values[p_values < 0.05].index.tolist()
    if significant:
        selected_features = significant
    else:
        # an empty selection would make the final fit fail on zero columns
        print('No feature has a p-value below 0.05; keeping the RFECV selection')

    x = x_train[selected_features]
    test_features = test_features[selected_features]

    #the model
    logreg = LogisticRegression(random_state = 123, solver = 'lbfgs').fit(x, y_train.values.ravel())
    prediction = logreg.predict(test_features)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_features, test_target)))

    print(classification_report(test_target, prediction))
    

rich
Optimization terminated successfully.
         Current function value: 0.614139
         Iterations 7
Accuracy of logistic regression classifier on test set: 0.63
              precision    recall  f1-score   support

           0       0.93      0.63      0.75      2637
           1       0.22      0.68      0.33       405

    accuracy                           0.63      3042
   macro avg       0.57      0.65      0.54      3042
weighted avg       0.83      0.63      0.69      3042

poor
Optimization terminated successfully.
         Current function value: 0.669164
         Iterations 5
Accuracy of logistic regression classifier on test set: 0.59
              precision    recall  f1-score   support

           0       0.81      0.61      0.70     12989
           1       0.30      0.54      0.38      3969

    accuracy                           0.59     16958
   macro avg       0.55      0.57      0.54     16958
weighted avg       0.69      0.59      0.62     16958



In [95]:
#do random forest classifier
# For each credit-limit group: split the rows, oversample the training part
# with SMOTE, select features (RFECV, then a p-value filter), fit the final
# model and report its performance on the held-out test rows.
# NOTE(review): the logistic regression and SVM cells repeat this pipeline
# almost line for line; a shared function taking the estimator would remove
# the copies.

target_col = 'default payment next month'

for label, group in (('rich', rich), ('poor', poor)):
    features = group.drop([target_col], axis = 1)
    feature_names = features.columns

    #normalize the features
    # NOTE(review): preprocessing.normalize rescales each ROW (sample) to unit
    # norm; it does not put the columns on a common scale. If per-feature
    # scaling was intended, a scaler fitted on the training rows is needed.
    features = pd.DataFrame(data = preprocessing.normalize(features), columns = feature_names)

    #split the data
    # NOTE(review): test_size = 2/3 holds out two thirds of the rows for
    # testing, leaving one third for training -- confirm this is intended.
    train_features, test_features, train_target, test_target = train_test_split(
        features, group[target_col], test_size = 2/3, random_state = 123)

    #split off the validation set (20-80 split of the training rows)
    # NOTE(review): x_val / y_val are not used anywhere below.
    x_train, x_val, y_train, y_val = train_test_split(
        train_features, train_target, test_size = .2, random_state = 123)

    #do resampling -- upsampling using SMOTE
    #we will not be doing downsampling because the training data set will become quite small.
    # fit_resample is the supported method name; the old fit_sample alias has
    # been removed from imbalanced-learn.
    x_train_res, y_train_res = SMOTE(random_state = 123).fit_resample(x_train, y_train)

    # np.asarray makes the rebuild independent of whether the sampler returned
    # numpy arrays or pandas objects; both frames get a fresh 0..n-1 index.
    x_train = pd.DataFrame(data = np.asarray(x_train_res), columns = feature_names)
    y_train = pd.DataFrame(data = np.asarray(y_train_res).ravel(), columns = [target_col])

    #model: random forest classifier
    rfc = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 123)

    #feature selection -- using Recursive Feature Elimination with Cross Validation
    rfecv = RFECV(estimator = rfc, step = 1, cv = StratifiedKFold(10),
                  scoring = 'accuracy') #this can be changed to determine which feature is best
    rfecv.fit(x_train, y_train.values.ravel())

    print(label)
    #print('Optimal number of features: {}'.format(rfecv.n_features_))

    # support_ is the boolean mask of the rank-1 features. The names are taken
    # from the columns the selector was fitted on, not from the column order
    # of `card`.
    selected_features = list(feature_names[rfecv.support_])

    #second filter: of the RFECV features, keep those whose Logit coefficient
    #has a p-value smaller than 0.05
    # NOTE(review): in the output recorded for this cell the Logit fit stops
    # at 35 iterations with function value 12.447 (rich) and inf (poor),
    # apparently without converging, so these p-values may not be reliable.
    x = x_train[selected_features]
    logit_model = sm.Logit(y_train, x)
    result = logit_model.fit()

    # label-based selection (no positional indexing into the p-value Series)
    p_values = result.pvalues
    significant = p_values[p_values < 0.05].index.tolist()
    if significant:
        selected_features = significant
    else:
        # an empty selection would make the final fit fail on zero columns
        print('No feature has a p-value below 0.05; keeping the RFECV selection')

    x = x_train[selected_features]
    test_features = test_features[selected_features]

    #the model
    rfc = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 123).fit(x, y_train.values.ravel())
    prediction = rfc.predict(test_features)
    print('Accuracy of random forest classifier on test set: {:.2f}'.format(rfc.score(test_features, test_target)))

    print(classification_report(test_target, prediction))
    

rich
         Current function value: 12.447000
         Iterations: 35
Accuracy of random forest classifier on test set: 0.80




              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2637
           1       0.28      0.32      0.30       405

    accuracy                           0.80      3042
   macro avg       0.59      0.60      0.59      3042
weighted avg       0.81      0.80      0.81      3042

poor
         Current function value: inf
         Iterations: 35


  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Accuracy of random forest classifier on test set: 0.77
              precision    recall  f1-score   support

           0       0.84      0.87      0.85     12989
           1       0.52      0.45      0.48      3969

    accuracy                           0.77     16958
   macro avg       0.68      0.66      0.67     16958
weighted avg       0.76      0.77      0.77     16958



In [98]:
#do support vector machine
# For each credit-limit group: split the rows, oversample the training part
# with SMOTE, select features (RFECV, then a p-value filter), fit the final
# model and report its performance on the held-out test rows.
# NOTE(review): the logistic regression and random forest cells repeat this
# pipeline almost line for line; a shared function taking the estimator would
# remove the copies.

target_col = 'default payment next month'

for label, group in (('rich', rich), ('poor', poor)):
    features = group.drop([target_col], axis = 1)
    feature_names = features.columns

    #normalize the features
    # NOTE(review): preprocessing.normalize rescales each ROW (sample) to unit
    # norm; it does not put the columns on a common scale. If per-feature
    # scaling was intended, a scaler fitted on the training rows is needed.
    features = pd.DataFrame(data = preprocessing.normalize(features), columns = feature_names)

    #split the data
    # NOTE(review): test_size = 2/3 holds out two thirds of the rows for
    # testing, leaving one third for training -- confirm this is intended.
    train_features, test_features, train_target, test_target = train_test_split(
        features, group[target_col], test_size = 2/3, random_state = 123)

    #split off the validation set (20-80 split of the training rows)
    # NOTE(review): x_val / y_val are not used anywhere below.
    x_train, x_val, y_train, y_val = train_test_split(
        train_features, train_target, test_size = .2, random_state = 123)

    #do resampling -- upsampling using SMOTE
    #we will not be doing downsampling because the training data set will become quite small.
    # fit_resample is the supported method name; the old fit_sample alias has
    # been removed from imbalanced-learn.
    x_train_res, y_train_res = SMOTE(random_state = 123).fit_resample(x_train, y_train)

    # np.asarray makes the rebuild independent of whether the sampler returned
    # numpy arrays or pandas objects; both frames get a fresh 0..n-1 index.
    x_train = pd.DataFrame(data = np.asarray(x_train_res), columns = feature_names)
    y_train = pd.DataFrame(data = np.asarray(y_train_res).ravel(), columns = [target_col])

    #feature ranking model: a random forest, not the SVM itself
    # RFECV needs an estimator exposing coef_ or feature_importances_, which
    # an RBF-kernel SVC does not provide -- presumably why the forest is used
    # here; confirm.
    rfc = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 123)

    #feature selection -- using Recursive Feature Elimination with Cross Validation
    rfecv = RFECV(estimator = rfc, step = 1, cv = StratifiedKFold(10),
                  scoring = 'accuracy') #this can be changed to determine which feature is best
    # pass a flat 1-D target, as the other model cells do
    rfecv.fit(x_train, y_train.values.ravel())

    print(label)
    #print('Optimal number of features: {}'.format(rfecv.n_features_))

    # support_ is the boolean mask of the rank-1 features. The names are taken
    # from the columns the selector was fitted on, not from the column order
    # of `card`.
    selected_features = list(feature_names[rfecv.support_])

    #second filter: of the RFECV features, keep those whose Logit coefficient
    #has a p-value smaller than 0.05
    # NOTE(review): in the output recorded for this cell the Logit fit stops
    # at 35 iterations with function value 12.447 (rich) and inf (poor),
    # apparently without converging, so these p-values may not be reliable.
    x = x_train[selected_features]
    logit_model = sm.Logit(y_train, x)
    result = logit_model.fit()

    # label-based selection (no positional indexing into the p-value Series)
    p_values = result.pvalues
    significant = p_values[p_values < 0.05].index.tolist()
    if significant:
        selected_features = significant
    else:
        # an empty selection would make the final fit fail on zero columns
        print('No feature has a p-value below 0.05; keeping the RFECV selection')

    x = x_train[selected_features]
    test_features = test_features[selected_features]

    #the model
    svm = SVC(kernel = 'rbf',  random_state = 123).fit(x, y_train.values.ravel())
    prediction = svm.predict(test_features)
    print('Accuracy of support vector machine on test set: {:.2f}'.format(svm.score(test_features, test_target)))

    print(classification_report(test_target, prediction))
    

rich
         Current function value: 12.447000
         Iterations: 35
Accuracy of support vector machine on test set: 0.76
              precision    recall  f1-score   support

           0       0.88      0.85      0.86      2637
           1       0.19      0.23      0.21       405

    accuracy                           0.76      3042
   macro avg       0.53      0.54      0.54      3042
weighted avg       0.79      0.76      0.78      3042

poor
         Current function value: inf
         Iterations: 35
Accuracy of support vector machine on test set: 0.55
              precision    recall  f1-score   support

           0       0.81      0.54      0.64     12989
           1       0.28      0.58      0.38      3969

    accuracy                           0.55     16958
   macro avg       0.54      0.56      0.51     16958
weighted avg       0.68      0.55      0.58     16958



In [None]:
# Summary statistics and per-column histograms of the test data, used to spot
# anomalies in it before learning.
# NOTE(review): `test` is not assigned in any cell visible in this notebook --
# confirm where it comes from; without it this cell fails on a fresh kernel.

# NOTE(review): this is not the last expression of the cell, so the notebook
# does not display its result.
test.describe()
#see the skewness
#print(test.skew(axis = 0,numeric_only = True))
#print(test.kurt(axis = 0,numeric_only = True))

# columns whose Series has no .cat accessor, i.e. the non-categorical ones
plain_columns = [name for name in test if not hasattr(test[name], 'cat')]

for name in plain_columns:
    #histogram of the column, 50 bins
    plt.hist(test[name], bins = 50)
    plt.title(name)
    plt.xlabel("Amount")
    plt.ylabel("Number of people")
    plt.show()
    #box and whisker plot (disabled)
    #plt.boxplot(test[name])
    #plt.title(name)
    #plt.ylabel("Number of people")
    #plt.xlabel("Amount")
    #plt.show()
        

In [None]:
# FIXME(review): unfinished cell. The column key is an empty string,
# plt.scatter needs both x and y values, and `data` is only assigned by the
# cell that follows this one -- fill in the intended columns or remove the cell.
plt.scatter(data[""])

In [None]:
# Scatter of BILL_AMT1 against PAY_AMT1, one figure per repayment-status
# value of each PAY_* column, with defaulters and non-defaulters overlaid.
# NOTE(review): `test` is not assigned in any cell visible in this notebook --
# confirm where it comes from; without it this cell fails on a fresh kernel.
months = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]

for month in months:
    # np.sort over np.asarray works whether unique() returns a plain array or
    # a Categorical (the PAY_* columns of `card` are category dtype); an
    # in-place .sort() is only available on the plain array.
    statuses = np.sort(np.asarray(test[month].unique()))

    for status in statuses:
        data = test[test[month] == status]
        defaulted = data['default payment next month'] == 1
        not_defaulted = data['default payment next month'] == 0

        fig, ax = plt.subplots(figsize = (10,10))

        #customers who default next month
        ax.scatter(data.loc[defaulted, "BILL_AMT1"],
                   data.loc[defaulted, "PAY_AMT1"],
                   c = 'red', label = '1', marker = '.')

        #customers who do not default; drawn semi-transparent
        ax.scatter(data.loc[not_defaulted, "BILL_AMT1"],
                   data.loc[not_defaulted, "PAY_AMT1"],
                   c = 'blue', label = '0', marker = '+', alpha = 0.3)

        ax.set_xlabel('BILL_AMT1')
        ax.set_ylabel('PAY_AMT1')
        ax.set_title(month + "= " + str(status))
        ax.legend()
        plt.show()
        # release the figure; this loop creates one per status value
        plt.close(fig)