# default of credit card clients

Dataset is from https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients

n = 30000

23 features

In [2]:
import pandas as pd

In [3]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/default%20of%20credit%20card%20clients.csv"
df = pd.read_csv(url)

In [4]:
df.shape

(30000, 24)

In [5]:
print(df.head())

       X1  X2  X3  X4  X5  X6  X7  X8  X9  X10  ...    X15    X16    X17  \
0   20000   2   2   1  24   2   2  -1  -1   -2  ...      0      0      0   
1  120000   2   2   2  26  -1   2   0   0    0  ...   3272   3455   3261   
2   90000   2   2   2  34   0   0   0   0    0  ...  14331  14948  15549   
3   50000   2   2   1  37   0   0   0   0    0  ...  28314  28959  29547   
4   50000   1   2   1  57  -1   0  -1   0    0  ...  20940  19146  19131   

    X18    X19    X20   X21   X22   X23  Y  
0     0    689      0     0     0     0  1  
1     0   1000   1000  1000     0  2000  1  
2  1518   1500   1000  1000  1000  5000  0  
3  2000   2019   1200  1100  1069  1000  0  
4  2000  36681  10000  9000   689   679  0  

[5 rows x 24 columns]


In [6]:
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [7]:
X = df.drop("Y", axis=1)
y = df["Y"]

In [8]:
X.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [9]:
y.value_counts()

0    23364
1     6636
Name: Y, dtype: int64

In [10]:
#y = y.replace({"neg" : 0, "pos" : 1})
y.value_counts()

0    23364
1     6636
Name: Y, dtype: int64

## Logistic Regression

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create and train the logistic regression model
#model = LogisticRegression()
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Make predictions on the test set
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Calculate accuracy
accuracy = accuracy_score(y_train, y_train_pred)
print("Test Accuracy:", accuracy)
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7780555555555555
Test Accuracy: 0.77975


# Divide and Conquer

Divide the training data into 11 batches, train a logistic model on each of the batch, and then combine the 11 prediction results. Consider the following two ensemble methods:
- majority voting
- average (or sum) of the logit output and then make decision based on its sign

In [12]:
def iterate_process(X, y):
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    
    # Define the number of batches
    num_batches = 11
    
    # Randomly shuffle the data indices
    #indices = np.random.permutation(len(X))
    
    #change 7/12
    indices = np.random.permutation(len(X_train))
    
    # Calculate the batch size
    batch_size = len(X_train) // num_batches
    
    # Make predictions on the test set using majority voting
    preds_voting = np.zeros(len(y_test))
    # Make predictions on the test set using average of logit
    preds_logit = np.zeros(len(y_test))
    #Make predictions on the test set using average of probs
    preds_prob = np.zeros(len(y_test))

    
    # Split the training data into batches, fit a logistic regression model on each batch
    for i in range(num_batches):
        # Calculate the starting and ending indices for the current batch
        start_index = i * batch_size
        end_index = (i + 1) * batch_size
        
        # Create a logistic regression model
        model = LogisticRegression(max_iter=500)
        
        # Select the current batch for training
        X_batch = X_train.iloc[indices[start_index:end_index]]
        y_batch = y_train.iloc[indices[start_index:end_index]]
        
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)
        
        # Fit the model on the current batch
        model.fit(X_batch_scaled, y_batch)
               
        y_pred = model.predict(X_test_scaled)
        # Accumulate the predictions using majority voting
        preds_voting += (y_pred == 1)
    
        # Accumulate the predictions using majority voting
        y_pred = model.decision_function(X_test_scaled)
        preds_logit += y_pred
        
        #Accumulate the probs
        y_pred = model.predict_proba(X_test_scaled)
        preds_prob += y_pred[:,1]
   
    
    accuracy = np.zeros(4)
    
    # Majority voting (selecting the most frequent prediction for each sample)
    final_predictions = np.where(preds_voting > num_batches / 2, 1, 0)
    accuracy[0] = accuracy_score(y_test, final_predictions)
    # Average of logit
    final_predictions = np.where(preds_logit > 0, 1, 0)
    accuracy[1] = accuracy_score(y_test, final_predictions)
    #Average of probs
    final_predictions = np.where(preds_prob / num_batches > 0.5, 1, 0)
    accuracy[2] = accuracy_score(y_test, final_predictions)
    
    # Train a model on all 11 batches of training data
    model = LogisticRegression(max_iter=500)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy[3] = accuracy_score(y_test, y_pred)
    
    return accuracy

Try the divide and conquer approaches 10 times, reporting the following error matrix. 
- 10-by-4
- col_1: majority voting
- col_2: average the logit
- col_3: average probabilities
- col_4: using the model trained on all the training data

In [13]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 4))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i] = iterate_process(X, y)
    seed += 2

# Print the accuracies
print("Accuracies:", accuracies)

Accuracies: [[0.80741667 0.80741667 0.8075     0.80816667]
 [0.80825    0.80891667 0.80891667 0.80941667]
 [0.809      0.8105     0.81058333 0.81158333]
 [0.81191667 0.81175    0.81166667 0.81266667]
 [0.81591667 0.81616667 0.81616667 0.81641667]
 [0.81058333 0.81166667 0.81158333 0.81225   ]
 [0.81016667 0.80908333 0.80891667 0.81016667]
 [0.81125    0.81133333 0.81141667 0.812     ]
 [0.81291667 0.81166667 0.81183333 0.812     ]
 [0.80941667 0.8085     0.8085     0.80833333]
 [0.81066667 0.81066667 0.81058333 0.81041667]
 [0.81025    0.80908333 0.80933333 0.81083333]
 [0.805      0.80541667 0.8055     0.80658333]
 [0.81516667 0.81383333 0.81383333 0.81441667]
 [0.80858333 0.80891667 0.809      0.80958333]
 [0.81       0.80975    0.80975    0.81025   ]
 [0.8065     0.80758333 0.8075     0.8075    ]
 [0.812      0.81041667 0.81025    0.81141667]
 [0.81208333 0.81208333 0.81208333 0.81325   ]
 [0.81266667 0.81225    0.81225    0.81275   ]
 [0.81066667 0.81075    0.81066667 0.81108333]
 

In [14]:
np.mean(accuracies, axis = 0)

array([0.80979333, 0.809755  , 0.80976333, 0.81035833])

In [15]:
np.std(accuracies, axis = 0)

array([0.0029566 , 0.00281101, 0.00281949, 0.00280482])