# Analyze the MNIST DATA (5 vs 8)

## Load necessary libraries

In [85]:
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

## Data

### Load data 
- 70K images from 10 classes
- Each image is 28-by-28, stored in a 784 vector
- lable (y) takes 10 different values

In [86]:
mnist = fetch_openml('mnist_784')

# Extract features (pixel values) and target labels
X = mnist.data.astype('float32')
y = mnist.target.astype('int64')

In [87]:
print(X.head())

   pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   pixel10  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
0      0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   
1      0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   
2      0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   
3      0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   
4      0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   

   pixel781  pixel782  pixel783  pixel784  
0       0.0       

In [62]:
print(y.head())

0    5
1    0
2    4
3    1
4    9
Name: class, dtype: int64


In [63]:
label_counts = np.bincount(y)
label_counts

array([6903, 7877, 6990, 7141, 6824, 6313, 6876, 7293, 6825, 6958],
      dtype=int64)

In [64]:
X.shape

(70000, 784)

Display 12 randomly selected images

### Filter data 
Filter the dataset to include only the digits 5 and 8. 

In [65]:
digit_filter = (y == 5) | (y == 8)
X = X[digit_filter]
y = y[digit_filter]

In [66]:
X.shape

(13138, 784)

In [67]:
np.unique(y)

array([5, 8], dtype=int64)

In [68]:
n5 = np.count_nonzero(y == 5)
n8 = np.count_nonzero(y == 8)

In [69]:
n5, n8

(6313, 6825)

In [70]:
y = y.replace({5 : 1, 8 : 0})
y.value_counts()

0    6825
1    6313
Name: class, dtype: int64

# Logistic Regression

60\% training and 40\% test

Try two different Logistic regression models:
- Logistic regression with all 784 features;
- Losgitic regression with 100 top PCs.

In [71]:
from sklearn.linear_model import LogisticRegression

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create and train the logistic regression model
#model = LogisticRegression()
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Make predictions on the test set
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Calculate accuracy
accuracy = accuracy_score(y_train, y_train_pred)
print("Test Accuracy:", accuracy)
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9747526008627252
Test Accuracy: 0.9490106544901066


In [73]:
y_test_logits = model.decision_function(X_test)
y_test_prob = model.predict_proba(X_test)
print(y_test_logits[:10])
print(y_test_prob[:10])
print(y_test_pred[:10])

[  4.62493822  -4.52236797   4.89404422   5.94546226  12.81172949
   0.48844461 -12.62286427   4.16760299  -1.66570542   7.58588442]
[[9.70907044e-03 9.90290930e-01]
 [9.89253473e-01 1.07465269e-02]
 [7.43536680e-03 9.92564633e-01]
 [2.61085759e-03 9.97389142e-01]
 [2.72857185e-06 9.99997271e-01]
 [3.80260047e-01 6.19739953e-01]
 [9.99996704e-01 3.29578043e-06]
 [1.52530835e-02 9.84746916e-01]
 [8.41002401e-01 1.58997599e-01]
 [5.07308197e-04 9.99492692e-01]]
[1 0 1 1 1 1 0 1 0 1]


In [74]:
1/(1+ np.exp(-y_test_logits[0])), 1/(1+ np.exp(y_test_logits[0]))

(0.9902909295571708, 0.009709070442829145)

PCA + Logistic Regression: use top 100 PCs

In [75]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Create and train the logistic regression model on the PCA-transformed data
model = LogisticRegression(max_iter=500)
model.fit(X_train_pca, y_train)

# Make predictions on the test set
y_test_pred = model.predict(X_test_pca)
y_train_pred = model.predict(X_train_pca)

# Calculate accuracy
accuracy = accuracy_score(y_train, y_train_pred)
print("Test Accuracy:", accuracy)
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9628266937325551
Test Accuracy: 0.9619482496194824


# Divide and Conquer

Divide the training data into 11 batches, train a logistic model on each of the batch, and then combine the 11 prediction results. Consider the following two ensemble methods:
- majority voting
- average (or sum) of the logit output and then make decision based on its sign

In [76]:
import warnings

# Define the number of batches
num_batches = 11

# Calculate the batch size
batch_size = len(X_train) // num_batches

# Initialize an empty list to store the models
models = []

# Suppress warnings
warnings.filterwarnings("ignore")

# Split the training data into batches, fit a logistic regression model on each batch
for i in range(num_batches):
    # Calculate the starting and ending indices for the current batch
    start_index = i * batch_size
    end_index = (i + 1) * batch_size
    
    # Create a logistic regression model
    model = LogisticRegression()
    
    # Select the current batch for training
    X_batch = X_train[start_index:end_index]
    y_batch = y_train[start_index:end_index]
    
    # Fit the model on the current batch
    model.fit(X_batch, y_batch)
    
    # Append the trained model to the list
    models.append(model)

# Make predictions on the test set using majority voting
preds_voting = np.zeros(len(y_test))
# Make predictions on the test set using average of logit
preds_logit = np.zeros(len(y_test))

for model in models:
    # Make predictions using each model
    y_pred = model.predict(X_test)
    # Accumulate the predictions using majority voting
    preds_voting += (y_pred == 8)
    
    # Accumulate the predictions using majority voting
    y_pred = model.decision_function(X_test)
    preds_logit += y_pred
    
# Majority voting (selecting the most frequent prediction for each sample)
final_predictions = np.where(preds_voting > len(models) / 2, 8, 5)
accuracy = accuracy_score(y_test, final_predictions)
print("Majority Voting Accuracy:", accuracy)

# Average of logit
final_predictions = np.where(preds_logit > 0, 8, 5)
accuracy = accuracy_score(y_test, final_predictions)
print("Average Logit Accuracy:", accuracy)

Majority Voting Accuracy: 0.0
Average Logit Accuracy: 0.0


In [77]:
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

## Iteration Code

We need to repeat the process above 100 times and record the corresponding accuracies. Let's write a function of the process above and then call this function 100 times. 

In [78]:
def iterate_process(X, y):
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    
    # Define the number of batches
    num_batches = 11
    
    # Randomly shuffle the data indices
    #indices = np.random.permutation(len(X))
    
    #change 7/12
    indices = np.random.permutation(len(X_train))
    
    # Calculate the batch size
    batch_size = len(X_train) // num_batches
    
    # Make predictions on the test set using majority voting
    preds_voting = np.zeros(len(y_test))
    # Make predictions on the test set using average of logit
    preds_logit = np.zeros(len(y_test))
    #Make predictions on the test set using average of probs
    preds_prob = np.zeros(len(y_test))

    
    # Split the training data into batches, fit a logistic regression model on each batch
    for i in range(num_batches):
        # Calculate the starting and ending indices for the current batch
        start_index = i * batch_size
        end_index = (i + 1) * batch_size
        
        # Create a logistic regression model
        model = LogisticRegression(max_iter=500)
        
        # Select the current batch for training
        X_batch = X_train.iloc[indices[start_index:end_index]]
        y_batch = y_train.iloc[indices[start_index:end_index]]
        
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)
        
        # Fit the model on the current batch
        model.fit(X_batch_scaled, y_batch)
               
        y_pred = model.predict(X_test_scaled)
        # Accumulate the predictions using majority voting
        preds_voting += (y_pred == 1)
    
        # Accumulate the predictions using majority voting
        y_pred = model.decision_function(X_test_scaled)
        preds_logit += y_pred
        
        #Accumulate the probs
        y_pred = model.predict_proba(X_test_scaled)
        preds_prob += y_pred[:,1]
   
    
    accuracy = np.zeros(4)
    
    # Majority voting (selecting the most frequent prediction for each sample)
    final_predictions = np.where(preds_voting > num_batches / 2, 1, 0)
    accuracy[0] = accuracy_score(y_test, final_predictions)
    # Average of logit
    final_predictions = np.where(preds_logit > 0, 1, 0)
    accuracy[1] = accuracy_score(y_test, final_predictions)
    #Average of probs
    final_predictions = np.where(preds_prob / num_batches > 0.5, 1, 0)
    accuracy[2] = accuracy_score(y_test, final_predictions)
    
    # Train a model on all 11 batches of training data
    model = LogisticRegression(max_iter=500)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy[3] = accuracy_score(y_test, y_pred)
    
    return accuracy

In [79]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty vector to store accuracies
accuracies = np.zeros((num_repeats, 4))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i] = iterate_process(X, y)
    seed += 2

# Print the accuracies
print("Accuracies:", accuracies)

Accuracies: [[0.96860731 0.96194825 0.96860731 0.95547945]
 [0.96575342 0.9598554  0.96594368 0.95509893]
 [0.96099696 0.96023592 0.96270928 0.95091324]
 [0.96213851 0.9608067  0.96118721 0.94805936]
 [0.9608067  0.96061644 0.96004566 0.95262557]
 [0.96213851 0.96232877 0.96366058 0.95300609]
 [0.96194825 0.96061644 0.9630898  0.95376712]
 [0.96118721 0.9608067  0.96232877 0.95281583]
 [0.96594368 0.96575342 0.96765601 0.95471842]
 [0.96251903 0.96537291 0.96518265 0.94920091]
 [0.96689498 0.96461187 0.96708524 0.95566971]
 [0.96289954 0.96156773 0.96442161 0.95566971]
 [0.96232877 0.96004566 0.96156773 0.95072298]
 [0.9608067  0.95681126 0.95928463 0.94901065]
 [0.96232877 0.9630898  0.96328006 0.95509893]
 [0.96023592 0.95890411 0.96023592 0.95547945]
 [0.96270928 0.96251903 0.96328006 0.95490868]
 [0.9608067  0.96004566 0.96137747 0.9543379 ]
 [0.96423135 0.96366058 0.96442161 0.95300609]
 [0.96289954 0.96194825 0.96270928 0.95148402]
 [0.96251903 0.96366058 0.9640411  0.94843988]
 

In [80]:
np.mean(accuracies, axis = 0)

array([0.96260274, 0.96145738, 0.96302892, 0.95265982])

In [81]:
np.std(accuracies, axis = 0)

array([0.00244367, 0.00261703, 0.00262762, 0.00301467])

In [82]:
len(X)

13138

In [83]:
len(y)

13138

In [84]:
X.shape

(13138, 784)