In [None]:
# !pip install datasets

In [None]:
# !pip install xgboost lightgbm catboost

In [None]:
# !pip install minisom #allows you to implement SOM in python/pytorch

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import time
# import timm # library with pretrained models
from datasets import load_dataset
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import Dataset
from minisom import MiniSom
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import warnings
from sklearn.neural_network import BernoulliRBM #built in sklearn RBM with bernoulli units (both visible and hidden units are binary, so need to make sure data is normalized)


This script contains code to perform dimensionality reduction techniques:
1. Self Organizing Map 
2. Restricted Bolzman Machine 
3. Variational Auto Encoder 


- Fashion mnist data set
- each image is 28x28 pixels (784 pixels total)

In [None]:
#define normalization transforms so that data is between 0 and 1 (required for RBM and VAE)
#images are grayscale, they have a single channel, so normalize them with mean and standard deviation to scale pixel values between 0 and 1
transform = transforms.Compose([transforms.ToTensor()])

In [None]:
#load train set and test set of data
#these have been transformed and normalized so can just directly call these variables for the different dim reduction techniques
train_set = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_set = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)


In [None]:
print(train_set[0]) #data now between 0 and 1

1) Apply SOM, RBM, and one arbitrary Autoencoders (e.g. VAE) to it to reduce the dimension of the original data.


2) For each dim reduction technique
- Experiment with three classifier algorithms (XGBoost, LightGBM and CATBoost) and run them on the four datasets.
- Three dataset includes the original dataset, SOM, RBM, and one Autoencoder.
- Then compare the results. For classification comparison, you can use accuracy.


First do each classifier on the original dataset (un-reduced)

In [None]:
#first need to flatten data since images are 28x28
#this flattens them into 784 dimensional vectors
def flatten_data(data_loader):
    flattened_data = []
    for images, labels in data_loader:
        for image in images:
            flattened_data.append(image.numpy().ravel())
    return np.array(flattened_data)

train_loader = DataLoader(train_set, batch_size=1, shuffle=False)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

flattened_train_data = flatten_data(train_loader)
flattened_test_data = flatten_data(test_loader)

# Extract labels
train_labels = train_set.targets.numpy()
test_labels = test_set.targets.numpy()

In [None]:
#XGB on original data
# split the flattened data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(flattened_train_data, train_labels, test_size=0.2)

og_xgb_start = time.time()
#was taking forever to train, so potentially adjust the hyper paramters

# initialize and train xgb
xgb_model = xgb.XGBClassifier(n_estimators=50,max_depth=3) #taking forever so lower number of estimators from 100 to 50, reduce depth to 3
xgb_model.fit(X_train, Y_train)
og_xgb_accuracy = xgb_model.score(X_test, Y_test)
og_xgb_end = time.time()
og_xgb_run = og_xgb_end - og_xgb_start
print(f"Original Data XGB run time: {og_xgb_run} seconds")
print(f"Original Data XGBoost Accuracy: {og_xgb_accuracy}")


Original Data XGB run time: 212.8142101764679 seconds
Original Data XGBoost Accuracy: 0.8768333333333334


In [None]:
#LightGBM on original data
#tweaked params and now it runs in 5 minutes (ish)
X_train, X_test, Y_train, Y_test = train_test_split(flattened_train_data, train_labels, test_size=0.2)

og_lgb_start = time.time()
og_lgb_model = lgb.LGBMClassifier(verbose=-1,n_estimators=50,max_depth=3,subsample=.5) # verbose = -1 should suppress the flood of warnings
og_lgb_model.fit(X_train, Y_train)
og_lgb_accuracy = og_lgb_model.score(X_test, Y_test)
og_lgb_end = time.time()
og_lgb_run = og_lgb_end - og_lgb_start
#print accuracy and duration
print(f"Original Data LGBM run time: {og_lgb_run} seconds")
print(f"Original Data LightGBM Accuracy: {og_lgb_accuracy}")

Original Data LGBM run time: 91.79607462882996 seconds
Original Data LightGBM Accuracy: 0.8630833333333333


In [None]:
#CatBoost on original data
X_train, X_test, Y_train, Y_test = train_test_split(flattened_train_data, train_labels, test_size=0.2)

og_cb_start = time.time()
cb_model = cb.CatBoostClassifier(verbose=0, n_estimators=50,max_depth=3) #doesnt support subsample param
cb_model.fit(X_train, Y_train)
og_cb_accuracy = cb_model.score(X_test, Y_test)
og_cb_end = time.time()
og_cb_run = og_cb_end - og_cb_start
print(f"Original data catboost run time: {og_cb_run} seconds")
print(f" Original Data CatBoost Accuracy: {og_cb_accuracy}")

Original data catboost run time: 71.28506112098694 seconds
 Original Data CatBoost Accuracy: 0.8323333333333334


In [None]:
#create df for original dataset models and accuracy
og_dic = {"OG Model":["XGBoost",'Light GBM',"CatBoost"],"Accuracy":[og_xgb_accuracy,og_lgb_accuracy,og_cb_accuracy],"Duration":[og_xgb_run,og_lgb_run,og_cb_run]}
rbm_df = pd.DataFrame(og_dic)
print(rbm_df)

    OG Model  Accuracy    Duration
0    XGBoost  0.876833  212.814210
1  Light GBM  0.863083   91.796075
2   CatBoost  0.832333   71.285061


Self organizing map (SOM)
- used to reduce dimensionality of data
- output of som is a topographical map which represnts data in lower dimension

In [None]:
# initialize and train the SOM
#miniSom pretrained model
som = MiniSom(6, 6, 784, sigma=0.3, learning_rate=0.5)
som.train(flattened_train_data, num_iteration=100)

# now use SOM to transform data
#time the SOM
som_transform_start = time.time()
def transform_with_som(som, data):
    transformed_data = [som.winner(vec) for vec in data]
    return np.array(transformed_data)

som_x_train_transformed = transform_with_som(som, flattened_train_data)
som_x_test_transformed = transform_with_som(som, flattened_test_data)
som_transform_end = time.time()
som_transform_run = som_transform_end - som_transform_start
print(f"som data transform training duration {som_transform_run}")

som data transform training duration 5.172396183013916


In [None]:
# split som transformed data for XGB model
X_train, X_test, Y_train, Y_test = train_test_split(som_x_train_transformed, train_labels, test_size=0.2)

som_xgb_start = time.time()
# initialize and train xgb
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, Y_train)
som_xgb_accuracy = xgb_model.score(X_test, Y_test)
som_xgb_end = time.time()
som_xgb_run = som_xgb_end - som_xgb_start
#print accuracy and duration
print(f"SOM XGB run time: {som_xgb_run} seconds")
print(f"XGBoost Accuracy: {som_xgb_accuracy}")

SOM XGB run time: 6.8260111808776855 seconds
XGBoost Accuracy: 0.329


In [None]:
#lgb on som
X_train, X_test, Y_train, Y_test = train_test_split(som_x_train_transformed, train_labels, test_size=0.2)

som_lgb_start = time.time()
lgb_model = lgb.LGBMClassifier(verbose=-1) # verbose = -1 should suppress warnings once it's done
lgb_model.fit(X_train, Y_train)
som_lgb_accuracy = lgb_model.score(X_test, Y_test)
som_lgb_end = time.time()
som_lgb_run = som_lgb_end - som_lgb_start
#print accuracy and duration
print(f"SOM LGBM run time: {som_lgb_run} seconds")
print(f"LightGBM Accuracy: {som_lgb_accuracy}")

SOM LGBM run time: 4.239447593688965 seconds
LightGBM Accuracy: 0.32508333333333334


In [None]:
#catboost on som
X_train, X_test, Y_train, Y_test = train_test_split(som_x_train_transformed, train_labels, test_size=0.2)

som_cb_start = time.time()
cb_model = cb.CatBoostClassifier(verbose=0)
cb_model.fit(X_train, Y_train)
som_cb_accuracy = cb_model.score(X_test, Y_test)
som_cb_end = time.time()
som_cb_run = som_cb_end - som_cb_start
print(f"SOM catboost run time: {som_cb_run} seconds")
print(f"CatBoost Accuracy: {som_cb_accuracy}")

SOM catboost run time: 64.07290768623352 seconds
CatBoost Accuracy: 0.32675


In [None]:
#store som model accuraccies in dictionary
som_dic = {"SOM Model":["XGBoost",'Light GBM',"CatBoost"],"Accuracy":[som_xgb_accuracy,som_lgb_accuracy,som_cb_accuracy],"Duration":[som_xgb_run,som_lgb_run,som_cb_run]}
som_df = pd.DataFrame(som_dic)
print(som_df)

   SOM Model  Accuracy   Duration
0    XGBoost  0.329000   6.826011
1  Light GBM  0.325083   4.239448
2   CatBoost  0.326750  64.072908


Restricted Bolzman Machine (RBM)
- no connection between neruons of the same layer
- but there are connections from each invisible neuron to each hidden neuron and vice versa
- used sklearn built in bernoulli RBM, menaing both visbile and hidden units are binary (have states of 0 or 1). Use it on binary or normalized data (I need to make sure the data transform includes normalization)

In [None]:
#initialize bernoulli rbm on normalized/transformed data now between 0 and 1
#this one takes a minute to train (time it)
rbm_transform_start = time.time()
rbm = BernoulliRBM(n_components=100, learning_rate=0.01, n_iter=5, verbose=True)
#train rbm
rbm.fit(flattened_train_data)
rbm_transform_end = time.time()
rbm_transform_run = rbm_transform_end - rbm_transform_start
print(f"rbm data transform training duration {rbm_transform_run}")

[BernoulliRBM] Iteration 1, pseudo-likelihood = -251.85, time = 12.87s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -237.06, time = 13.97s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -224.46, time = 13.42s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -216.23, time = 11.55s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -208.86, time = 13.41s
rbm data transform training duration 66.23604893684387


In [None]:
print(flattened_train_data.shape)

In [None]:
#dim reduction of data using trained RBM
rbm_x_train_transformed = rbm.transform(flattened_train_data)
rbm_x_test_transformed = rbm.transform(flattened_test_data)

In [None]:
#implement xgb on rbm transformed data
X_train, X_test, Y_train, Y_test = train_test_split(rbm_x_train_transformed, train_labels, test_size=.2)

rbm_xgb_start = time.time()
# initialize and train xgb on rbm data
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, Y_train)
rbm_xgb_accuracy = xgb_model.score(X_test, Y_test)
rbm_xgb_end = time.time()
rbm_xgb_run = rbm_xgb_end - rbm_xgb_start
#print accuracy and duration
print(f"RBM XGB run time: {rbm_xgb_run} seconds")
print(f"RBM XGBoost Accuracy: {rbm_xgb_accuracy}")

RBM XGB run time: 67.01226925849915 seconds
RBM XGBoost Accuracy: 0.8704166666666666


In [None]:
#lightgbm on rbm data
X_train, X_test, Y_train, Y_test = train_test_split(rbm_x_train_transformed, train_labels, test_size=0.2)

rbm_lgb_start = time.time()
lgb_model = lgb.LGBMClassifier(verbose=-1) # verbose = -1 should suppress the flood of warnings
lgb_model.fit(X_train, Y_train)
rbm_lgb_accuracy = lgb_model.score(X_test, Y_test)
rbm_lgb_end = time.time()
rbm_lgb_run = rbm_lgb_end - rbm_lgb_start
print(f"RBM LGBM run time: {rbm_lgb_run} seconds")
print(f"RBM LightGBM Accuracy: {rbm_lgb_accuracy}")

RBM LGBM run time: 50.01668334007263 seconds
RBM LightGBM Accuracy: 0.8630833333333333


In [None]:
#Catboost on rbm data
#just from running I can tell this will probably be the model with longest duration
X_train, X_test, Y_train, Y_test = train_test_split(rbm_x_train_transformed, train_labels, test_size=0.2)

#start
rbm_cb_start = time.time()
cb_model = cb.CatBoostClassifier(verbose=0)
cb_model.fit(X_train, Y_train)
rbm_cb_accuracy = cb_model.score(X_test, Y_test)
rbm_cb_end = time.time()
rbm_cb_run = rbm_cb_end - rbm_cb_start
#print accuracy and duration
print(f"RBM catboost run time: {rbm_cb_run} seconds")
print(f"RBM CatBoost Accuracy: {rbm_cb_accuracy}")

RBM catboost run time: 995.4395568370819 seconds
RBM CatBoost Accuracy: 0.8709166666666667


In [None]:
#store rbm accuraccies in dictionary
rbm_dic = {"RBM Model":["XGBoost",'Light GBM',"CatBoost"],"Accuracy":[rbm_xgb_accuracy,rbm_lgb_accuracy,rbm_cb_accuracy],"Duration":[rbm_xgb_run,rbm_lgb_run,rbm_cb_run]}
rbm_df = pd.DataFrame(rbm_dic)
print(rbm_df)

   RBM Model  Accuracy    Duration
0    XGBoost  0.870417   67.012269
1  Light GBM  0.863083   50.016683
2   CatBoost  0.870917  995.439557


Variational Auto Enconder (VAE)
 - takes distributions of the orginal dataset and reconstruct the dataset from these distributions (instead of directly from input data)
 - does not assign a single value to each feature, rather assigns a probability
 - good for high dimensional or noisy data

In [None]:
#define VAE class
class VAE(nn.Module):
    def __init__(self):
        super(VAE, self).__init__()

        # encoder part to compress data into embedding (latent representation)
        self.fc1 = nn.Linear(784, 400) #transforms input vector of 784 to latent/hidden layer size of 400
        self.fc21 = nn.Linear(400, 20)  #outputs the mean vector of the latent distribution
        self.fc22 = nn.Linear(400, 20)  #outputs the log variance vector of latent distribution

        # decoder part to reconstruct input data from latent representation
        #these fully connected layers that upscale the data from the 20 dim latent vector back to the orignal 784 input size
        self.fc3 = nn.Linear(20, 400)
        self.fc4 = nn.Linear(400, 784)


    def encode(self, x):
      #applies the first part of the encoder and uses that output to compute mean and log variance vectors
        h1 = torch.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
      #performs reparamaterization trick so model can backpropogate through random samples (from latent distribution)
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
      #applies the decoder part of the VAE to reconstruct input data from latent representation
        h3 = torch.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3)) #makes sure values are range of 0 to 1

    #forward pass
    def forward(self, x):
        mu, logvar = self.encode(x.view(-1, 784))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar


In [None]:
#train the vae
#this one took awhile to train
# define loss function
num_epochs = 1
def vae_loss_function(recon_x, x, mu, logvar):
    BCE = torch.nn.functional.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

vae_transform_start = time.time()
# initialize the vae and optimizer
vae = VAE()
optimizer = optim.Adam(vae.parameters(), lr=1e-3)

# define training loop
for epoch in range(num_epochs):
    for batch_idx, (data, _) in enumerate(train_loader):
        optimizer.zero_grad()
        recon_batch, mu, logvar = vae(data)
        loss = vae_loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        optimizer.step()

vae_transform_end = time.time()
vae_transform_run = vae_transform_end - vae_transform_start
print(f"vae data transform training duration {vae_transform_run}")

vae data transform training duration 828.7021872997284


In [None]:
#now use encoder the portion of vae to trasnform data
def transform_with_vae(vae, data_loader):
    vae.eval()
    transformed_data = []
    with torch.no_grad():
        for data, _ in data_loader:
            mu, logvar = vae.encode(data.view(-1, 784))
            z = vae.reparameterize(mu, logvar)
            transformed_data.extend(z.numpy())
    return np.array(transformed_data)

vae_x_train_transformed = transform_with_vae(vae, train_loader)
vae_x_test_transformed = transform_with_vae(vae, test_loader)


In [None]:
#now input vae transformed data into xgb
X_train, X_test, Y_train, Y_test = train_test_split(vae_x_train_transformed, train_labels, test_size=.2)

vae_xgb_start = time.time()
# initialize and train xgb on vae transformed data
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, Y_train)
vae_xgb_accuracy = xgb_model.score(X_test, Y_test)
vae_xgb_end = time.time()
vae_xgb_run = vae_xgb_end - vae_xgb_start
# Print accuracy scores
print(f"VAE XGB run time: {vae_xgb_run} seconds")
print(f"VAE XGBoost Accuracy: {vae_xgb_accuracy}")

VAE XGB run time: 13.416758298873901 seconds
VAE XGBoost Accuracy: 0.7863333333333333


In [None]:
#vae lightgbm
X_train, X_test, Y_train, Y_test = train_test_split(vae_x_train_transformed, train_labels, test_size=0.2)

vae_lgb_start = time.time()
# initialize and train lgb on vae transformed data
lgb_model = lgb.LGBMClassifier(verbose=-1) # verbose = -1 should suppress the flood of warnings
lgb_model.fit(X_train, Y_train)
vae_lgb_accuracy = lgb_model.score(X_test, Y_test)
vae_lgb_end = time.time()
vae_lgb_run = vae_lgb_end - vae_lgb_start
print(f"VAE LGBM run time: {vae_lgb_run} seconds")
print(f"VAE LightGBM Accuracy: {vae_lgb_accuracy}")

VAE LGBM run time: 11.01946234703064 seconds
VAE LightGBM Accuracy: 0.7949166666666667


In [None]:
#vae catboost
X_train, X_test, Y_train, Y_test = train_test_split(vae_x_train_transformed, train_labels, test_size=0.2)

vae_cb_start = time.time()
# initialize and train catboost on vae transformed data
cb_model = cb.CatBoostClassifier(verbose=0)
cb_model.fit(X_train, Y_train)
vae_cb_accuracy = cb_model.score(X_test, Y_test)
vae_cb_end = time.time()
vae_cb_run = vae_cb_end - vae_cb_start
print(f"VAE catboost run time: {vae_cb_run} seconds")
print(f"VAE CatBoost Accuracy: {vae_cb_accuracy}")

VAE catboost run time: 203.94560265541077 seconds
VAE CatBoost Accuracy: 0.7929166666666667


In [None]:
#create vae dic
vae_dic = {"VAE Model":["XGBoost",'Light GBM',"CatBoost"],"Accuracy":[vae_xgb_accuracy,vae_lgb_accuracy,vae_cb_accuracy],"Duration":[vae_xgb_run,vae_lgb_run,vae_cb_run]}
vae_df = pd.DataFrame(vae_dic)
print(vae_df)

   VAE Model  Accuracy    Duration
0    XGBoost  0.786333   13.416758
1  Light GBM  0.794917   11.019462
2   CatBoost  0.792917  203.945603


In [None]:
#create dic for all the dim reduction model training duration
training_dic = {"Model":['SOM',"RBM","VAE"],"Training Duration":[som_transform_run,rbm_transform_run,vae_transform_run]}
training_df = pd.DataFrame(training_dic)
print(training_df)

  Model  Training Duration
0   SOM           5.172396
1   RBM          66.236049
2   VAE         828.702187
