# Assignment 5
### Parker Christenson

#### `Customer Segmentation Analysis with Boltzmann Machines Based on Online Retail Shopping Habits`


___
1. Load the Online Retail II dataset and clean the data by removing incomplete entries. 

2. Preprocess the data by encoding categorical data and scaling numerical features to normalize the range of data values.

3. Transform the data into a suitable format where each customer's shopping habits over time are captured in a binary format—purchased or not purchased.

4. Train your Boltzmann machine using the training set with the goal of learning the underlying probability distribution of the data.


In [51]:
# imports

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import polars as pl
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [52]:
# Read first sheet
df1_polars = pl.read_excel('online_retail_II.xlsx')
df1_pandas = df1_polars.to_pandas()

# Read second sheet
df2_polars = pl.read_excel('online_retail_II.xlsx', sheet_name='Year 2010-2011')
df2_pandas = df2_polars.to_pandas()

# Combine both dataframes
df = pd.concat([df1_pandas, df2_pandas])
df.head()



Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434.0,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434.0,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434.0,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434.0,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434.0,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [53]:
# null values
df.isnull().sum()

Invoice         19500
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

In [54]:
# drop all rows with null values
df = df.dropna()

In [55]:
# binary encoding
df['Purchased'] = 1

In [56]:
# pivot the data
pivot_data = df.pivot_table(index='Customer ID', columns='StockCode', values='Purchased', fill_value=0)

In [57]:
# scale
scaler = StandardScaler()
scaled_pivot_data = scaler.fit_transform(pivot_data)

In [58]:
# convert scaled into data frame
scaled_pivot_data = pd.DataFrame(scaled_pivot_data, index=pivot_data.index, columns=pivot_data.columns)

# another check for nulls 
if pd.isna(scaled_pivot_data).any().any():
    raise ValueError("NaN detected in scaled input data")

# df.head 
scaled_pivot_data.head()

StockCode,10002,10080,10109,10120,10123C,10123G,10124A,10124G,10125,10133,...,BANK CHARGES,C2,D,DOT,M,PADS,POST,SP1002,TEST001,TEST002
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,-0.169371,-0.06266,-0.013041,-0.094451,-0.081706,-0.045218,-0.055408,-0.03915,-0.122538,-0.163435,...,-0.056932,-0.087811,-0.029171,-0.013041,-0.285419,-0.052231,-0.271954,-0.018444,38.330797,76.681158
12347.0,-0.169371,-0.06266,-0.013041,-0.094451,-0.081706,-0.045218,-0.055408,-0.03915,-0.122538,-0.163435,...,-0.056932,-0.087811,-0.029171,-0.013041,-0.285419,-0.052231,-0.271954,-0.018444,-0.026089,-0.013041
12348.0,-0.169371,-0.06266,-0.013041,-0.094451,-0.081706,-0.045218,-0.055408,-0.03915,-0.122538,-0.163435,...,-0.056932,-0.087811,-0.029171,-0.013041,-0.285419,-0.052231,3.67709,-0.018444,-0.026089,-0.013041
12349.0,-0.169371,-0.06266,-0.013041,-0.094451,-0.081706,-0.045218,-0.055408,-0.03915,-0.122538,-0.163435,...,-0.056932,-0.087811,-0.029171,-0.013041,-0.285419,-0.052231,3.67709,-0.018444,-0.026089,-0.013041
12350.0,-0.169371,-0.06266,-0.013041,-0.094451,-0.081706,-0.045218,-0.055408,-0.03915,-0.122538,-0.163435,...,-0.056932,-0.087811,-0.029171,-0.013041,-0.285419,-0.052231,3.67709,-0.018444,-0.026089,-0.013041


In [59]:
# convert to tensor 
scaled_pivot_data = scaled_pivot_data.values
scaled_pivot_data = torch.tensor(scaled_pivot_data, dtype=torch.float32)

In [60]:
# define the model 
class RBM(nn.Module):
    def __init__(self, n_visible, n_hidden):
        super(RBM, self).__init__()
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # Xavier Initialization <-- I started to use lots of initalization methods, and I wasnt able to get any of them to work properly
        self.W = nn.Parameter(torch.randn(n_hidden, n_visible) * torch.sqrt(torch.tensor(2.0 / (n_visible + n_hidden))))
        self.h_bias = nn.Parameter(torch.zeros(n_hidden))
        self.v_bias = nn.Parameter(torch.zeros(n_visible))

    def sample_from_p(self, p):
        return torch.bernoulli(p)

    def v_to_h(self, v):
        p_h = torch.sigmoid(torch.matmul(v, self.W.t()) + self.h_bias)
        return p_h, self.sample_from_p(p_h)

    def h_to_v(self, h):
        p_v = torch.sigmoid(torch.matmul(h, self.W) + self.v_bias)
        return p_v, self.sample_from_p(p_v)

    def forward(self, v):
        p_h, h = self.v_to_h(v)
        p_v, v = self.h_to_v(h)
        return v

    def free_energy(self, v):
        v_term = torch.matmul(v, self.v_bias)
        w_x_h = torch.matmul(v, self.W.t()) + self.h_bias
        h_term = torch.sum(torch.log(1 + torch.exp(w_x_h)), dim=1)
        return -v_term - h_term

In [66]:
# define the model
n_visible = scaled_pivot_data.shape[1]
n_hidden = 256
rbm = RBM(n_visible, n_hidden)

# training the model 
n_epochs = 10
batch_size = 64
learning_rate = 0.001  # Further reduced learning rate

# setting the optimizer
optimizer = optim.SGD(rbm.parameters(), lr=learning_rate)

In [67]:

# custom loop with lots of debugging statements
for epoch in range(n_epochs):
    train_loss = 0
    for i in range(0, len(scaled_pivot_data), batch_size):
        batch = scaled_pivot_data[i:i+batch_size]
        if len(batch) != batch_size:
            continue
        
        # postive phase
        v0 = batch
        ph0, h0 = rbm.v_to_h(v0)

        # check for nans in pos phase 
        if torch.isnan(ph0).any() or torch.isnan(h0).any():
            print("NaN detected in positive phase")
            break

        # negative phase 
        vk = v0
        for k in range(1):
            _, hk = rbm.v_to_h(vk)
            _, vk = rbm.h_to_v(hk)

        phk, _ = rbm.v_to_h(vk)

        # check to see if the negative phasae has NaNs
        if torch.isnan(phk).any() or torch.isnan(vk).any():
            print("NaN detected in negative phase")
            break

        positive_phase = torch.matmul(h0.t(), v0)
        negative_phase = torch.matmul(phk.t(), vk)

        # update gradients
        rbm.W.grad = (positive_phase - negative_phase) / batch_size
        rbm.v_bias.grad = torch.sum(v0 - vk, dim=0) / batch_size
        rbm.h_bias.grad = torch.sum(ph0 - phk, dim=0) / batch_size

        # clip the gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(rbm.parameters(), max_norm=1)

        optimizer.step()

        train_loss += torch.mean(rbm.free_energy(v0)) - torch.mean(rbm.free_energy(vk))
    
    # nans in train loss causes loop to eand early
    if torch.isnan(train_loss).any():
        print("NaN detected in train_loss")
        break
    
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {train_loss.item()}')


Epoch 1/10, Loss: 3741.68505859375
Epoch 2/10, Loss: 6398.607421875
Epoch 3/10, Loss: 9253.9833984375
Epoch 4/10, Loss: 12266.40234375
Epoch 5/10, Loss: 15445.40234375
Epoch 6/10, Loss: 18800.091796875
Epoch 7/10, Loss: 22314.486328125
Epoch 8/10, Loss: 25941.3984375
Epoch 9/10, Loss: 29682.236328125
Epoch 10/10, Loss: 33581.9453125
