In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd 

import matplotlib.pyplot as plt

In [2]:
# Load the training table. The code below relies on a 'label' column and
# selects rows whose label equals each integer in 0..9.
df = pd.read_csv('train.csv')

# Fixed seed so the same rows are drawn on every Restart & Run All.
SAMPLE_SEED = 0
SAMPLES_PER_LABEL = 5

# One small random subset per label value, keyed 'df_0' ... 'df_9'.
# The seed is offset by the label so each subset gets its own random stream.
temp_df = {
    'df_' + str(label): df[df['label'] == label].sample(
        SAMPLES_PER_LABEL, random_state=SAMPLE_SEED + label
    )
    for label in range(10)
}

# Stack the per-label subsets into one frame with a fresh 0..N-1 index.
concat_df = pd.concat(temp_df.values()).reset_index(drop=True)

In [3]:
# Split the sampled frame into features (every column except 'label') and targets.
feature_array = concat_df.drop(columns='label').to_numpy()
label_array = concat_df['label'].to_numpy()

# Features become float32 scaled by 1/255 (presumably 8-bit pixel intensities —
# TODO confirm against the CSV); labels become int64 class indices, the dtype
# F.cross_entropy expects for its target.
X = torch.tensor(feature_array, dtype=torch.float32) / 255.0
Y = torch.tensor(label_array, dtype=torch.long)

In [4]:
import torch.nn.init as init

# Seed the global torch RNG so the random initial weights below are identical
# on every run of the notebook.
torch.manual_seed(42)

# First layer (784 -> 256): small Gaussian weights, zero bias.
W1 = torch.randn(784, 256) * 0.01
b1 = torch.zeros(256)

# Second layer (256 -> 10): Xavier-uniform weights (filled in place), zero bias.
W2 = torch.empty(256, 10)
init.xavier_uniform_(W2)
b2 = torch.zeros(10)

# Collect the trainable tensors and enable gradient tracking on each one.
# This is done exactly once; the original cell repeated these lines verbatim.
parameters = [W1, b1, W2, b2]
for p in parameters:
  p.requires_grad = True

In [5]:
# Per-step history buffers. The training loop appends one entry to each list
# on every iteration, and the plotting cell indexes them by iteration number.
W1_, W2_ = [], []                     # weight snapshots
H_, H_preact_ = [], []                # hidden activations / pre-activations
logits_, logits_gradients = [], []    # logits and d(loss)/d(logits)

In [6]:
# Training hyper-parameters, named so the loop and the log message cannot
# drift apart.
NUM_STEPS = 20000
LEARNING_RATE = 0.1
LOG_EVERY = 1000

# Full-batch gradient descent on a one-hidden-layer tanh network.
# Every step also records a snapshot of weights, activations, logits and the
# gradient w.r.t. the logits into the module-level history lists.
#
# NOTE(review): one W1 snapshot per step is 784*256 float32 values (~0.8 MB at
# the default dtype), i.e. roughly 16 GB for W1_ alone over 20000 steps.
# Consider recording every k-th step if memory is a concern; the plotting cell
# indexes these lists by step, so it would need the same stride.
for i in range(NUM_STEPS):

  # Drop gradients left over from the previous step.
  for p in parameters:
    p.grad = None

  # Forward pass: affine -> tanh -> affine -> cross-entropy.
  H_preact = X @ W1 + b1
  H = torch.tanh(H_preact)
  logits = H @ W2 + b2
  loss = F.cross_entropy(logits, Y)

  # Non-leaf tensors discard .grad after backward unless asked to keep it;
  # logits.grad is recorded below. The leaf parameters keep .grad on their
  # own, so they no longer need retain_grad().
  for j in [H_preact, H, logits]:
      j.retain_grad()
  loss.backward()

  # Gradient-descent update. torch.no_grad() is the supported way to modify
  # leaf tensors in place (instead of going through .data).
  with torch.no_grad():
    for params in parameters:
      params -= LEARNING_RATE * params.grad

  # Record history. detach() before clone() so the stored copies carry no
  # autograd graph reference.
  # W1_/W2_ hold the weights *after* this step's update, while H_, H_preact_
  # and logits_ come from the forward pass *before* it.
  W1_.append(W1.detach().clone())
  H_.append(H.detach().clone())
  H_preact_.append(H_preact.detach().clone())
  W2_.append(W2.detach().clone())
  logits_gradients.append(logits.grad.clone())
  logits_.append(logits.detach().clone())

  if i % LOG_EVERY == 0:
      print(f'iteration {i}/{NUM_STEPS}, loss: {loss.item()}')

iteration 0/20000, loss: 2.3240225315093994
iteration 1000/20000, loss: 0.002541974885389209
iteration 2000/20000, loss: 0.0011715571163222194
iteration 3000/20000, loss: 0.0007481785141862929
iteration 4000/20000, loss: 0.0005452451296150684
iteration 5000/20000, loss: 0.00042691529961302876
iteration 6000/20000, loss: 0.00034976081224158406
iteration 7000/20000, loss: 0.00029560725670307875
iteration 8000/20000, loss: 0.00025559798814356327
iteration 9000/20000, loss: 0.00022485379304271191
iteration 10000/20000, loss: 0.00020052740001119673
iteration 11000/20000, loss: 0.0001808411761885509
iteration 12000/20000, loss: 0.0001645154698053375
iteration 13000/20000, loss: 0.0001508639834355563
iteration 14000/20000, loss: 0.0001392289123032242
iteration 15000/20000, loss: 0.00012920751760248095
iteration 16000/20000, loss: 0.0001204970758408308
iteration 17000/20000, loss: 0.00011284735955996439
iteration 18000/20000, loss: 0.00010608914453769103
iteration 19000/20000, loss: 0.00010007

In [7]:
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Slider over the recorded training steps. The upper bound follows the length
# of the recorded history, so it stays valid if the number of training steps
# changes; it is clamped at 0 so an empty history does not give max < min.
i_widget = widgets.IntSlider(min=0, max=max(len(W1_) - 1, 0), step=1, value=0)

def update_plot(I):
    """Draw a 3x2 dashboard of the training state recorded at step ``I``.

    Panels 1-4 are 50-bin histograms of the flattened W1, hidden
    pre-activations, hidden activations and W2. Panel 5 shows the softmax of
    the recorded logits and panel 6 the recorded gradient with respect to the
    logits, both as grayscale images. The two images are transposed, so rows
    correspond to the output classes and columns to the training samples.

    Args:
        I: Index into the module-level history lists ``W1_``, ``H_preact_``,
            ``H_``, ``W2_``, ``logits_`` and ``logits_gradients``.
    """
    plt.figure(figsize=(15, 10))

    # The four histogram panels differ only in their data source and title.
    histogram_panels = [
        (W1_, 'Histogram of W1'),
        (H_preact_, 'Histogram of H preact'),
        (H_, 'Histogram of H'),
        (W2_, 'Histogram of W2'),
    ]
    for position, (history, title) in enumerate(histogram_panels, start=1):
        plt.subplot(3, 2, position)
        plt.hist(history[I].view(-1).detach(), bins=50)
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.title(title)

    # Panel 5 plots softmax probabilities, not raw logits, so it is labelled
    # as such.
    plt.subplot(3, 2, 5)
    plt.imshow((F.softmax(logits_[I].detach(), dim=1)).T, cmap='gray')
    plt.xlabel(f'Iteration : {I}')
    plt.ylabel('Class')
    plt.title('Softmax Probabilities')

    plt.subplot(3, 2, 6)
    plt.imshow(logits_gradients[I].detach().T, cmap='gray')
    plt.xlabel(f'Iteration : {I}')
    plt.ylabel('Class')
    plt.title('Logits Gradients')

    plt.tight_layout()
    plt.show()

# Bind the slider to the plotting function. The trailing semicolon stops the
# notebook from echoing the returned function object as cell output.
widgets.interact(update_plot, I=i_widget);

interactive(children=(IntSlider(value=0, description='I', max=19999), Output()), _dom_classes=('widget-interac…

<function __main__.update_plot(I)>