# Calculating perplexity

In [1]:
import numpy as np

In [2]:
# Seed NumPy's legacy global random generator so any random draws are repeatable.
# NOTE(review): none of the visible cells draw random numbers — confirm the seed is still needed.
np.random.seed(32)

In [3]:
# Read the saved model outputs and the matching target ids from the working
# directory, in that order.
predictions, targets = map(np.load, ('predictions.npy', 'targets.npy'))

In [4]:
# Report both array shapes, one per line.
print(
    f'predictions has shape: {predictions.shape}',
    f'targets has shape: {targets.shape}',
    sep='\n',
)

predictions has shape: (32, 64, 256)
targets has shape: (32, 64)


In [6]:
# Vector of values over the last axis for the first position of the first sequence.
predictions[0, 0]

array([-15.579997 , -25.735575 , -15.576893 , -15.575438 , -15.578    ,
       -15.5671   , -15.569961 , -15.577178 , -15.577527 , -16.853582 ,
       -15.579068 , -15.572912 , -15.583867 , -15.582075 , -15.572544 ,
       -15.590691 , -15.580892 , -15.564224 , -15.581494 , -15.575191 ,
       -15.580459 , -15.570125 , -15.570762 , -15.573637 , -15.569896 ,
       -15.563946 , -15.577015 , -15.578933 , -15.570032 , -15.569778 ,
       -15.577312 , -15.583919 , -14.783871 , -19.375536 , -15.56463  ,
       -15.583172 , -16.378674 , -15.570613 , -10.78342  ,  -4.743763 ,
        -6.0541244, -26.079489 , -15.568203 , -15.573162 , -22.60177  ,
        -8.983564 , -23.188797 , -15.576623 , -15.369985 ,  -7.5411735,
        -8.133133 ,  -8.58906  , -11.534666 , -12.234286 , -12.028591 ,
       -11.054028 , -12.349669 , -11.166466 , -30.400703 , -28.643908 ,
       -15.573648 , -15.583812 , -15.5819   , -21.409838 , -15.566822 ,
        -2.3704128,  -3.0276651,  -2.9461555,  -3.9008489,  -4.5

In [8]:
# Target ids of the first sequence; zeros are treated as padding further down
# (see the non_pad mask).
targets[0]

array([105, 110,  32, 115, 117,  99, 104,  32, 100, 105, 115, 100,  97,
       105, 110, 102, 117, 108,  32, 109,  97, 110, 110, 101, 114,  32,
       109, 101,  32, 116, 111,  32, 119, 111, 111,  46,   1,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
      dtype=int32)

## Commentary

The predictions tensor has shape (32, 64, 256): there are 32 sequences, each with 64 token positions. At every position there is a vector of 256 log probabilities, one per vocabulary entry, so the value at index i is the model's log probability that the token at that position is vocabulary item i.

The targets tensor has a different shape, (32, 64): the same 32 sequences of 64 positions, but each entry is a single integer between 0 and 255 giving the vocabulary index of the correct token. To align with the format of the predictions, we will convert these indices into one-hot vectors of length 256. In each one-hot vector, the position corresponding to the token's index is set to 1 and every other position is 0. Multiplying the predictions by these one-hot vectors and summing over the last axis then selects the log probability assigned to each correct token.

In [12]:
# One-hot encode the targets. Row i of an identity matrix is the one-hot vector
# for index i, so indexing the identity with `targets` appends a one-hot axis
# to every target id.
# The vocabulary size is read from the last axis of `predictions` instead of
# being hard-coded, so the one-hot width always matches the predictions.
reshaped_targets = np.eye(predictions.shape[-1])[targets]

In [13]:
# Sanity check: the one-hot targets should now have the same shape as predictions.
reshaped_targets.shape

(32, 64, 256)

In [14]:
# The one-hot mask zeroes every entry except the target's, so summing over the
# last axis leaves exactly the value predicted for each target token.
log_p = (predictions * reshaped_targets).sum(axis=-1)

In [24]:
# Inspect the per-token values selected by the one-hot targets (padding still included).
log_p

array([[ -5.39654493,  -1.03111839,  -0.66916656, ..., -22.37672997,
        -23.18770981, -21.84348297],
       [ -4.58577061,  -1.13412857,  -8.53803253, ..., -20.15686035,
        -26.83709717, -23.57501984],
       [ -5.22238874,  -1.28241444,  -0.17312431, ..., -21.328228  ,
        -19.85441208, -33.88444138],
       ...,
       [ -5.39654493, -17.29168129,  -4.36076593, ..., -20.82580185,
        -21.06583786, -22.44311523],
       [ -5.93131638, -14.24741745,  -0.26373291, ..., -26.74324799,
        -18.38433075, -22.35527802],
       [ -5.67053604,  -0.10595131,   0.        , ..., -23.33252335,
        -28.08737564, -23.87880707]])

In [15]:
# Inspect the raw target ids before building the padding mask.
targets

array([[105, 110,  32, ...,   0,   0,   0],
       [ 97, 110, 110, ...,   0,   0,   0],
       [111, 102,  32, ...,   0,   0,   0],
       ...,
       [105,  32,  97, ...,   0,   0,   0],
       [101, 100, 103, ...,   0,   0,   0],
       [121, 111, 117, ...,   0,   0,   0]], dtype=int32)

In [16]:
# Boolean array that is True wherever the target id is 0.
targets == 0

array([[False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True]])

In [17]:
# Float mask: 1.0 where the target id is non-zero, 0.0 where it is 0.
non_pad = (targets != 0).astype(np.float64)

In [18]:
# Inspect the mask: 1.0 marks positions to keep, 0.0 marks positions whose target id is 0.
non_pad

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [19]:
# Print the mask's shape followed by its contents.
print(
    f'non_pad has shape: {non_pad.shape}\n',
    f'non_pad looks like this: \n\n {non_pad}',
    sep='\n',
)

non_pad has shape: (32, 64)

non_pad looks like this: 

 [[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]]


In [20]:
# log_p must have the same shape as non_pad for the element-wise masking below.
log_p.shape

(32, 64)

Multiplying the log probabilities element-wise by the non_pad mask zeroes out the padded positions, so padding no longer affects the metric:

In [21]:
# Element-wise masking: positions where non_pad is 0 contribute nothing to later sums.
real_log_p = non_pad * log_p
print(f'real log probabilities still have shape: {real_log_p.shape}')

real log probabilities still have shape: (32, 64)


In [44]:
# Inspect the masked values; entries at masked-out positions are now zero.
real_log_p

array([[ -5.39654493,  -1.03111839,  -0.66916656, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -4.58577061,  -1.13412857,  -8.53803253, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -5.22238874,  -1.28241444,  -0.17312431, ...,  -0.        ,
         -0.        ,  -0.        ],
       ...,
       [ -5.39654493, -17.29168129,  -4.36076593, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -5.93131638, -14.24741745,  -0.26373291, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -5.67053604,  -0.10595131,   0.        , ...,  -0.        ,
         -0.        ,  -0.        ]])

In [22]:
# Show the values before and after masking so the effect of non_pad is visible.
print(
    f'log probabilities before filtering padding: \n\n {log_p}\n',
    f'log probabilities after filtering padding: \n\n {real_log_p}',
    sep='\n',
)

log probabilities before filtering padding: 

 [[ -5.39654493  -1.03111839  -0.66916656 ... -22.37672997 -23.18770981
  -21.84348297]
 [ -4.58577061  -1.13412857  -8.53803253 ... -20.15686035 -26.83709717
  -23.57501984]
 [ -5.22238874  -1.28241444  -0.17312431 ... -21.328228   -19.85441208
  -33.88444138]
 ...
 [ -5.39654493 -17.29168129  -4.36076593 ... -20.82580185 -21.06583786
  -22.44311523]
 [ -5.93131638 -14.24741745  -0.26373291 ... -26.74324799 -18.38433075
  -22.35527802]
 [ -5.67053604  -0.10595131   0.         ... -23.33252335 -28.08737564
  -23.87880707]]

log probabilities after filtering padding: 

 [[ -5.39654493  -1.03111839  -0.66916656 ...  -0.          -0.
   -0.        ]
 [ -4.58577061  -1.13412857  -8.53803253 ...  -0.          -0.
   -0.        ]
 [ -5.22238874  -1.28241444  -0.17312431 ...  -0.          -0.
   -0.        ]
 ...
 [ -5.39654493 -17.29168129  -4.36076593 ...  -0.          -0.
   -0.        ]
 [ -5.93131638 -14.24741745  -0.26373291 ...  -0.        

In [23]:
# For each sequence, sum the masked values and divide by that sequence's count
# of kept positions, then negate and average across sequences.
log_ppx = np.mean(
    -(real_log_p.sum(axis=1) / non_pad.sum(axis=1))
)
print(f'The log perplexity and perplexity of the model are respectively: {log_ppx} and {np.exp(log_ppx)}')

The log perplexity and perplexity of the model are respectively: 2.6211854987065033 and 13.752016923578548


In [25]:
# Inspect the full predictions array (NumPy abbreviates the display).
predictions

array([[[-15.579997, -25.735575, -15.576893, ..., -15.574669,
         -15.571493, -15.569425],
        [-24.01082 , -35.80076 , -23.743649, ..., -23.807941,
         -23.727554, -23.804428],
        [-15.783699, -14.416848, -15.512791, ..., -15.729168,
         -15.671564, -15.53212 ],
        ...,
        [-22.37673 , -29.096514, -22.266487, ..., -22.157543,
         -22.212416, -22.285917],
        [-23.18771 , -39.62314 , -23.07188 , ..., -23.058746,
         -22.928747, -23.131004],
        [-21.843483, -26.035233, -21.877586, ..., -21.576801,
         -21.74238 , -21.694439]],

       [[-15.579997, -25.735575, -15.576893, ..., -15.574669,
         -15.571493, -15.569425],
        [-15.887024, -16.101957, -15.914328, ..., -15.740339,
         -15.764511, -15.746195],
        [-17.759518, -19.134003, -17.479977, ..., -17.778797,
         -17.484093, -17.56089 ],
        ...,
        [-20.15686 , -29.839993, -20.06406 , ..., -20.013279,
         -20.045275, -20.078325],
        [-26

In [26]:
# Prediction vector for sequence 0, position 0 — the left operand of the
# one-hot product checked below.
predictions[0, 0]

array([-15.579997 , -25.735575 , -15.576893 , -15.575438 , -15.578    ,
       -15.5671   , -15.569961 , -15.577178 , -15.577527 , -16.853582 ,
       -15.579068 , -15.572912 , -15.583867 , -15.582075 , -15.572544 ,
       -15.590691 , -15.580892 , -15.564224 , -15.581494 , -15.575191 ,
       -15.580459 , -15.570125 , -15.570762 , -15.573637 , -15.569896 ,
       -15.563946 , -15.577015 , -15.578933 , -15.570032 , -15.569778 ,
       -15.577312 , -15.583919 , -14.783871 , -19.375536 , -15.56463  ,
       -15.583172 , -16.378674 , -15.570613 , -10.78342  ,  -4.743763 ,
        -6.0541244, -26.079489 , -15.568203 , -15.573162 , -22.60177  ,
        -8.983564 , -23.188797 , -15.576623 , -15.369985 ,  -7.5411735,
        -8.133133 ,  -8.58906  , -11.534666 , -12.234286 , -12.028591 ,
       -11.054028 , -12.349669 , -11.166466 , -30.400703 , -28.643908 ,
       -15.573648 , -15.583812 , -15.5819   , -21.409838 , -15.566822 ,
        -2.3704128,  -3.0276651,  -2.9461555,  -3.9008489,  -4.5

In [27]:
# One-hot vector for the target at sequence 0, position 0.
reshaped_targets[0, 0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [28]:
# Element-wise product for a single position: only the entry at the target's
# index can be non-zero.
predictions[0, 0] * reshaped_targets[0, 0]

array([-0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.  

In [29]:
# Buffer for re-deriving log_p with explicit loops, one slot per
# (sequence, position). The shape is taken from `predictions` — the same source
# the filling loop iterates over — instead of a hard-coded (32, 64).
test = np.zeros(predictions.shape[:2])

In [37]:
# Explicit-loop version of the vectorised log_p computation: walk every
# (sequence, position) pair and sum that position's prediction vector
# multiplied by its one-hot target.
for sequence_index, (seq_predictions, seq_one_hot) in enumerate(zip(predictions, reshaped_targets)):
    for token_index, (token_predictions, token_one_hot) in enumerate(zip(seq_predictions, seq_one_hot)):
        test[sequence_index, token_index] = (token_predictions * token_one_hot).sum()

In [38]:
# Inspect the loop-built values; they are meant to be compared with log_p.
test

array([[ -5.39654493,  -1.03111839,  -0.66916656, ..., -22.37672997,
        -23.18770981, -21.84348297],
       [ -4.58577061,  -1.13412857,  -8.53803253, ..., -20.15686035,
        -26.83709717, -23.57501984],
       [ -5.22238874,  -1.28241444,  -0.17312431, ..., -21.328228  ,
        -19.85441208, -33.88444138],
       ...,
       [ -5.39654493, -17.29168129,  -4.36076593, ..., -20.82580185,
        -21.06583786, -22.44311523],
       [ -5.93131638, -14.24741745,  -0.26373291, ..., -26.74324799,
        -18.38433075, -22.35527802],
       [ -5.67053604,  -0.10595131,   0.        , ..., -23.33252335,
        -28.08737564, -23.87880707]])

In [35]:
# Apply the same non_pad mask to the loop-built array so it can be compared with real_log_p.
test = non_pad * test

In [36]:
# Inspect the masked loop-built values.
test

array([[ -5.39654493,  -1.03111839,  -0.66916656, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -4.58577061,  -1.13412857,  -8.53803253, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -5.22238874,  -1.28241444,  -0.17312431, ...,  -0.        ,
         -0.        ,  -0.        ],
       ...,
       [ -5.39654493, -17.29168129,  -4.36076593, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -5.93131638, -14.24741745,  -0.26373291, ...,  -0.        ,
         -0.        ,  -0.        ],
       [ -5.67053604,  -0.10595131,   0.        , ...,  -0.        ,
         -0.        ,  -0.        ]])

In [39]:
# Look at the target ids again before spot-checking one masked position.
targets

array([[105, 110,  32, ...,   0,   0,   0],
       [ 97, 110, 110, ...,   0,   0,   0],
       [111, 102,  32, ...,   0,   0,   0],
       ...,
       [105,  32,  97, ...,   0,   0,   0],
       [101, 100, 103, ...,   0,   0,   0],
       [121, 111, 117, ...,   0,   0,   0]], dtype=int32)

In [40]:
# Target ids of the sequence at index 3, used for the spot check below.
targets[3]

array([107, 105, 110, 103,  32,  99, 108,  97, 117, 100, 105, 117, 115,
         9, 112,  97, 114, 116,  32, 116, 104, 101, 109,  59,  32, 116,
       104, 101, 121,  32,  97, 114, 101,  32, 105, 110,  99, 101, 110,
       115, 101, 100,  46,   1,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
      dtype=int32)

In [41]:
# Target id at sequence 3, position 50.
targets[3, 50]

0

In [42]:
# One-hot vector for that same position; its 1 sits at the index given by the target id.
reshaped_targets[3, 50]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.