# Transformers

In this example we implement a multiple head attension sub-layer in a transformer encoder and perform natural language processing (NLP) tasks using a transformer-based model.

Import packages

In [1]:
import math
import numpy as np
import torch
import torch.nn as nn
from scipy.special import softmax
from torch.nn.functional import cosine_similarity

# Multiple Head Attention Sub-Layer

Initialize Input Data

In [2]:
np.random.seed(0) # Do not remove this line

d_model = 512
m_inputs = 3

x = np.random.rand(m_inputs, d_model)

print('x:', x)
print('x.shape:', x.shape)

x: [[0.5488135  0.71518937 0.60276338 ... 0.44613551 0.10462789 0.34847599]
 [0.74009753 0.68051448 0.62238443 ... 0.6204999  0.63962224 0.9485403 ]
 [0.77827617 0.84834527 0.49041991 ... 0.07382628 0.49096639 0.7175595 ]]
x.shape: (3, 512)


# Create Matrices for Query, Key, and Value

In [4]:
n_heads = 8
d_k = d_model // n_heads

# Create an empty tensor W with the correct dimensions
W = torch.empty((d_model, d_k))

# Create Query matrix
torch.manual_seed(0)
# Randomly initialize the values in the tensor
nn.init.xavier_uniform_(W)
# Copy to numpy array
W_query = W.data.numpy()
# Calculate query matrix
Q = x @ W_query

# Repeat for Key matrix
torch.manual_seed(1)
W = torch.empty((d_model, d_k))
nn.init.xavier_uniform_(W)
W_key = W.data.numpy()
K = x @ W_key

# Repeat for Value matrix
torch.manual_seed(2)
W = torch.empty((d_model, d_k))
nn.init.xavier_uniform_(W)
W_value = W.data.numpy()
V = x @ W_value

print('W_query[0,:5]:', W_query[0,:5])
print('W_query.shape:', W_query.shape)
print('Q[0, :5]:', Q[0,:5])
print('Q.shape:', Q.shape)
print('K[0,:5]', K[0,:5])
print('K.shape', K.shape)
print('V[0,:5]', V[0,:5])
print('V.shape', V.shape)

W_query[0,:5]: [-0.00076412  0.05475055 -0.0840017  -0.07511146 -0.03930965]
W_query.shape: (512, 64)
Q[0, :5]: [-0.22772415  0.48167861  1.48693408 -1.00410576  0.19323685]
Q.shape: (3, 64)
K[0,:5] [ 0.2283654  -0.65482728 -0.07202067  0.49886374  0.57045028]
K.shape (3, 64)
V[0,:5] [-0.44997754  0.92097362 -0.76932045  0.03289757 -0.49462588]
V.shape (3, 64)


# Compute Attention Scores and Weighted Output

Attention scores are calculated with the following formula:
\begin{equation}
Attention(Q, K) = softmax(\frac{Q\cdot K^T}{\sqrt{d_k}})
\end{equation}

in which $\sqrt{d_k}$ is used for normalization purposes.

In [5]:
attn_scores = (Q @ K.T) / math.sqrt(d_k)

# Normalize attention scores
attn_scores_norm = softmax(attn_scores, axis=1)

# Test
print('attn_scores.shape:', attn_scores.shape)
print('Unnormalized attn_scores:', attn_scores)
print('Normalized attn_scores:', attn_scores_norm)

attn_scores.shape: (3, 3)
Unnormalized attn_scores: [[-0.75497307 -0.97036233 -0.85112729]
 [ 0.23777018 -0.70730381 -0.37639239]
 [ 0.21608578 -0.73905372 -0.89881112]]
Normalized attn_scores: [[0.36838498 0.29700212 0.33461289]
 [0.51820328 0.20140013 0.2803966 ]
 [0.58387084 0.22464925 0.19147991]]


# Compute Weighted Output

In [6]:
weighted_output = attn_scores_norm @ V

# Test
print('weighted_output[0,:5]:', weighted_output[0,:5])
print('weighted_output.shape:', weighted_output.shape)

weighted_output[0,:5]: [-0.37040031  0.493314   -0.78595572  0.09711595 -0.33551551]
weighted_output.shape: (3, 64)


# Transformer-Based NLP Tasks

Install `transformers` package

In [7]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize Inputs

In [8]:
text = """The hotness of the sun and the coldness of the outer space are inexhaustible thermodynamic
resources for human beings. From a thermodynamic point of view, any energy conversion systems
that receive energy from the sun and/or dissipate energy to the universe are heat engines with
photons as the "working fluid" and can be analyzed using the concept of entropy. While entropy
analysis provides a particularly convenient way to understand the efficiency limits, it is typically
taught in the context of thermodynamic cycles among quasi-equilibrium states and its
generalization to solar energy conversion systems running in a continuous and non-equilibrium
fashion is not straightforward. In this educational article, we present a few examples to illustrate
how the concept of photon entropy, combined with the radiative transfer equation, can be used to
analyze the local entropy generation processes and the efficiency limits of different solar energy
conversion systems. We provide explicit calculations for the local and total entropy generation
rates for simple emitters and absorbers, as well as photovoltaic cells, which can be readily
reproduced by students. We further discuss the connection between the entropy generation and the
device efficiency, particularly the exact spectral matching condition that is shared by infinitejunction photovoltaic cells and reversible thermoelectric materials to approach their theoretical
efficiency limit."""

encoded_input = tokenizer(text, return_tensors='pt')

print(len(text.split()))
print(encoded_input['input_ids'].shape)

211
torch.Size([1, 275])


# Output Word Vectors from BERT

In [10]:
output = model(**encoded_input)

last_hidden_state = output['last_hidden_state']

print(last_hidden_state.shape)

input_ids_pt = encoded_input['input_ids']
input_ids_list = input_ids_pt.tolist()[0]
input_tokens = tokenizer.convert_ids_to_tokens(input_ids_list)

print(input_ids_list[:10])
print(input_tokens[:10])

torch.Size([1, 275, 768])
[101, 1996, 2980, 2791, 1997, 1996, 3103, 1998, 1996, 3147]
['[CLS]', 'the', 'hot', '##ness', 'of', 'the', 'sun', 'and', 'the', 'cold']


# Find Output Vectors That Correspond to "entropy"

In [11]:
vectors = []
for i, token in enumerate(input_tokens):
    if token == "entropy":
        vectors.append(last_hidden_state[0][i])

print('Number of "entropy":', len(vectors))

matches = [torch.allclose(vectors[i], vectors[i+1]) for i in range(len(vectors)-1)]
print(f'Do they have the same value? {matches}')

Number of "entropy": 6
Do they have the same value? [False, False, False, False, False]


# Obtain Sentence Vectors from BERT

In [12]:
sentences = text.replace('\n', ' ').split('.')
sentences = [s.strip() + '.' for s in sentences if len(s.strip())>0] # Some cleaning work

print(f'Resulting in {len(sentences)} sentences:')
print(sentences)

Resulting in 6 sentences:
['The hotness of the sun and the coldness of the outer space are inexhaustible thermodynamic resources for human beings.', 'From a thermodynamic point of view, any energy conversion systems that receive energy from the sun and/or dissipate energy to the universe are heat engines with photons as the "working fluid" and can be analyzed using the concept of entropy.', 'While entropy analysis provides a particularly convenient way to understand the efficiency limits, it is typically taught in the context of thermodynamic cycles among quasi-equilibrium states and its generalization to solar energy conversion systems running in a continuous and non-equilibrium fashion is not straightforward.', 'In this educational article, we present a few examples to illustrate how the concept of photon entropy, combined with the radiative transfer equation, can be used to analyze the local entropy generation processes and the efficiency limits of different solar energy conversion 

# Tokenize Example Sentences

In [13]:
encoded_sentences = tokenizer(sentences, padding=True, return_tensors='pt')

print(encoded_sentences['input_ids'].shape)
print(encoded_sentences['input_ids'][0,:])
print(encoded_sentences['input_ids'][1,:])

torch.Size([6, 57])
tensor([  101,  1996,  2980,  2791,  1997,  1996,  3103,  1998,  1996,  3147,
         2791,  1997,  1996,  6058,  2686,  2024,  1999, 10288, 13821,  3775,
         3468,  1996, 10867,  7716, 18279,  7712,  4219,  2005,  2529,  9552,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])
tensor([  101,  2013,  1037,  1996, 10867,  7716, 18279,  7712,  2391,  1997,
         3193,  1010,  2151,  2943,  7584,  3001,  2008,  4374,  2943,  2013,
         1996,  3103,  1998,  1013,  2030,  4487, 18719, 17585,  2943,  2000,
         1996,  5304,  2024,  3684,  5209,  2007, 26383,  2015,  2004,  1996,
         1000,  2551,  8331,  1000,  1998,  2064,  2022, 16578,  2478,  1996,
         4145,  1997, 23077,  1012,   102,     0,     0])


# Obtain Output Tensors for All Input Sentences

In [15]:
outputs = model(**encoded_sentences)

print(outputs['last_hidden_state'].shape)

# Note that the first dimension of model output is batch size
print(outputs['last_hidden_state'][0].shape)

torch.Size([6, 57, 768])
torch.Size([57, 768])


# Represent Meaning of Sentence Using Input Token [CLS]

In [16]:
CLS_vec = outputs['last_hidden_state'][0][0]
print(CLS_vec.shape)

torch.Size([768])


# Compute Cosine Similarity Between Sentences

Cosine similarity can be used to determine semantic similarity between statements and sentences.

In [17]:
for i in range(5):
    for j in range(i+1, 6):
        sim = cosine_similarity(outputs['last_hidden_state'][i][0], outputs['last_hidden_state'][j][0], dim=0).item()

        print(f'{i} <-> {j}: {sim}')

0 <-> 1: 0.8591639995574951
0 <-> 2: 0.7771981358528137
0 <-> 3: 0.7985227108001709
0 <-> 4: 0.7754685878753662
0 <-> 5: 0.8052164316177368
1 <-> 2: 0.876341700553894
1 <-> 3: 0.832162082195282
1 <-> 4: 0.823844850063324
1 <-> 5: 0.8492752313613892
2 <-> 3: 0.8241375684738159
2 <-> 4: 0.8598626852035522
2 <-> 5: 0.8579832315444946
3 <-> 4: 0.9018083810806274
3 <-> 5: 0.929144024848938
4 <-> 5: 0.9185266494750977


In [19]:
# Print sentences with largest cosine similarity
print(sentences[3])
print(sentences[5])

In this educational article, we present a few examples to illustrate how the concept of photon entropy, combined with the radiative transfer equation, can be used to analyze the local entropy generation processes and the efficiency limits of different solar energy conversion systems.
We further discuss the connection between the entropy generation and the device efficiency, particularly the exact spectral matching condition that is shared by infinitejunction photovoltaic cells and reversible thermoelectric materials to approach their theoretical efficiency limit.


# Summarize Text

In [20]:
from transformers import pipeline

summarizer = pipeline("summarization", device='cuda')
# summarizer = pipeline("summarization")

print(summarizer(text, max_length=150, min_length=30))

# my turn
test_text = """ GPUs, or Graphics Processing Units, are important pieces of hardware originally designed for rendering computer graphics, primarily for games and movies. However, in recent years, GPUs have gained recognition for significantly enhancing the speed of computational processes involving neural networks.

GPUs now play a pivotal role in the artificial intelligence revolution, predominantly driving rapid advancements in deep learning, computer vision, and large language models, among others. """
print()
print(summarizer(test_text, max_length=50, min_length=30))

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'summary_text': ' The hotness of the sun and the coldness of outer space are inexhaustible thermodynamic resources for human beings . From a thermodynamic point of view, any energy conversion systems that receive energy from the sun or dissipate energy to the universe are heat engines with photons as the "working fluid"'}]

[{'summary_text': ' Graphics Processing Units are important pieces of hardware originally designed for rendering computer graphics, primarily for games and movies . In recent years, they have gained recognition for significantly enhancing the speed of computational processes involving neural networks .'}]


# Perform Sentiment Analysis

In [21]:
# sentiment_classifier = pipeline("sentiment-analysis")
sentiment_classifier = pipeline("sentiment-analysis", device='cuda')
text2 = "I love using transformers library for natural language processing!"

# Perform sentiment classification
result = sentiment_classifier(text2)

# Output the result
print(result)

text3 = "I didn't like the movie. It was boring"

result = sentiment_classifier(text3)

# Output the result
print(result)

my_text = "This lab wasn't extremely difficult, but I learned a lot about how to apply pretrained models."

result = sentiment_classifier(my_text)

print(result)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9984171390533447}]
[{'label': 'NEGATIVE', 'score': 0.999295711517334}]
[{'label': 'POSITIVE', 'score': 0.9930394291877747}]
