### Imports and configs

In [1]:
!module load anaconda3
!conda activate transformer
!pip install -r requirements.txt

/bin/bash: conda: command not found
/bin/bash: pip: command not found


In [2]:
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import torch
import numpy as np
from torchtext import datasets


ModuleNotFoundError: No module named 'matplotlib'

### Plot histogram of pad vs. non-pad token counts

In [None]:
# Load the saved tensor and work with it as a NumPy array from here on.
# NOTE(review): torch.load may unpickle arbitrary objects — only load trusted files.
preprocd_data = torch.load("../artifacts/saved_data/preprocd_data.pt").numpy()
sent_pairs, languages, max_len = preprocd_data.shape
# Collapse the first two axes so that every row is a single sentence.
aggregated_data = np.reshape(preprocd_data, (sent_pairs * languages, max_len))

In [None]:
def plot_token_hist(data, pad_token=2):
    """Plot a two-bar chart comparing padding and non-padding token counts.

    Args:
        data: Array of token ids; compared element-wise against ``pad_token``.
        pad_token: Token id counted as padding. Defaults to 2, the value
            this function used to hard-code, so existing calls are unchanged.

    Returns:
        None. The chart is drawn with pyplot and shown via ``plt.show()``.
    """
    pad_count = np.sum(data == pad_token)
    nonpad_count = np.sum(data != pad_token)

    # Two bars: entries equal to the pad id vs. everything else.
    plt.bar([f'Token={pad_token}', f'Token!={pad_token}'], [pad_count, nonpad_count], color=['blue', 'orange'])
    plt.xlabel('Entry')
    plt.ylabel('Count')
    plt.title(f'Histogram of Token={pad_token} and Token!={pad_token}')
    # Keep plain numbers on the y-axis instead of scientific notation.
    plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
    plt.show()

In [None]:
# Pad vs. non-pad token counts over every row of the flattened array.
plot_token_hist(aggregated_data)

In [None]:
# Corpus-wide padding statistics; token id 2 is counted as padding here.
nonpad_token_count = np.sum(preprocd_data != 2)
pad_token_count = np.sum(preprocd_data == 2)
total_token_count = pad_token_count + nonpad_token_count
percent_nonpad = nonpad_token_count / total_token_count * 100
# `.2f` prints two decimals. (`2f` would mean "minimum width 2" with the
# default six decimals.)
print(f"nonpad_token_count: {nonpad_token_count}\n"
      f"pad_token_count: {pad_token_count}\n"
      f"total_token_count: {total_token_count}\n"
      f"percent_nonpad: {percent_nonpad:.2f} %\n")

In [None]:
# Number of entries != 2 in each row of the flattened array (one value per row).
nonpad_sums = np.sum(aggregated_data != 2, axis=1)

In [None]:
# Summary statistics of the per-row non-pad token counts.
longest_sentence = nonpad_sums.max()
shortest_sentence = nonpad_sums.min()
avg_sentence = np.mean(nonpad_sums)
# `.2f` prints two decimals. (`2f` would mean "minimum width 2" with the
# default six decimals.)
print(f"longest_sentence: {longest_sentence} tokens\n"
      f"shortest_sentence: {shortest_sentence} tokens\n"
      f"avg_sentence: {avg_sentence:.2f} tokens\n"
      )

### Non-pad token mask per sentence (sentence-length overview)

In [None]:
def plot_sentence_lens(data, title, pad_token=2):
    """Show a 2-D array of token ids as an image of its non-padding mask.

    Each row of ``data`` becomes one image row; a cell is "on" where the
    token differs from ``pad_token``.

    Args:
        data: 2-D array of token ids (rows are plotted along the y-axis).
        title: Title placed above the image.
        pad_token: Token id treated as padding. Defaults to 2, the value
            this function used to hard-code, so existing calls are unchanged.

    Returns:
        None. The image is drawn with pyplot and shown via ``plt.show()``.
    """
    nonpad_mask = data != pad_token
    # aspect='auto' stretches the image to the axes instead of using square pixels.
    plt.imshow(nonpad_mask, aspect='auto')
    plt.xlabel("Sentence lengths")
    plt.ylabel("Sentence number")
    plt.title(title)
    plt.grid(True)
    plt.show()

In [None]:
# One mask image per language, restricted to the first 10000 sentence pairs.
for lang_idx, title in enumerate(("German sentences", "English sentences")):
    plot_sentence_lens(preprocd_data[:10000, lang_idx, :], title)

##### Histogram of sentence lengths

In [None]:
# Count entries != 2 along the last axis; the first two axes of preprocd_data are kept.
nonpad_token_counts = np.count_nonzero(preprocd_data != 2, axis=2)

In [None]:
# Column 0 is used for German and column 1 for English.
ger_nonpad_token_counts, eng_nonpad_token_counts = (
    nonpad_token_counts[:, 0],
    nonpad_token_counts[:, 1],
)

In [None]:
# Display the German counts for a quick visual check.
ger_nonpad_token_counts

In [None]:
# Side-by-side histograms of per-sentence non-pad token counts (shared y-axis).
fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)
# Unit-wide bins with edges 0..49, shared by both panels.
# NOTE(review): counts above 49 fall outside these bins and are not drawn.
length_bins = range(50)
counts, bins, _ = axs[0].hist(ger_nonpad_token_counts, bins=length_bins)
print(f'counts: {counts}')
axs[1].hist(eng_nonpad_token_counts, bins=length_bins)
axs[0].set_title('German non-pad token counts')
axs[1].set_title('English non-pad token counts')
axs[0].set_xlabel('Non-pad token counts')
axs[0].set_ylabel(f'Number of sentences (total {len(ger_nonpad_token_counts)})')
axs[1].set_xlabel('Non-pad token counts')
axs[1].set_ylabel(f'Number of sentences (total {len(eng_nonpad_token_counts)})')
# Render the figure; an argument-less plt.plot() draws nothing and only
# echoes an empty list as the cell output.
plt.show()

In [None]:
def read_bleu_history(path):
    """Parse per-epoch training and validation BLEU scores from a log file.

    Only lines containing "Epoch:" are used. Each such line is split on "|";
    the training score is the text after "BLEU: " in the second field and the
    validation score is the text after "BLEU: " in the third field.

    Args:
        path: Path of the training-history text file.

    Returns:
        A ``(training_bleu, validation_bleu)`` pair of float lists with one
        entry per matching line, in file order.

    Raises:
        ValueError / IndexError: if a matching line does not have the
            expected fields.
    """
    training_bleu, validation_bleu = [], []
    with open(path) as f:
        for line in f:  # stream the file rather than loading every line first
            if "Epoch:" not in line:
                continue
            parts = line.split("|")
            tbleu = float(parts[1].split("BLEU: ")[-1].strip())
            vbleu = float(parts[2].split("BLEU: ")[-1].strip())
            training_bleu.append(tbleu)
            validation_bleu.append(vbleu)
    return training_bleu, validation_bleu


training_bleu_N1, validation_bleu_N1 = read_bleu_history('training_history_N1.txt')


In [None]:
# Display the parsed training BLEU values.
training_bleu_N1

In [None]:
# Training vs. validation BLEU for the N1 history, one point per parsed epoch line.
plt.figure()
plt.plot(training_bleu_N1, label='training_bleu_N1')
plt.plot(validation_bleu_N1, label='validation_bleu_N1')
# Label the figure so it can be read on its own.
plt.xlabel('Epoch')
plt.ylabel('BLEU')
plt.title('BLEU scores (N1)')
plt.legend()
# Render explicitly so the cell output is the figure, not the Legend repr.
plt.show()

In [None]:
# Collect the per-epoch BLEU scores recorded in the N6 training history.
training_bleu_N6 = []
validation_bleu_N6 = []
with open('training_history_N6.txt') as f:
    lines = f.readlines()
    for line in lines:
        # Only epoch summary lines carry BLEU scores.
        if "Epoch:" not in line:
            continue
        parts = line.split("|")
        # Field 1 holds the training score, field 2 the validation score.
        tbleu, vbleu = [float(parts[idx].split("BLEU: ")[-1].strip()) for idx in (1, 2)]
        training_bleu_N6.append(tbleu)
        validation_bleu_N6.append(vbleu)

In [None]:
# Training vs. validation BLEU for the N6 history, one point per parsed epoch line.
plt.figure()
plt.plot(training_bleu_N6, label='training_bleu_N6')
plt.plot(validation_bleu_N6, label='validation_bleu_N6')
# Label the figure so it can be read on its own.
plt.xlabel('Epoch')
plt.ylabel('BLEU')
plt.title('BLEU scores (N6)')
plt.legend()
# Render explicitly so the cell output is the figure, not the Legend repr.
plt.show()

In [None]:
# Load IWSLT2016 through torchtext.datasets with its default arguments.
x = datasets.IWSLT2016()

In [None]:
# Display the object returned by datasets.IWSLT2016().
x

In [None]:
from datasets import load_dataset

In [None]:
# Load the 'de-en' configuration of WMT14 with load_dataset.
d = load_dataset('wmt14', 'de-en')

In [None]:
# Inspect the container type returned for the test split's 'translation' column.
type(d['test']['translation'])

In [None]:
# Peek at the first entry of the test split's 'translation' column.
for x in d['test']['translation']:
    print(x)
    break  # stop after the first element

In [None]:
# Short handles for the training, validation and test splits.
t, v, te = (d[split_name] for split_name in ('train', 'validation', 'test'))

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Wrap the training split in a DataLoader using its default arguments.
dl = DataLoader(t)

In [None]:
# Peek at the first item yielded by the DataLoader.
for x in dl:
    print(x)
    break  # stop after the first element

In [None]:
# Display the training split object.
t

In [None]:
# Flatten all three splits into one list of sentence-pair tuples, in the order
# train, validation, test. Iterating split by split avoids first building a
# concatenated copy of the three 'translation' columns and does not rely on
# the column objects supporting `+`.
rd = [
    tuple(sentence_pair.values())
    for split in (t, v, te)
    for sentence_pair in split['translation']
]

In [None]:
# Show the first two sentence-pair tuples.
rd[:2]

In [None]:
# Peek at the first element of rd.
for elem in rd:
    print(elem)
    break  # stop after the first element

In [None]:
from torchtext.data.functional import to_map_style_dataset

In [None]:
# Peek at the first sentence pair and its two sides.
for sentence_pair in rd:
    print(sentence_pair)
    # The elements of `rd` are tuples (built with tuple(....values())), so
    # they are unpacked directly; a tuple has no `.values()` method.
    src_sentence, tgt_sentence = sentence_pair
    print(src_sentence)
    print(tgt_sentence)
    break  # stop after the first element

In [None]:
# Convert the list of sentence-pair tuples with torchtext's to_map_style_dataset.
data_map = to_map_style_dataset(rd)

In [None]:
# Peek at the first element of the map-style dataset.
for elem in data_map:
    print(elem)
    break  # stop after the first element

### Image grid of BLEU-score plots

In [16]:
!python3 -m pip install -U pip
!python3 -m pip install -U matplotlib

[33mCache entry deserialization failed, entry ignored[0m
Collecting pip
  Downloading https://files.pythonhosted.org/packages/a4/6d/6463d49a933f547439d6b5b98b46af8742cc03ae83543e4d7688c2420f8b/pip-21.3.1-py3-none-any.whl (1.7MB)
[K    100% |████████████████████████████████| 1.7MB 931kB/s  eta 0:00:01
[?25hInstalling collected packages: pip
[31mException:
Traceback (most recent call last):
  File "/usr/lib/python3.6/site-packages/pip/basecommand.py", line 215, in main
    status = self.run(options, args)
  File "/usr/lib/python3.6/site-packages/pip/commands/install.py", line 365, in run
    strip_file_prefix=options.strip_file_prefix,
  File "/usr/lib/python3.6/site-packages/pip/req/req_set.py", line 789, in install
    **kwargs
  File "/usr/lib/python3.6/site-packages/pip/req/req_install.py", line 854, in install
    strip_file_prefix=strip_file_prefix
  File "/usr/lib/python3.6/site-packages/pip/req/req_install.py", line 1069, in move_wheel_files
    strip_file_prefix=strip_file_

In [13]:
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.axes_grid1 import ImageGrid

# Root directory under which the per-run folders live (presumably an MLflow
# experiment directory — TODO confirm). A run ID from path_list and `tail`
# are appended to it when the images are read. Note the trailing slash.
base_path = "mlruns/726058913638390818/"
# Location of the BLEU-score plot relative to a run folder.
tail = "artifacts/loss_curves/N1/bleu_scores.png"

ModuleNotFoundError: No module named 'matplotlib'

In [8]:
# Run IDs (folder names under base_path) whose plots are compared.
# Each ID appears once: "75b42a8a00bf4fbda738a472a5d33e00" used to be listed a
# second time at the end of the list, which would show the same run twice.
# NOTE(review): if that second entry was meant to be a different run, add it here.
path_list = ["3b668ba089684d4fbf0f4ba0218fc279",
             "4e2e4ffa41d740ac81d60dd92a5cd164",
             "05a55969bf2f4029909d6af976276454",
             "6b2e8f5822d94a4f9d7909c5a5bdee10",
             "6dd43ca3e3e342e689358d9570a35f27",
             "6e1978659b894671942ac83b286296dc",
             "7b82a2307dea41baa2a163eb6db115d7",
            #  "8e10ac6a03b44acf899bedf508859dd3",  # run left disabled
             "016be894ef7640249f02431bb87873af",
             "46c192e0fade468a9dedbbe9dca8cda6",
             "75b42a8a00bf4fbda738a472a5d33e00",
             "0163f06d0e6c4d93bfbab8407413ee84",
             "0661b6e66b7846d295aadfbce2495d19",
             ]

In [9]:
# Tile the BLEU-score plots of the first runs in path_list into a 2x2 grid.
grid_shape = (2, 2)
n_images = grid_shape[0] * grid_shape[1]
# base_path may end with "/"; strip it so the joined path has no "//".
run_root = base_path.rstrip("/")
images = [plt.imread(f"{run_root}/{run_id}/{tail}") for run_id in path_list[:n_images]]

fig = plt.figure(figsize=(4., 4.))
grid = ImageGrid(fig, 111,  # similar to subplot(111)
                 nrows_ncols=grid_shape,  # creates 2x2 grid of axes
                 axes_pad=0.1,  # pad between axes in inch.
                 )

for ax, im in zip(grid, images):
    # Iterating over the grid returns the Axes.
    ax.imshow(im)

plt.show()


NameError: name 'plt' is not defined