In [None]:
import altair as alt
import json as json
import numpy as np
import pandas as pd

In [None]:
# Lift Altair's default row limit: the heatmap tables built in this notebook
# hold one row per cell and would otherwise be rejected as too large.
alt.data_transformers.disable_max_rows()

# [BASICS] Embeddings

In [None]:
def read_data(encoder_file, decoder_file):
    '''
    Load the encoder and the decoder dump of one sentence from the data/ folder.

    Input:
        - encoder_file - name of the JSON file inside data/ with the encoder dump [string]
        - decoder_file - name of the JSON file inside data/ with the decoder dump [string]

    Output:
        - encoder - parsed content of encoder_file
        - decoder - parsed content of decoder_file

    Raises:
        - OSError if a file cannot be opened
        - json.JSONDecodeError if a file does not contain valid JSON
    '''
    def _load(file_name):
        # JSON text is UTF-8; be explicit so the result does not depend on the
        # platform's default encoding (the dumps may contain non-ASCII tokens).
        with open('data/' + file_name, encoding='utf-8') as f:
            return json.load(f)

    # Same order as before: encoder file first, then decoder file
    return _load(encoder_file), _load(decoder_file)

In [None]:
encoder, decoder = read_data('EncoderOutWrite.json', 'DecoderOutWrite.json')

# first drafts and trials of Embeddings
encoder_out = np.array(encoder['encoder_out'])
encoder_embedding = np.array(encoder['encoder_embedding'])
encoder_states = np.array(encoder['encoder_states'])

# Keep index 0 of the middle axis (presumably the batch axis -- TODO confirm),
# which leaves a 2-D matrix to draw as a heatmap.
z = encoder_out[:,0,:]

# Size the grid from the matrix itself instead of hard-coding 7 x 1024, so a
# dump with another sentence length / embedding size still lines up with z.
n_rows, n_cols = z.shape
x, y = np.meshgrid(range(n_cols), range(n_rows))

# Long format: one row per heatmap cell
source1 = pd.DataFrame({'x': x.ravel(),
                        'y': y.ravel(),
                        'z': z.ravel()})

draft_embeddings = alt.Chart(source1).mark_rect().encode(
    x='x:O',
    y='y:O',
    color='z:Q'
).properties(
    width=800,
    height=200
)
draft_embeddings

In [None]:
# Encoder / decoder dumps of the three sentences compared below
# (they differ in a single word: carrot vs steak, eats vs chops).
encoder_carrot, decoder_carrot = read_data('EncoderOutWrite_carrot.json', 'DecoderOutWrite_carrot.json')
encoder_steak, decoder_steak = read_data('EncoderOutWrite_steak.json', 'DecoderOutWrite_steak.json')
encoder_chops, decoder_chops = read_data('EncoderOutWrite_chops.json', 'DecoderOutWrite_chops.json')

In [None]:
# Encoder output of each sentence as an array. The comments under each line give
# the tokenised English input followed by its French counterpart; "@@" marks a
# token that continues into the next one (presumably sub-word units -- TODO confirm).
encoder_out_carrot = np.array(encoder_carrot['encoder_out'])
# My m@@ um e@@ ats a car@@ rot
# Ma mère m@@ ange une car@@ otte
encoder_out_steak = np.array(encoder_steak['encoder_out'])
# My m@@ um e@@ ats a ste@@ ak
# Ma mère m@@ ange un ste@@ ak
encoder_out_chops = np.array(encoder_chops['encoder_out'])
# My m@@ um ch@@ ops a car@@ rot
# Ma mère cô@@ tel@@ ée une car@@ otte

In [None]:
def emb_plotter(x, y, sentence, english, tag, dim):
    '''
    Build the long-format table and the heatmap for the embeddings of one sentence.

    Input:
        - x, y - column / row index grids, e.g. from
                 np.meshgrid(range(dim), range(len(english))) [np.array]
        - sentence - the values of the embeddings, one row per entry of english [np.array]
        - english - the input tokens / words [list]
        - tag - a tag / word to identify the dataset and sentence [string]
        - dim - number of embedding values plotted per token / word [int]
    
    Output:
        - source - the processed dataset [pd.DataFrame]
        - emb  - the resulting plot [alt.Chart]
    '''

    # Data preparation: one row per heatmap cell
    row_index = y.ravel()
    source = pd.DataFrame({'x': x.ravel(),
                           # label of the heatmap row each cell belongs to
                           'name_y': np.array([english[i] for i in row_index]),
                           'y': row_index,
                           'Value': sentence.ravel(),
                           'name': list(english) * dim,
                           'tag': tag
                           })

    # The rows are drawn in sentence order. A sort array only needs each label
    # once; the previous list repeated every token `dim` times, which bloats the
    # chart specification without changing the order.
    token_order = list(dict.fromkeys(english))

    # Plot
    emb = alt.Chart(source).mark_rect().encode(
        x=alt.X('x:O', title='First '+ str(dim) +' values of the embeddings',
                axis = alt.Axis(tickWidth=0,labelAngle=0, titleColor="white")),
        y=alt.Y('name_y:O', sort=token_order, title = 'Word',
            axis=alt.Axis(
                    tickBand= "extent", 
                    title= "English token", 
                    titleColor="white",
                    tickWidth=0, 
                    labelAlign="right", 
            )),
        color=alt.Color('Value', scale=alt.Scale(scheme='darkgreen'), legend=None),
        tooltip=['Value']
    ).properties(
        width=400,
        height=175,
    ).interactive()

    return source, emb

In [None]:
dim = 150
english_carrot = "My mum eats a carrot".split(' ')
x, y = np.meshgrid(range(dim), range(5))

# First `dim` values of the 8 tokens "My m@@ um e@@ ats a car@@ rot"
z = encoder_out_carrot[:8,0,:dim]

# One vector per word: a word split into two tokens is the mean of both tokens
my_carrot = z[0]
mum_carrot = (z[1] + z[2]) / 2
eats_carrot = (z[3] + z[4]) / 2
a_carrot = z[5]
carrot_carrot = (z[6] + z[7]) / 2

sentence_carrot = np.vstack([my_carrot, mum_carrot, eats_carrot, a_carrot, carrot_carrot])

source_carrot, plot_carrot = emb_plotter(x, y, sentence_carrot, english_carrot, 'carrot', dim)
plot_carrot

In [None]:
english_steak = "My mum eats a steak".split(' ')
x, y = np.meshgrid(range(dim), range(5))

# First `dim` values of the 8 tokens "My m@@ um e@@ ats a ste@@ ak"
z = encoder_out_steak[:8,0,:dim]

# One vector per word: a word split into two tokens is the mean of both tokens
my_steak = z[0]
mum_steak = (z[1] + z[2]) / 2
eats_steak = (z[3] + z[4]) / 2
a_steak = z[5]
steak_steak = (z[6] + z[7]) / 2

sentence_steak = np.vstack([my_steak, mum_steak, eats_steak, a_steak, steak_steak])

source_steak, plot_steak = emb_plotter(x, y, sentence_steak, english_steak, 'steak', dim)
plot_steak

In [None]:
english = "My mum eats a (carrot-steak)".split(' ')
x, y = np.meshgrid(range(dim), range(5))
z = encoder_out_carrot[:8,0,:dim]

# Word-by-word difference between the "carrot" and the "steak" sentence
sentence = np.vstack([
    carrot_vec - steak_vec
    for carrot_vec, steak_vec in (
        (my_carrot, my_steak),
        (mum_carrot, mum_steak),
        (eats_carrot, eats_steak),
        (a_carrot, a_steak),
        (carrot_carrot, steak_steak),
    )
])

source_sub_steak, plot_sub_steak = emb_plotter(x, y, sentence, english, 'carrot-steak', dim)
plot_sub_steak

In [None]:
english_chops = "My mum chops a carrot".split(' ')

# Build the index grid in this cell, sized from the word list, instead of
# silently reusing whatever x, y an earlier cell left behind.
x, y = np.meshgrid(range(0, dim), range(0, len(english_chops)))

# One vector per word: a word split into two tokens ("m@@ um", "ch@@ ops",
# "car@@ rot") is the mean of both tokens.
my_chops = encoder_out_chops[0,0,0:dim]
mum_chops = (encoder_out_chops[1,0,0:dim] + encoder_out_chops[2,0,0:dim]) / 2
chops_chops = (encoder_out_chops[3,0,0:dim] + encoder_out_chops[4,0,0:dim]) / 2
a_chops = encoder_out_chops[5,0,0:dim]
carrot_chops = (encoder_out_chops[6,0,0:dim] + encoder_out_chops[7,0,0:dim])/ 2

sentence_chops = np.vstack((my_chops, mum_chops, chops_chops,a_chops, carrot_chops))

source_chops, plot_chops = emb_plotter(x, y, sentence_chops, english_chops,'chops', dim)
plot_chops

In [None]:
english_sub_chops = "My mum (eats-chops) a carrot".split(' ')

# Build the index grid in this cell, sized from the word list, instead of
# silently reusing whatever x, y an earlier cell left behind.
x, y = np.meshgrid(range(0, dim), range(0, len(english_sub_chops)))

# Word-by-word difference between the "eats" and the "chops" sentence
sentence_sub_chops = np.vstack((my_carrot - my_chops, mum_carrot - mum_chops, eats_carrot - chops_chops, a_carrot - a_chops, carrot_carrot - carrot_chops))

source_sub_chops, plot_sub_chops = emb_plotter(x, y, sentence_sub_chops, english_sub_chops, 'eats-chops', dim)
plot_sub_chops

In [None]:
# One long table holding every sentence; the 'tag' column tells them apart
source = pd.concat([source_carrot, source_steak, source_chops, source_sub_steak, source_sub_chops])
# Running row number, used as the sort key of the y axis so the labels keep
# the order in which they appear in the table
source['idx'] = np.arange(len(source))

# Drop-down that selects which sentence (tag) is displayed; starts on 'carrot'
input_dropdown = alt.binding_select(options=['carrot', 'steak', 'chops', 'carrot-steak', 'eats-chops'])  
selection = alt.selection_single(fields=['tag'], bind=input_dropdown, name='sentence', init={'tag':'carrot'})
# The former `color` condition was removed: it was never passed to the chart
# and referred to a field 'z' that `source` does not contain.


dark_embeddings = alt.Chart(source).mark_rect().encode(
    x=alt.X('x:O', title='First '+ str(dim) +' values of the embeddings',
            axis = alt.Axis(
                tickWidth=0,
                labelAngle=0, 
                titleColor="white",
                titleY=45,
            )),
    y=alt.Y('name_y:O', sort=alt.SortField('idx',order="ascending"), title = 'Word',
        axis=alt.Axis(
                tickBand= "extent", 
                title= "English word", 
                titleColor="white",
                tickWidth=0, 
                labelAlign="right", 
                offset=0,
                titleX=-75
        )),
    color=alt.Color('Value', scale=alt.Scale(scheme='darkgreen')),
    tooltip=['Value']
).properties(
    width=450,
    height=256,
).interactive(
).add_selection(
    selection
).transform_filter(
    # only the rows of the selected sentence are drawn
    selection
).configure(
    # configure() must stay first: it replaces the whole chart config, the
    # configure_*() calls below then add to it
    background='#333'
).configure_legend(
    titleColor='white',
    labelColor='white'
).configure_axisLeft(
    labelColor='white'
).configure_axis(
    labelColor='white'
)

alt.renderers.set_embed_options(
    padding={"left": 25, "right": 0, "bottom": 2, "top": 10}
)

dark_embeddings

# [PAPER] Embeddings

In [None]:
# Font sizes shared by the figures prepared for the paper
font_size = 12
title_size = 16
legend_size = 16

alt.renderers.set_embed_options(
    padding={"left": 25, "right": 0, "bottom": 2, "top": 10}
)

# Axis styling, kept apart from the encoding so the chart definition stays short
paper_x_axis = alt.Axis(
    tickWidth=0,
    labelAngle=0,
    titleY=45,
    labelFontSize=font_size,
    titleFontSize=title_size,
)
paper_y_axis = alt.Axis(
    tickBand="extent",
    title="English word",
    tickWidth=0,
    labelAlign="right",
    offset=0,
    titleX=-85,
    labelFontSize=font_size,
    titleFontSize=title_size,
)

# Same heatmap as the dark version, on the default (light) background;
# it reuses `source` and `selection` from the cell that builds the dark chart.
light_embeddings = (
    alt.Chart(source)
    .mark_rect()
    .encode(
        x=alt.X('x:O', title=f'First {dim} values of the embeddings', axis=paper_x_axis),
        y=alt.Y('name_y:O', sort=alt.SortField('idx', order="ascending"), title='Word',
                axis=paper_y_axis),
        color=alt.Color('Value', scale=alt.Scale(scheme='darkgreen')),
        tooltip=['Value'],
    )
    .properties(width=450, height=256)
    .interactive()
    .add_selection(selection)
    .transform_filter(selection)
)

# Displayed with a larger legend title; `light_embeddings` itself is left unconfigured
light_embeddings.configure_legend(
    titleFontSize=legend_size,
    orient='right'
)


In [None]:
#embeddings.save('embeddings.html')

# [BASICS] Tokens

In [None]:
dim = 150
english_carrot = "My m um e ats a car rot".split(' ')
x, y = np.meshgrid(range(dim), range(8))

# First `dim` values of the 8 tokens, one row per token
z = encoder_out_carrot[:8,0,:dim]

# Name every token row individually; later cells subtract them pairwise
(my_carrot,
 mum1_carrot, mum2_carrot,
 eats1_carrot, eats2_carrot,
 a_carrot,
 carrot1_carrot, carrot2_carrot) = (z[i] for i in range(8))

sentence_carrot = np.vstack([
    my_carrot,
    mum1_carrot, mum2_carrot,
    eats1_carrot, eats2_carrot,
    a_carrot,
    carrot1_carrot, carrot2_carrot,
])

source_carrot, plot_carrot = emb_plotter(x, y, sentence_carrot, english_carrot, 'carrot', dim)
plot_carrot

In [None]:
english_steak = "My m um e ats a ste ak".split(' ')
x, y = np.meshgrid(range(dim), range(8))

# First `dim` values of the 8 tokens, one row per token
z = encoder_out_steak[:8,0,:dim]

# Name every token row individually; later cells subtract them pairwise
(my_steak,
 mum1_steak, mum2_steak,
 eats1_steak, eats2_steak,
 a_steak,
 steak1_steak, steak2_steak) = (z[i] for i in range(8))

sentence_steak = np.vstack([
    my_steak,
    mum1_steak, mum2_steak,
    eats1_steak, eats2_steak,
    a_steak,
    steak1_steak, steak2_steak,
])

source_steak, plot_steak = emb_plotter(x, y, sentence_steak, english_steak, 'steak', dim)
plot_steak

In [None]:
english = "My m um e ats a (car-ste) (rot-ak)".split(' ')
x, y = np.meshgrid(range(dim), range(8))
z = encoder_out_carrot[:8,0,:dim]

# Token-by-token difference between the "carrot" and the "steak" sentence
sentence = np.vstack([
    carrot_tok - steak_tok
    for carrot_tok, steak_tok in (
        (my_carrot, my_steak),
        (mum1_carrot, mum1_steak),
        (mum2_carrot, mum2_steak),
        (eats1_carrot, eats1_steak),
        (eats2_carrot, eats2_steak),
        (a_carrot, a_steak),
        (carrot1_carrot, steak1_steak),
        (carrot2_carrot, steak2_steak),
    )
])

source_sub_steak, plot_sub_steak = emb_plotter(x, y, sentence, english, 'carrot-steak', dim)
plot_sub_steak

In [None]:
english_chops = "My m um ch ops a car rot".split(' ')

# Build the index grid in this cell, sized from the token list, instead of
# silently reusing whatever x, y an earlier cell left behind.
x, y = np.meshgrid(range(0, dim), range(0, len(english_chops)))

# First `dim` values of each of the 8 tokens of the "chops" sentence
my_chops = encoder_out_chops[0,0,0:dim]
mum1_chops = encoder_out_chops[1,0,0:dim]
mum2_chops = encoder_out_chops[2,0,0:dim]
chops1_chops = encoder_out_chops[3,0,0:dim] 
chops2_chops = encoder_out_chops[4,0,0:dim]
a_chops = encoder_out_chops[5,0,0:dim]
carrot1_chops = encoder_out_chops[6,0,0:dim] 
carrot2_chops = encoder_out_chops[7,0,0:dim]

sentence_chops = np.vstack((my_chops, mum1_chops, mum2_chops, chops1_chops, chops2_chops, a_chops, carrot1_chops, carrot2_chops))

source_chops, plot_chops = emb_plotter(x, y, sentence_chops, english_chops,'chops', dim)
plot_chops

In [None]:
english_sub_chops = "My m um (e-ch) (ats-ops) a car rot".split(' ')

# Build the index grid in this cell, sized from the token list, instead of
# silently reusing whatever x, y an earlier cell left behind.
x, y = np.meshgrid(range(0, dim), range(0, len(english_sub_chops)))

# Token-by-token difference between the "eats" and the "chops" sentence
sentence_sub_chops = np.vstack((my_carrot - my_chops, mum1_carrot - mum1_chops, mum2_carrot - mum2_chops, eats1_carrot - chops1_chops, eats2_carrot - chops2_chops, a_carrot - a_chops, carrot1_carrot - carrot1_chops, carrot2_carrot - carrot2_chops))

source_sub_chops, plot_sub_chops = emb_plotter(x, y, sentence_sub_chops, english_sub_chops, 'eats-chops', dim)
plot_sub_chops

In [None]:
# Drop-down that selects which sentence (tag) is displayed; starts on 'carrot'
input_dropdown = alt.binding_select(options=['carrot', 'steak', 'chops', 'carrot-steak', 'eats-chops'])  
selection = alt.selection_single(fields=['tag'], bind=input_dropdown, name='sentence', init={'tag':'carrot'})
# The former `color` condition was removed: it was never passed to the chart
# and referred to a field 'z' that `source` does not contain.


# One long table holding every sentence; the 'tag' column tells them apart
source = pd.concat([source_carrot, source_steak, source_chops, source_sub_steak, source_sub_chops])
# Running row number, used as the sort key of the y axis so the labels keep
# the order in which they appear in the table
source['idx'] = np.arange(len(source))

tokens = alt.Chart(source).mark_rect().encode(
    x=alt.X('x:O', title='First '+ str(dim) +' values of the embeddings',
            axis = alt.Axis(
                tickWidth=0,
                labelAngle=0, 
                titleColor="white",
                titleY=45,
            )),
    y=alt.Y('name_y:O', sort=alt.SortField('idx',order="ascending"), title = 'Word',
        axis=alt.Axis(
                tickBand= "extent", 
                title= "English token", 
                titleColor="white",
                tickWidth=0, 
                labelAlign="right", 
                offset=0,
                titleX=-60,
        )),
    color=alt.Color('Value', scale=alt.Scale(scheme='darkgreen')),
    tooltip=['Value']
).properties(
    width=450,
    height=256,
).interactive(
).add_selection(
    selection
).transform_filter(
    # only the rows of the selected sentence are drawn
    selection
).configure(
    # configure() must stay first: it replaces the whole chart config, the
    # configure_*() calls below then add to it
    background='#333'
).configure_legend(
    titleColor='white',
    labelColor='white'
).configure_axisLeft(
    labelColor='white'
).configure_axis(
    labelColor='white'
)

alt.renderers.set_embed_options(
    padding={"left": 10, "right": 0, "bottom": 2, "top": 10}
)

tokens

In [None]:
#tokens.save('tokens.html')

# [BASICS] Positional Encoding

In [None]:
def sinus_df(freq, length):
    '''
    Sample the curve sin(x / freq) at the integer positions 0 .. length - 1.

    Input:
        - freq - divisor applied to the position before taking the sine [number]
        - length - number of positions to sample [int]

    Output:
        - table with the columns 'pos' (position), 'value' (sine value) and
          'name' (the label 'sin(x/<freq>)', repeated on every row) [pd.DataFrame]
    '''
    positions = pd.Series(np.arange(length))
    label = 'sin(x/' + str(freq) + ')'
    columns = {
        'pos': positions,
        'value': pd.Series(np.sin(positions / freq)),
        'name': pd.Series([label] * length),
    }
    return pd.DataFrame(columns)

In [None]:
# Four sine curves sampled at 100 positions each; a larger divisor stretches the wave
sin5, sin10, sin15, sin20 = [sinus_df(freq, 100) for freq in (5, 10, 15, 20)]

# Stacked into one long table; the 'name' column identifies the curve
sinuses = pd.concat([sin5, sin10, sin15, sin20])

## Non-Interactive

In [None]:
# Clicking a legend entry highlights that curve
selection = alt.selection_multi(fields=['name'], bind='legend')

sinus = alt.Chart(sinuses).mark_line(
).encode(
    x=alt.X('pos:Q', axis=alt.Axis(title='x', grid=False, titleColor='white')),
    y=alt.Y('value:Q', axis=alt.Axis(title='f(x)', grid=False, titleColor='white')),
    color=alt.Color('name:N', scale=alt.Scale(scheme='reds'), legend=alt.Legend(
        labelColor='white',
        titleColor='white',
        title='function')),
    # curves that are not selected are faded out
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
).configure(
    # configure() replaces the whole chart config, so it has to come before the
    # configure_*() calls; placed after configure_legend() it discarded the
    # legend orientation.
    background='#333'
).configure_legend(
    orient='bottom',
).configure_axisLeft(
    labelColor='white'
).configure_axis(
    labelColor='white'
).configure_view(
    strokeOpacity=0
).add_selection(
    selection
)
sinus

In [None]:
#sinus.save('sinus.html')

## Interactive

In [None]:
# Interactive selectors
# nearest: the x position closest to the mouse pointer (nothing selected at start)
nearest = alt.selection(type='single', nearest=True, on='mouseover', fields=['pos'], empty='none')
# selection: curves picked by clicking on the legend
selection = alt.selection_multi(fields=['name'], bind='legend')


# Original sinuses plot
line = alt.Chart(sinuses).mark_line(
).encode(
    x=alt.X('pos:Q', axis=alt.Axis(title='x', grid=False, titleColor='white')),
    y=alt.Y('value:Q', axis=alt.Axis(title='f(x)', grid=False, titleColor='white')),
    color=alt.Color('name:N', scale=alt.Scale(scheme='reds'), legend=alt.Legend(
        labelColor='white',
        titleColor='white',
        title='function')),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
)

# Transparent selectors across the chart
selectors = alt.Chart(sinuses).mark_point().encode(
    x='pos:Q',
    opacity=alt.value(0),
).add_selection(
    nearest
).transform_filter(
    selection
)

# Draw points on the line
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
).transform_filter(
    selection
)

# Draw text labels near the points
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'value:Q', alt.value(' '))
).transform_filter(
    selection
)


# Join & setup the style
sinus_interactive = alt.layer(
    line, selectors, points, text
).configure(
    # configure() replaces the whole chart config, so it has to come before the
    # configure_*() calls; placed after configure_legend() it discarded the
    # legend orientation.
    background='#333'
).configure_legend(
    orient='top',
).configure_axisLeft(
    labelColor='white'
).configure_axis(
    labelColor='white'
).configure_view(
    strokeOpacity=0
).add_selection(
    selection
)

sinus_interactive

In [None]:
#sinus_interactive.save('sinus_interactive.html')

# [ARCHITECTURE] Positional Encoding

In [None]:
# Only one file is needed here, so load it directly: going through read_data()
# opened and parsed the same JSON twice just to fill its (encoder, decoder) pair.
with open('data/dog_embedding.json', encoding='utf-8') as f:
    embedding_dog = json.load(f)

In [None]:
# Transposed so that the cells below can pick whole rows with z[i]
# (presumably rows = embedding dimensions, columns = token positions -- TODO confirm).
z = np.array(embedding_dog['fourth']).T

In [None]:
# Alternation of sine and cosine values: row 2k of aux is row k of the first
# half of z, row 2k + 1 is row k of the second half.
# The previous loop (range(0, 512, 2), writing aux[i] and aux[i+1]) used only
# every other row of z and filled just the first 512 rows, leaving the rest 0.
# NOTE(review): which half of z holds the sines and which the cosines is not
# visible in this notebook -- confirm against the script that wrote the file.
half = z.shape[0] // 2
aux = np.zeros((2 * half, z.shape[1]))
aux[0::2] = z[:half]
aux[1::2] = z[half:2 * half]

# Data preparation
n_pos = 513  # number of token positions shown on the x axis
x, y = np.meshgrid(range(0, n_pos), range(0, aux.shape[0])) # 1024 embedding dim

# Keep the first n_pos columns of every row. Cutting the flattened array
# instead (aux.ravel()[:513*1024]) returns whole rows of aux, so each row was
# wrapped over two rows of the grid and no longer matched the axis labels.
source1= pd.DataFrame({'x': x.ravel(),
                       'y': y.ravel(),
                       'z': aux[:, :n_pos].ravel()})

alt.Chart(source1).mark_rect().encode(
    x=alt.X('x:O', axis=alt.Axis(
        title='token position (pos)',
    )),
    y=alt.Y('y:O', axis=alt.Axis(
        title='i-th position of the embedding',
    )),
    color=alt.Color('z', scale=alt.Scale(scheme='reds'), legend=alt.Legend(title="PE")),
).properties(
    width=500,
    height=500
).configure(
    # configure() replaces the whole chart config, so it has to come before
    # configure_legend(); placed after it, the legend orientation was discarded.
    background='transparent'
).configure_legend(
    orient='bottom',
)

# [DETAILS] Positional Encoding

In [None]:
z = np.array(embedding_dog['fourth']).T

# z is split into two equally sized blocks of rows, plotted side by side.
# The grid and the split point are taken from z.shape; the former slice bound
# 513*1024 only worked because it happens to equal 512*1026.
half = z.shape[0] // 2
x, y = np.meshgrid(range(0, z.shape[1]), range(0, half))

# NOTE(review): the first block is labelled 'cos' and the second 'sin', exactly
# as before; which block really holds which function is not visible in this
# notebook -- confirm against the script that wrote the file.
source1= pd.DataFrame({'x': x.ravel(),
                       'y': y.ravel(),
                       'cos': z[:half].ravel(),
                       'sin': z[half:2 * half].ravel()})


def _pe_heatmap(data, field, chart_title):
    '''Heatmap of column `field` of `data` over token position (x) and embedding index (y).'''
    white_axis = dict(titleColor="white", ticks=False)
    return alt.Chart(data, title=chart_title).mark_rect().encode(
        x=alt.X('x:O', axis=alt.Axis(
            title='token position (pos)',
            **white_axis)),
        y=alt.Y('y:O', axis=alt.Axis(
            title='i-th position of the embedding',
            **white_axis)),
        color=alt.Color(field, scale=alt.Scale(scheme='reds'),
                        legend=alt.Legend(
                            title="PE",
                            labelColor="white",
                            titleColor="white")),
    ).properties(
        width=500,
        height=350,
    )


sine = _pe_heatmap(source1, 'sin', 'Sine function')
cosine = _pe_heatmap(source1, 'cos', 'Cosine function')

# Side by side; configure() comes first because it replaces the whole chart config
(sine | cosine).configure(
    background='transparent',
    title={"color":"white", "font":"Raleway, sans-serif"}
).configure_axisLeft(
    labelColor='white'
).configure_axis(
    labelColor='white'
)

# [PAPER] Positional Encoding

In [None]:
# Data preparation
# The interleaved matrix is rebuilt here from the loaded file so this figure
# does not depend on the state an earlier cell left in `aux`.
# Row 2k is row k of the first half of the matrix, row 2k + 1 is row k of the
# second half.
# NOTE(review): which half holds the sines and which the cosines is not
# visible in this notebook -- confirm against the script that wrote the file.
pe = np.array(embedding_dog['fourth']).T
half = pe.shape[0] // 2
pe_interleaved = np.zeros((2 * half, pe.shape[1]))
pe_interleaved[0::2] = pe[:half]
pe_interleaved[1::2] = pe[half:2 * half]

n_pos = 513  # number of token positions shown on the x axis
x, y = np.meshgrid(range(0, n_pos), range(0, pe_interleaved.shape[0])) # 1024 embedding dimension

# Keep the first n_pos columns of every row. Cutting the flattened array
# instead (aux.ravel()[:513*1024]) returned whole rows, so each row was wrapped
# over two rows of the grid and no longer matched the axis labels.
source1= pd.DataFrame({'x': x.ravel(),
                       'y': y.ravel(),
                       'z': pe_interleaved[:, :n_pos].ravel()})

# Plot
alt.Chart(source1).mark_rect().encode(
    x=alt.X('x:O', axis=alt.Axis(
        title='token position (pos)',
        labelFontSize=font_size,
        titleFontSize=title_size)),
    y=alt.Y('y:O', axis=alt.Axis(
        title='i-th position of the embedding',
        labelFontSize=font_size,
        titleFontSize=title_size)),
    color=alt.Color('z', scale=alt.Scale(scheme='reds'), 
                    legend=alt.Legend(
                        title="PE")),
).properties(
    width=500,
    height=500
).configure(
    # configure() stays first: it replaces the whole chart config
    background='transparent'
).configure_legend(
    titleFontSize=legend_size,
    orient='right'
) 