# Supplementary Material for RAG Workshop

<span style="text-transform: uppercase;
        font-size: 14px;
        letter-spacing: 1px;
        font-family: 'Segoe UI', sans-serif;">
    Author
</span><br>
efrén cruz cortés
<hr style="border: none; height: 1px; background: linear-gradient(to right, transparent 0%, #ccc 10%, transparent 100%); margin-top: 10px;">

## Imports

In [1]:
# ML/AI libraries
from sentence_transformers import SentenceTransformer
from umap import UMAP

# 3rd party python libraries
import pandas as pd
import numpy as np

# python libraries
from pathlib import Path
import os

# visualization
import plotly.express as px
import plotly.graph_objects as go

## Visualize songs

In [2]:
# Get data
data_path = Path("data/songs.csv")
lyrics = pd.read_csv(data_path)

# Create embeddings
model_name = 'all-mpnet-base-v2'
emb_model = SentenceTransformer(model_name)
embeddings = emb_model.encode(lyrics['Lyrics'])

In [3]:
# Pick artists to compare
n_heatmap = 5
a_artist = 'Taylor Swift'
b_artist = 'Bob Dylan'

In [4]:
# Taylor's lyrics:
lyrics[lyrics['Artist']==a_artist].head(n_heatmap)

Unnamed: 0,Artist,Title,Lyrics
0,Taylor Swift,cardigan,"Vintage tee, brand new phone\nHigh heels on co..."
1,Taylor Swift,exile,"I can see you standing, honey\nWith his arms a..."
2,Taylor Swift,Lover,We could leave the Christmas lights up 'til Ja...
3,Taylor Swift,the 1,"I'm doing good, I'm on some new shit\nBeen say..."
4,Taylor Swift,Look What You Made Me Do,I don't like your little games\nDon't like you...


In [5]:
# Dylan's lyrics:
lyrics[lyrics['Artist']==b_artist].head(n_heatmap)

Unnamed: 0,Artist,Title,Lyrics
575,Bob Dylan,Murder Most Foul,"'Twas a dark day in Dallas, November '63\nA da..."
576,Bob Dylan,Blowin’ in the Wind,How many roads must a man walk down\nBefore yo...
577,Bob Dylan,The Times They Are A-Changin’,"Come gather 'round people, wherever you roam\n..."
578,Bob Dylan,All Along the Watchtower,"""There must be some way out of here""\nSaid the..."
579,Bob Dylan,Like a Rolling Stone,Once upon a time you dressed so fine\nThrew th...


In [None]:
# Let's save the indices for easy access
a_idxs = lyrics[lyrics['Artist']==a_artist].index.to_list()[:n_heatmap]
b_idxs = lyrics[lyrics['Artist']==b_artist].index.to_list()[:n_heatmap]

# subset embeddings
a_embs = embeddings[a_idxs]
b_embs = embeddings[b_idxs]
both_embs = np.concatenate((a_embs, b_embs), axis=0)

# extract titles for our visualization
a_titles = lyrics['Title'].iloc[a_idxs].to_list()
b_titles = lyrics['Title'].iloc[b_idxs].to_list()
# truncating text
a_titles = [title[:20] for title in a_titles]
b_titles = [title[:20] for title in b_titles]
both_titles = a_titles + b_titles

### Similarity Heatmap

In [None]:
# compute their similarity, we want to visualize this with a heatmap
fl_sim_matrix = emb_model.similarity(both_embs, both_embs)

In [25]:
fig = px.imshow(
    fl_sim_matrix,
    x=both_titles,
    y=both_titles,
    color_continuous_scale="Viridis",
    text_auto=".2f"
)

img_w = 950
img_h = 675
fig.update_layout(
    title=f"Cosine Similarity Among {a_artist} and {b_artist} Lyrics",
    width=img_w,
    height=img_h,
    xaxis=dict(tickangle=45),
    margin=dict(l=50, r=50, t=100, b=50)
)

names_ycols = 1.07
names_xrows = -.015
font_params = dict(size=16, color="black")
fig.add_annotation(
    x=2,
    y=names_ycols,  # above the top ticks
    text=a_artist,
    showarrow=False,
    font=font_params,
    xref="x",
    yref="paper"
)
fig.add_annotation(
    x=7,
    y=names_ycols,
    text=b_artist,
    showarrow=False,
    font=font_params,
    xref="x",
    yref="paper"
)

fig.add_annotation(
    x=names_xrows,
    y=2,
    text=a_artist,
    showarrow=False,
    font=font_params,
    textangle=-90,
    xref="paper",
    yref="y"
)
fig.add_annotation(
    x=names_xrows,
    y=7,
    text=b_artist,
    showarrow=False,
    font=font_params,
    textangle=-90,
    xref="paper",
    yref="y"
)

fig.show()

In [27]:
# save figure
fig_params = {"width":img_w, "height":img_h, "scale":2}
fig.write_image('images/taylor_dylan_heatmap.png', **fig_params)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OK, the warning above is fun. The code below addresses it but is there just for fun.

In [28]:
# save current tokenizers setting
prev_setting = os.environ.get("TOKENIZERS_PARALLELISM", None)

# disable parallelism before the fork step (in this case, saving the figure)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# save figure, this step does the forking
fig.write_image('images/taylor_dylan_heatmap.png', **fig_params)

# restore the previous setting
if prev_setting is None:
    del os.environ["TOKENIZERS_PARALLELISM"]
else:
    os.environ["TOKENIZERS_PARALLELISM"] = prev_setting

### Dimensionality reduction

In [30]:
dimred_model = UMAP(
    n_neighbors=3,  # umap hyper-parameter
    n_components=2, # dimension we are reducing to
    metric='cosine'
)

two_d_rep = dimred_model.fit_transform(embeddings)

In [37]:
fig_clustering = go.Figure()

fig_clustering.add_trace(go.Scatter(
    x=two_d_rep[:, 0],
    y=two_d_rep[:,1],
    mode='markers',
    marker=dict(size=6),
    text=lyrics['Title'],
    hoverinfo='text'
))

fc_w = 750
fc_h = 750
fig_clustering.update_layout(
    height=fc_h, width=fc_w,
    title='Low dimensional view of embedded lyrics',
)

fig_clustering.show()

In [33]:
# A reminder of the artists, which one would you like to see?
lyrics['Artist'].unique()

array(['Taylor Swift', 'Billie Eilish', 'The Beatles', 'David Bowie',
       'Billy Joel', 'Ed Sheeran', 'Eric Clapton', 'Bruce Springsteen',
       'Vance Joy', 'Lana Del Rey', 'Bryan Adams', 'Leonard Cohen',
       'Nat King Cole', 'twenty one pilots', 'Ray LaMontagne',
       'Bob Dylan', 'John Denver', 'Frank Sinatra', 'Queen', 'Elton John',
       'George Michael'], dtype=object)

In [38]:
# Let's highlight an artist's songs just for fun:
artist_highlight = 'John Denver'
artist_idxs = lyrics[lyrics['Artist']==artist_highlight].index.to_list()

fig_clustering = go.Figure()

fig_clustering.add_trace(go.Scatter(
    x=two_d_rep[:, 0],
    y=two_d_rep[:,1],
    mode='markers',
    marker=dict(size=6),
    text=lyrics['Title'],
    hoverinfo='text',
    name = 'All artists'
))

fig_clustering.add_trace(go.Scatter(
    x=two_d_rep[artist_idxs, 0],
    y=two_d_rep[artist_idxs,1],
    mode='markers',
    marker=dict(size=6, color='crimson'),
    text = lyrics['Title'].iloc[artist_idxs],
    hoverinfo = 'text',
    name = artist_highlight
))


fig_clustering.update_layout(
    height=fc_h, width=fc_w,
    title='Low dimensional view of embedded lyrics',
)

fig_clustering.show()

In [None]:
# save interactive html (hover)
fig_clustering.write_html('images/song_embeddings.html')

In [40]:
# save png
fig_clustering.write_image('images/song_embeddings.png', height=fc_h, width=fc_w, scale=3)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Shakespeare Data

Here I will create aggregate versions of the plays. One is by speech, and the other is by context.

In [69]:
# Load data
shakes_file = Path('data/shakespeare_plays.csv')
shakes_plays = pd.read_csv(shakes_file)

### Plays by speech

In [70]:
shakes_plays['speaker_change'] = (shakes_plays['character'] != shakes_plays['character'].shift()).cumsum()
shakes_plays.head()

Unnamed: 0,index,play_name,genre,character,act,scene,sentence,text,sex,speaker_change
0,0,All's Well That Ends Well,Comedy,Countess,1,1,1,"In delivering my son from me, I bury a second ...",female,1
1,1,All's Well That Ends Well,Comedy,Bertram,1,1,2,"And I in going, madam, weep o'er my father's d...",male,2
2,2,All's Well That Ends Well,Comedy,Bertram,1,1,3,"anew: but I must attend his majesty's command, to",male,2
3,3,All's Well That Ends Well,Comedy,Bertram,1,1,4,"whom I am now in ward, evermore in subjection.",male,2
4,4,All's Well That Ends Well,Comedy,Lafeu,1,1,5,"You shall find of the king a husband, madam; you,",male,3


In [71]:
cols_to_keep = ['play_name', 'genre', 'character', 'act', 'scene']
shakes_speeches = shakes_plays.groupby(['speaker_change'] + cols_to_keep, as_index=False).agg({'text': ' '.join})
shakes_speeches.head()

Unnamed: 0,speaker_change,play_name,genre,character,act,scene,text
0,1,All's Well That Ends Well,Comedy,Countess,1,1,"In delivering my son from me, I bury a second ..."
1,2,All's Well That Ends Well,Comedy,Bertram,1,1,"And I in going, madam, weep o'er my father's d..."
2,3,All's Well That Ends Well,Comedy,Lafeu,1,1,"You shall find of the king a husband, madam; y..."
3,4,All's Well That Ends Well,Comedy,Countess,1,1,What hope is there of his majesty's amendment?
4,5,All's Well That Ends Well,Comedy,Lafeu,1,1,"He hath abandoned his physicians, madam; under..."


In [128]:
save_speeches = input('save speeches df? [y/n]')
if save_speeches.lower()=='y' or save_speeches.lower()=='yes':
    shakes_speeches.to_csv('data/shakespeare_plays_by_speech.csv')

### Plays by context

To each line, we'll add a context of the previous and following $5$ lines.

In [None]:
# this chunk of code looks slightly complex bc I need to iterate over groupings to respect ...
# the limits of plays, acts and scenes in the context creation
context_window = 5
contexts_list = []
group_cols = ['play_name', 'act', 'scene']
for _, group in shakes_plays.groupby(group_cols, sort=False):
    group_lines = group['text'].tolist()  # convert to list for speed (so we don't slice dfs repeatedly)
    for local_idx, row in enumerate(group.itertuples()):
        low_c = max(0, local_idx - context_window)
        high_c = min(len(group_lines), local_idx + context_window)
        context = "\n".join(group_lines[low_c:high_c])

        row_dict = row._asdict()
        row_dict.pop('text', None)
        row_dict.pop('speaker_change', None)
        row_dict['line_context'] = context

        contexts_list.append(row_dict)

shakes_context = pd.DataFrame(contexts_list)
shakes_context = shakes_context.drop(columns=['Index'])
shakes_context = shakes_context.rename(columns={'sentence': 'focus_sentence'})


In [124]:
shakes_context.head()

Unnamed: 0,index,play_name,genre,character,act,scene,focus_sentence,sex,line_context
0,0,All's Well That Ends Well,Comedy,Countess,1,1,1,female,"In delivering my son from me, I bury a second ..."
1,1,All's Well That Ends Well,Comedy,Bertram,1,1,2,male,"In delivering my son from me, I bury a second ..."
2,2,All's Well That Ends Well,Comedy,Bertram,1,1,3,male,"In delivering my son from me, I bury a second ..."
3,3,All's Well That Ends Well,Comedy,Bertram,1,1,4,male,"In delivering my son from me, I bury a second ..."
4,4,All's Well That Ends Well,Comedy,Lafeu,1,1,5,male,"In delivering my son from me, I bury a second ..."


In [125]:
print(shakes_context['line_context'].iloc[10])

sir, a father: he that so generally is at all times
good must of necessity hold his virtue to you; whose
worthiness would stir it up where it wanted rather
than lack it where there is such abundance.
What hope is there of his majesty's amendment?
He hath abandoned his physicians, madam; under whose
practises he hath persecuted time with hope, and
finds no other advantage in the process but only the
losing of hope by time.
This young gentlewoman had a father,--O, that


In [127]:
save_context = input('save context df? [y/n]')
if save_context.lower()=='y' or save_context.lower()=='yes':
    shakes_context.to_csv('data/shakespeare_plays_by_context.csv')