In [None]:
# LLM libraries
from sentence_transformers import SentenceTransformer
import faiss

# Helper libraries
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Name of the pretrained sentence-embedding checkpoint to load.
model_name = "all-mpnet-base-v2"

# Build the SentenceTransformer for that checkpoint. Its encode() method is what
# the cells below call to turn texts into embedding vectors.
emb_model = SentenceTransformer(model_name_or_path=model_name)

## Lyrics

In [None]:
# Song-lyrics CSV hosted on GitHub (nuitrcs/AI_Week_RAG).
data_path = "https://raw.githubusercontent.com/nuitrcs/AI_Week_RAG/refs/heads/main/data/songs.csv"
# Read the remote CSV straight into a DataFrame.
lyrics = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Embedding the whole 'Lyrics' column is a single encode() call.
# The column is converted to a plain Python list first: encode() is documented to
# take a str, a list of str, or an ndarray, and integer indexing on a pandas Series
# goes by index *label* rather than position, so handing it a Series is fragile.
# This also matches how the Shakespeare subset is encoded further down.
embeddings = emb_model.encode(lyrics['Lyrics'].to_list())

In [None]:
# Persist the lyrics embeddings to a .npy file in the current working directory.
np.save(file='lyrics_embeddings.npy', arr=embeddings)

## Exercises

#### Shakespeare

In [None]:
# Path to the Shakespeare CSV, relative to the current working directory.
shake_file = Path('..') / 'data' / 'shakespeare_plays.csv'
# The first CSV column becomes the DataFrame index.
plays = pd.read_csv(filepath_or_buffer=shake_file, index_col=0)

**Full plays** — embed the `text` column for every row of the dataset.

In [None]:
# Embed the 'text' column of every row.
# `plays` was loaded with index_col=0, so its index is whatever the CSV's first
# column holds and is not guaranteed to be 0..n-1. Integer indexing on a Series is
# label-based, so passing the Series itself to encode() can raise KeyError or
# misalign rows; a plain list (as in the subset cell) avoids that.
shake_embs = emb_model.encode(plays['text'].to_list())

In [None]:
# Write the full-dataset Shakespeare embeddings to disk as a .npy file.
np.save(file='shake_plays_embeddings.npy', arr=shake_embs)

**Subset of plays** — embed only the rows that belong to a few selected plays.

In [None]:
# Plays to keep for the small subset. A larger alternative selection would be:
# 'King Lear', 'Macbeth', 'Othello', 'Romeo and Juliet', 'Hamlet'.
subset_titles_mini = ['King Lear', 'Othello']
# Select the 'text' of rows whose play_name is in the subset, as a plain list.
subset_plays = plays.loc[plays['play_name'].isin(subset_titles_mini), 'text'].to_list()
# Embed only those rows.
shake_subset_embs = emb_model.encode(subset_plays)

In [None]:
# Write the mini-subset Shakespeare embeddings to disk as a .npy file.
np.save(file='shake_subset_mini_plays_embeddings.npy', arr=shake_subset_embs)

#### Wikipedia

In [None]:
# Load the Wikipedia subsample; the first CSV column becomes the DataFrame index.
wiki = pd.read_csv(
    filepath_or_buffer='../data/wiki_subsample_k2500_n10k.csv',
    index_col=0,
)

In [None]:
# Embed the 'text' column of the Wikipedia subsample.
# `wiki` was loaded with index_col=0, so its index comes from the CSV and need not
# be 0..n-1. Integer indexing on a Series is label-based, so passing the Series
# itself to encode() can raise KeyError or misalign rows; pass a plain list instead.
wiki_embs = emb_model.encode(wiki['text'].to_list())

In [None]:
# Write the Wikipedia embeddings to disk as a .npy file.
np.save(file='wiki_2500_embeddings.npy', arr=wiki_embs)

#### arXiv

In [None]:
# Load the arXiv subsample; the first CSV column becomes the DataFrame index.
arxiv = pd.read_csv(
    filepath_or_buffer='../data/arxiv_subsample_500.csv',
    index_col=0,
)

In [None]:
# Preview the first three rows to check the available columns.
arxiv.head(n=3)

In [None]:
# Embed the 'abstract' column.
# `arxiv` was loaded with index_col=0, so its index need not be 0..n-1. Integer
# indexing on a Series is label-based, so passing the Series itself to encode()
# can raise KeyError or misalign rows; pass a plain list instead.
arxiv_embs_abstracts = emb_model.encode(arxiv['abstract'].to_list())

In [None]:
# Embed the 'markdown' column.
# `arxiv` was loaded with index_col=0, so its index need not be 0..n-1. Integer
# indexing on a Series is label-based, so passing the Series itself to encode()
# can raise KeyError or misalign rows; pass a plain list instead.
arxiv_embs_text = emb_model.encode(arxiv['markdown'].to_list())

In [None]:
# Write both sets of arXiv embeddings to disk, one .npy file per source column.
np.save(file='arxiv_abstracts_embeddings.npy', arr=arxiv_embs_abstracts)
np.save(file='arxiv_text_embeddings.npy', arr=arxiv_embs_text)