<a href="https://colab.research.google.com/github/mandarbagwe7/Hands-On-Large-Language-Models-Practice/blob/main/chapter_02/Token_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embeddings

In [2]:
from transformers import AutoModel, AutoTokenizer

In [None]:
# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

In [None]:
# Load a language model
model = AutoModel.from_pretrained("microsoft/deberta-v3-xsmall")

In [5]:
# Tokenize the sentence
tokens = tokenizer('Hello world', return_tensors='pt')

In [6]:
# Process the tokens
output = model(**tokens)[0]
output

tensor([[[-3.4816,  0.0861, -0.1819,  ..., -0.0612, -0.3911,  0.3017],
         [ 0.1898,  0.3208, -0.2315,  ...,  0.3714,  0.2478,  0.8048],
         [ 0.2071,  0.5036, -0.0485,  ...,  1.2175, -0.2292,  0.8582],
         [-3.4278,  0.0645, -0.1427,  ...,  0.0658, -0.4367,  0.3834]]],
       grad_fn=<NativeLayerNormBackward0>)

In [7]:
output.shape

torch.Size([1, 4, 384])

In [9]:
for token in tokens['input_ids'][0]:
    print(tokenizer.decode(token))

[CLS]
Hello
 world
[SEP]


# Sentence Embeddings

In [67]:
from sentence_transformers import SentenceTransformer

In [68]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [69]:
tokens = model.encode('Best Movie Ever!!')

In [70]:
tokens.shape

(768,)

# Gensim and Nearest Neighbor Similarity

In [19]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [20]:
import gensim.downloader as api

In [22]:
# Download embeddings (66MB, glove, trained on wikipedia, vector size: 50)
# Other options include "word2vec-google-news-300"
# More options at https://github.com/RaRe-Technologies/gensim-data
model = api.load("glove-wiki-gigaword-50")



In [23]:
model.most_similar([model['king']], topn=11)

[('king', 1.0000001192092896),
 ('prince', 0.8236179351806641),
 ('queen', 0.7839043140411377),
 ('ii', 0.7746230363845825),
 ('emperor', 0.7736247777938843),
 ('son', 0.766719400882721),
 ('uncle', 0.7627150416374207),
 ('kingdom', 0.7542161345481873),
 ('throne', 0.7539914846420288),
 ('brother', 0.7492411136627197),
 ('ruler', 0.7434253692626953)]

# Song Embedding Model

In [24]:
import pandas as pd
from urllib import request

In [40]:
# Get the playlist dataset file
data = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt')

In [41]:
# Parse the playlist dataset file. Skip the first two lines as
# they only contain metadata
lines = data.read().decode("utf-8").split('\n')[2:]

In [43]:
# Remove playlists with only one song
playlists = [s.rstrip().split() for s in lines if len(s.split()) > 1]

In [50]:
# Load song metadata
songs_file = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt')
songs_file = songs_file.read().decode("utf-8").split('\n')

In [52]:
songs = [s.rstrip().split('\t') for s in songs_file]

In [53]:
songs_df = pd.DataFrame(data=songs, columns = ['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')

In [54]:
print( 'Playlist #1:\n ', playlists[0], '\n')
print( 'Playlist #2:\n ', playlists[1])

Playlist #1:
  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43'] 

Playlist #2:
  ['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117',

In [56]:
!pip install gensim



In [57]:
from gensim.models import Word2Vec

In [58]:
# Train our Word2Vec model
model = Word2Vec(
    playlists,
    vector_size=32,
    window=20,
    negative=50,
    min_count=1,
    workers=4
)

In [60]:
song_id = 2172
print(songs_df.iloc[2172])

title     Fade To Black
artist        Metallica
Name: 2172 , dtype: object


In [61]:
# Ask the model for songs similar to song #2172
model.wv.most_similar(positive=str(song_id))

[('2849', 0.9971805810928345),
 ('5586', 0.997159481048584),
 ('1954', 0.9969482421875),
 ('11477', 0.996557891368866),
 ('6624', 0.9962870478630066),
 ('9995', 0.9962759613990784),
 ('1922', 0.9960748553276062),
 ('6660', 0.995845377445221),
 ('2116', 0.9954909086227417),
 ('3079', 0.995434582233429)]

In [62]:
import numpy as np

In [63]:
def print_recommendations(song_id):
    similar_songs = np.array(
                        model.wv.most_similar(
                                positive=str(song_id),
                                topn=5
                            )
                        )[:,0]

    return  songs_df.iloc[similar_songs]

In [65]:
# Extract recommendations
print_recommendations(2172)

Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2849,Run To The Hills,Iron Maiden
5586,The Last In Line,Dio
1954,The Number Of The Beast,Iron Maiden
11477,Whole Lotta Love (w\/ Chris Cornell),Santana
6624,Everybody Wants Some!!!,Van Halen
