In [1]:
import os

# Run the rest of the notebook from the project root so the relative data path
# used later ('embeddings/data/...') resolves (presumably the `embeddings`
# package imports rely on this too). Set the NLP_PROJECT_ROOT environment
# variable to point at your own checkout; the original author's location is
# kept as the fallback so existing setups keep working.
PROJECT_ROOT = os.environ.get('NLP_PROJECT_ROOT') or '/Users/alexrotondo/nlp/'
os.chdir(PROJECT_ROOT)

In [2]:
from stopwords import get_stopwords
from embeddings.utils.preprocessing.string_preprocessing import lower_text, remove_punctuation, tokenize_document
from embeddings.utils.document_config import DocumentConfig
from embeddings.utils.preprocessing.token_preprocessing import remove_stopwords, remove_unknown_words
from embeddings.utils.vocab_config import CorpusVocabConfig
from embeddings.utils.vocab import CorpusVocab
from embeddings.glove.glove import GLoVE

import numpy as np
import pandas as pd

import seaborn as sb
import matplotlib.pyplot as plt
# Select seaborn's 'darkgrid' plot style for this notebook.
sb.set_style(style='darkgrid')

In [3]:
# Fixed seed for the shuffle below, so the row order (and therefore every
# preview of the frame) is identical after Restart Kernel -> Run All.
SHUFFLE_SEED = 42

million_songs = pd.read_csv('embeddings/data/spotify_millsongdata.csv')
# frac=1 without replacement returns every row exactly once, i.e. a shuffle.
million_songs = million_songs.sample(frac=1, replace=False, random_state=SHUFFLE_SEED)
million_songs.head()

Unnamed: 0,artist,song,link,text
2464,Cheap Trick,How Are You?,/c/cheap+trick/how+are+you_20029364.html,Hello \nHow are you? \nHow'd you sleep last ...
39547,Keith Green,The Prodigal Son Suite,/k/keith+green/the+prodigal+son+suite_20077391...,"I was done hoeing, out in the fields for the d..."
13498,Natalie Cole,Starting Over Again,/n/natalie+cole/starting+over+again_20328518.html,"And, when I hold you in my arms I promise you ..."
38049,John Legend,Burning Down The House,/j/john+legend/burning+down+the+house_20809147...,Watch out you might get what you're after \nC...
50134,Rascal Flatts,Better Now,/r/rascal+flatts/better+now_20450673.html,If I had one call to make \nI would dial yest...


In [4]:
def remove_newline(text):
    """Collapse a multi-line string into a single line.

    Every line break is replaced by exactly one space instead of being
    deleted outright, so the last word of one line and the first word of
    the next can never be fused into a single bogus token (``"a\\nb"``
    becomes ``"a b"``, not ``"ab"``).

    Args:
        text: The string to flatten. Line boundaries are the ones
            recognised by ``str.splitlines`` (so ``"\\r\\n"`` is handled too).

    Returns:
        The lines of ``text`` joined by single spaces. Whitespace at the
        start and end of each line is dropped and empty lines are skipped,
        so no double spaces are introduced at the joins.
    """
    # Strip each line first: the lyrics often carry a trailing space before
    # the line break, which would otherwise turn into a double space.
    stripped_lines = (line.strip() for line in text.splitlines())
    return " ".join(line for line in stripped_lines if line)

# Clean the lyrics column in three passes, applied in this order:
# strip line breaks, lowercase, then remove punctuation.
million_songs['text'] = (
    million_songs['text']
    .apply(remove_newline)
    .apply(lower_text)
    .apply(remove_punctuation)
)
million_songs.head()

Unnamed: 0,artist,song,link,text
2464,Cheap Trick,How Are You?,/c/cheap+trick/how+are+you_20029364.html,hello how are you howd you sleep last night ...
39547,Keith Green,The Prodigal Son Suite,/k/keith+green/the+prodigal+son+suite_20077391...,i was done hoeing out in the fields for the da...
13498,Natalie Cole,Starting Over Again,/n/natalie+cole/starting+over+again_20328518.html,and when i hold you in my arms i promise you ...
38049,John Legend,Burning Down The House,/j/john+legend/burning+down+the+house_20809147...,watch out you might get what youre after cool...
50134,Rascal Flatts,Better Now,/r/rascal+flatts/better+now_20450673.html,if i had one call to make i would dial yester...


In [5]:
# Tokenise each cleaned lyric, then store the length of each token sequence.
million_songs['tokens'] = million_songs['text'].apply(tokenize_document)
million_songs['n_tokens'] = million_songs['tokens'].apply(len)

In [6]:
# Size limits used when building the vocabulary / documents.
MAX_VOCAB_WORDS = 5000
MAX_NUMBER_OF_TOKENS_PER_DOCUMENT = 256

vocab_config = CorpusVocabConfig(randomize_token_index=False, max_tokens=MAX_VOCAB_WORDS)

# The lyrics have already been lowercased and stripped of punctuation by the
# time this cell runs, so all text-cleaning options are switched off here.
document_config = DocumentConfig.create(
    lower_text=False,
    exclude_punctuation=False,
    remove_stopwords=False,
    excluded_punctuation=None,
    stopwords=None,
)

# Plain list of cleaned lyric strings, one entry per song.
song_lyrics: list[str] = million_songs['text'].tolist()

vocab: CorpusVocab = CorpusVocab.create(
    documents=song_lyrics,
    corpus_vocab_config=vocab_config,
    document_config=document_config,
)

In [7]:
# The vocabulary is expected to hold one entry more than the configured cap
# (presumably the extra slot is the unknown-word placeholder -- TODO confirm).
assert MAX_VOCAB_WORDS + 1 == len(vocab.vocab)

In [8]:
def _remove_unknown_words(tokens: list[str], _vocab: CorpusVocab = vocab) -> list[str]:
    """Run ``remove_unknown_words`` on one document's tokens using ``_vocab``.

    Args:
        tokens: The token list of a single document.
        _vocab: Vocabulary supplying both the known-word collection and the
            unknown-word identifier. Defaults to the notebook-level ``vocab``
            as it exists when this cell is executed, which lets the function
            be handed directly to ``Series.apply``.

    Returns:
        Whatever ``remove_unknown_words`` returns for these tokens.
    """
    cleaned_tokens = remove_unknown_words(
        document_tokens=tokens,
        vocab=_vocab.vocab,
        unknown_word_identifier=_vocab.unknown_word_identifier,
    )
    return cleaned_tokens
# Pass every song's token list through the vocabulary-based cleaner.
million_songs['tokens_cleaned'] = million_songs['tokens'].apply(_remove_unknown_words)

In [9]:
# Fixed seed for the 10% subsample below, so the model is built from the same
# songs on every run instead of a different random subset each time.
GLOVE_SAMPLE_SEED = 42

_glove = GLoVE().create(
    vocab=vocab,
    tokenized_document_list=(
        million_songs
        .sample(frac=0.1, replace=False, random_state=GLOVE_SAMPLE_SEED)
        .tokens_cleaned
        .to_list()
    ),
    context_window_length=3
)

In [10]:
# Natural log of the smallest value stored in the co-occurrence dictionary.
_min_co_occurrence = min(_glove.co_occurrence_dictionary.values())
np.log(_min_co_occurrence)

-1.0986122886681098

In [11]:
# Scratch draw: five uniform samples in [0, 1) from NumPy's global generator.
np.random.random_sample(size=5)

array([0.8225439 , 0.87413876, 0.04033639, 0.47884658, 0.6870156 ])

In [15]:
# Hyper-parameters for the embedding fit, named so they are easy to find and tune.
EMBEDDING_DIMENSION = 20
LEARNING_RATE = 0.1
NUMBER_OF_EPOCHS = 10

embeddings = _glove.create_embeddings(
    number_of_epochs=NUMBER_OF_EPOCHS,
    learning_rate=LEARNING_RATE,
    embedding_dimension=EMBEDDING_DIMENSION,
)

mean loss for epoch #1 is 0.475
mean loss for epoch #2 is 0.38846
mean loss for epoch #3 is 0.37587
mean loss for epoch #4 is 0.37007
mean loss for epoch #5 is 0.36643
mean loss for epoch #6 is 0.36503
mean loss for epoch #7 is 0.36389
mean loss for epoch #8 is 0.36325
mean loss for epoch #9 is 0.36246
mean loss for epoch #10 is 0.36192
