# In this file I train the word embeddings (Word2Vec CBOW, Word2Vec Skip-gram and FastText) on the song lyrics

In [None]:
from gensim.models import Word2Vec, FastText
import pandas as pd
from gensim.utils import simple_preprocess

## Get lyrics for all Songs and for each Genre, initialize preprocessing function

In [None]:
# Lyrics dataset enriched with metadata; created in 'enriched_metadata.ipynb'.
songs = pd.read_csv("songs_enriched.csv", sep=",", engine="python", encoding='utf-8')

# Use the nullable string dtype so the .str accessor can be applied to the
# whole column, including rows where the genre is missing.
songs['genre'] = songs['genre'].astype('string')

# Genre-specific subsets.
# The match is a plain substring test (regex=False), so a song whose genre
# text contains several of these labels ends up in several subsets.
# na=False makes rows with a missing genre count explicitly as "no match":
# without it the mask contains <NA>, which some pandas versions refuse to
# index with and others silently treat as False.
hiphop = songs.loc[songs['genre'].str.contains('Hip Hop', regex=False, na=False)]
pop = songs.loc[songs['genre'].str.contains('Pop', regex=False, na=False)]
rock = songs.loc[songs['genre'].str.contains('Rock', regex=False, na=False)]
country = songs.loc[songs['genre'].str.contains('Country', regex=False, na=False)]

# Raw lyrics of each genre as plain Python lists (one entry per song).
hiphop_lyrics = hiphop['a_lyrics'].tolist()
rock_lyrics = rock['a_lyrics'].tolist()
pop_lyrics = pop['a_lyrics'].tolist()
country_lyrics = country['a_lyrics'].tolist()

# Raw lyrics of every song, regardless of genre.
all_lyrics = songs['a_lyrics'].tolist()

# Helper: detect lyric entries that carry no text at all.
def _is_missing(value):
    """Return True when *value* is None, pandas' NA marker or a float NaN."""
    if value is None or value is pd.NA:
        return True
    # NaN is the only float that is not equal to itself.
    return isinstance(value, float) and value != value


# Function to tokenize, lowercase and strip punctuation/accents from lyrics.
def sent_to_words(sentences):
    """Lazily turn each lyric in *sentences* into a list of word tokens.

    Tokenization is delegated to gensim's ``simple_preprocess``, which (per
    the gensim documentation) lowercases the text, keeps only alphabetic
    tokens and, with its default settings, drops tokens shorter than 2 or
    longer than 15 characters.

    Args:
        sentences: Iterable of lyrics, one entry per song. Entries that are
            not strings are converted with ``str()`` before tokenizing.

    Yields:
        One list of tokens per input entry, in input order. A missing entry
        (None / NaN / pd.NA) yields an empty list, so the output stays
        aligned with the input and no artificial "nan" / "none" token
        leaks into the training vocabulary.
    """
    for sentence in sentences:
        if _is_missing(sentence):
            # str(nan) would otherwise be tokenized as the word "nan".
            yield []
            continue
        # deacc=True strips accent marks (e.g. "é" -> "e"); punctuation is
        # already discarded by the tokenizer itself.
        yield simple_preprocess(str(sentence), deacc=True)

## Execute preprocessing on the song lyrics

In [None]:
# Replace every raw-lyrics list by its tokenized counterpart (one token list
# per song). The corpora are processed in the order listed and each result is
# materialized as a list so it can be iterated several times during training.
all_lyrics, rock_lyrics, pop_lyrics, hiphop_lyrics, country_lyrics = (
    list(sent_to_words(raw_corpus))
    for raw_corpus in (
        all_lyrics,
        rock_lyrics,
        pop_lyrics,
        hiphop_lyrics,
        country_lyrics,
    )
)

## Word2Vec CBOW

In [None]:
# Hyper-parameters shared by every CBOW model: 300-dimensional vectors and a
# context window of 5. No `sg` argument is passed, so gensim's default
# training algorithm is used (CBOW, as the section heading states).
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 expects
# `vector_size` instead -- confirm against the installed version.
_cbow_params = {"size": 300, "window": 5}

# Training: the all-songs model requires a word to occur at least 5 times,
# the genre models at least 2 times.
w2v_cbow_all = Word2Vec(all_lyrics, min_count=5, **_cbow_params)
w2v_cbow_rock = Word2Vec(rock_lyrics, min_count=2, **_cbow_params)
w2v_cbow_pop = Word2Vec(pop_lyrics, min_count=2, **_cbow_params)
w2v_cbow_hiphop = Word2Vec(hiphop_lyrics, min_count=2, **_cbow_params)
w2v_cbow_country = Word2Vec(country_lyrics, min_count=2, **_cbow_params)

# Save only the word vectors, in the plain-text word2vec format.
_cbow_outputs = (
    ('allsongs_w2v_cbow_mincount5_vlen300.txt', w2v_cbow_all),
    ('rock_w2v_cbow_mincount2_vlen300.txt', w2v_cbow_rock),
    ('pop_w2v_cbow_mincount2_vlen300.txt', w2v_cbow_pop),
    ('hiphop_w2v_cbow_mincount2_vlen300.txt', w2v_cbow_hiphop),
    ('country_w2v_cbow_mincount2_vlen300.txt', w2v_cbow_country),
)
for _cbow_path, _cbow_model in _cbow_outputs:
    _cbow_model.wv.save_word2vec_format(_cbow_path, binary=False)

## Word2Vec Skip-gram

In [None]:
# Hyper-parameters shared by every skip-gram model: 300-dimensional vectors,
# a context window of 5, and sg=1 to select skip-gram instead of CBOW.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 expects
# `vector_size` instead -- confirm against the installed version.
_skipg_params = {"size": 300, "window": 5, "sg": 1}

# Training: the all-songs model requires a word to occur at least 5 times,
# the genre models at least 2 times.
w2v_skipg_all = Word2Vec(all_lyrics, min_count=5, **_skipg_params)
w2v_skipg_rock = Word2Vec(rock_lyrics, min_count=2, **_skipg_params)
w2v_skipg_pop = Word2Vec(pop_lyrics, min_count=2, **_skipg_params)
w2v_skipg_hiphop = Word2Vec(hiphop_lyrics, min_count=2, **_skipg_params)
w2v_skipg_country = Word2Vec(country_lyrics, min_count=2, **_skipg_params)

# Save only the word vectors, in the plain-text word2vec format.
_skipg_outputs = (
    ('allsongs_w2v_skipg_mincount5_vlen300.txt', w2v_skipg_all),
    ('rock_w2v_skipg_mincount2_vlen300.txt', w2v_skipg_rock),
    ('pop_w2v_skipg_mincount2_vlen300.txt', w2v_skipg_pop),
    ('hiphop_w2v_skipg_mincount2_vlen300.txt', w2v_skipg_hiphop),
    ('country_w2v_skipg_mincount2_vlen300.txt', w2v_skipg_country),
)
for _skipg_path, _skipg_model in _skipg_outputs:
    _skipg_model.wv.save_word2vec_format(_skipg_path, binary=False)

## FastText

In [None]:
# Hyper-parameters shared by every FastText model: 300-dimensional vectors
# and a context window of 5.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 expects
# `vector_size` instead -- confirm against the installed version.
_ft_params = {"size": 300, "window": 5}

# Initialize empty models: the all-songs model requires a word to occur at
# least 5 times, the genre models at least 2 times.
fasttext_all = FastText(min_count=5, **_ft_params)
fasttext_rock = FastText(min_count=2, **_ft_params)
fasttext_pop = FastText(min_count=2, **_ft_params)
fasttext_hiphop = FastText(min_count=2, **_ft_params)
fasttext_country = FastText(min_count=2, **_ft_params)

# One row per model: (model, tokenized corpus, output file for the vectors).
_ft_jobs = (
    (fasttext_all, all_lyrics, 'allsongs_fasttext_cbow_mincount5_vlen300.txt'),
    (fasttext_rock, rock_lyrics, 'rock_fasttext_cbow_mincount2_vlen300.txt'),
    (fasttext_pop, pop_lyrics, 'pop_fasttext_cbow_mincount2_vlen300.txt'),
    (fasttext_hiphop, hiphop_lyrics, 'hiphop_fasttext_cbow_mincount2_vlen300.txt'),
    (fasttext_country, country_lyrics, 'country_fasttext_cbow_mincount2_vlen300.txt'),
)

# Build the vocabulary of every model from its own corpus.
for _ft_model, _ft_corpus, _ft_path in _ft_jobs:
    _ft_model.build_vocab(sentences=_ft_corpus)

# Train every model for 10 epochs; total_examples is the number of songs
# (token lists) in the corresponding corpus.
for _ft_model, _ft_corpus, _ft_path in _ft_jobs:
    _ft_model.train(sentences=_ft_corpus, total_examples=len(_ft_corpus), epochs=10)

# Save only the word vectors, in the plain-text word2vec format.
for _ft_model, _ft_corpus, _ft_path in _ft_jobs:
    _ft_model.wv.save_word2vec_format(_ft_path, binary=False)