# Doc2Vec Model Training

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import ast
from nltk.corpus import stopwords
from gensim.utils import tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Load the lyrics table (columns shown below: lyrics, genre, id).
lyrics_csv = '../data/lyrics.csv'
df = pd.read_csv(lyrics_csv)

In [3]:
# Preview the first rows to check which columns were loaded and what they hold.
df.head()

Unnamed: 0,lyrics,genre,id
0,"['I feel so unsure', 'As I take your hand and ...",pop,0
1,"[""Don't let them fool, ya"", 'Or even try to sc...",pop,1
2,"[""Baby, let's cruise, away from here"", ""Don't ...",pop,2
3,"['Know it sounds funny', ""But, I just can't st...",pop,3
4,"[""You've got that look again"", 'The one I hope...",pop,4


In [4]:
# Print the per-column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54350 entries, 0 to 54349
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lyrics  54350 non-null  object
 1   genre   54350 non-null  object
 2   id      54350 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [5]:
# Each `lyrics` cell is stored in the CSV as the string form of a Python list;
# parse it back into a real list with the safe literal evaluator.
df['lyrics'] = [ast.literal_eval(raw_lyrics) for raw_lyrics in df['lyrics']]

In [6]:
# Read the `id` column of the evaluation split and of the classification test split.
eval_csv = '../data/evaluation/lyrics_eval.csv'
test_csv = '../data/classification/lyrics_test.csv'
idxs_eval = pd.read_csv(eval_csv).loc[:, 'id']
idxs_test = pd.read_csv(test_csv).loc[:, 'id']

In [7]:
# Concatenate both id collections into one flat list (evaluation ids first, then test ids).
indexes2delete = [*idxs_eval, *idxs_test]

In [8]:
# Remove the songs listed in the evaluation and test id lists.
# Those values were read from the `id` column of their CSVs, so rows are matched
# on df['id'] rather than on index labels: this stays correct even if the index
# and `id` ever diverge, tolerates ids that are absent from df, and makes
# re-running the cell a no-op instead of dropping a different set of rows from
# the renumbered index.
held_out = df['id'].isin(indexes2delete)
df = df[~held_out].reset_index(drop=True)  # drop=True: do not keep the old index as an extra column

In [9]:
# Flatten the per-song lists into one list containing every verse, in row order.
verses = []
for song_verses in df['lyrics']:
    verses.extend(song_verses)

## Tokenization and stop-word removal

In [10]:
# Turn every verse into a list of lowercased tokens using gensim's tokenizer.
tokenized_verses = []
for verse_text in verses:
    verse_tokens = list(tokenize(verse_text, lowercase=True))
    tokenized_verses.append(verse_tokens)

In [11]:
# Remove English stop words from every tokenized verse.
# The stop list is held in a set so each membership test is a hash lookup
# rather than a linear scan of the whole list for every token.
stop_words = set(stopwords.words('english'))
tokenized_verses = [
    [token for token in verse if token not in stop_words]
    for verse in tokenized_verses
]
# Verses made up entirely of stop words are now empty; discard them.
tokenized_verses = [verse for verse in tokenized_verses if verse]


## Model Training

In [12]:
# Wrap each tokenized verse in a TaggedDocument tagged with its position in the list.
documents = []
for tag, words in enumerate(tokenized_verses):
    documents.append(TaggedDocument(words, [tag]))

In [13]:
# Hyperparameters gathered in one place so they are easy to spot and tune.
doc2vec_params = {
    'vector_size': 50,
    'epochs': 20,
    'min_count': 2,
}
model = Doc2Vec(**doc2vec_params)

# Build the vocabulary from the tagged verses, then train for the configured
# number of epochs over the corpus size recorded by build_vocab.
model.build_vocab(documents)
model.train(documents, epochs=model.epochs, total_examples=model.corpus_count)

# Persist the trained model to disk.
model.save('../models/doc2vec.model')
