# Doc2Vec Model Training

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random
import ast
from gensim.utils import tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Load the lyrics dataset from CSV into a DataFrame.
df = pd.read_csv('../data/lyrics.csv')

In [3]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,lyrics,genre,id
0,"['I feel so unsure', 'As I take your hand and ...",pop,0
1,"[""Don't let them fool, ya"", 'Or even try to sc...",pop,1
2,"[""Baby, let's cruise, away from here"", ""Don't ...",pop,2
3,"['Know it sounds funny', ""But, I just can't st...",pop,3
4,"[""You've got that look again"", 'The one I hope...",pop,4


In [4]:
# Summarize the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55218 entries, 0 to 55217
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lyrics  55218 non-null  object
 1   genre   55218 non-null  object
 2   id      55218 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [5]:
# The CSV stores each `lyrics` cell as the string repr of a Python list;
# parse every cell back into a real list with a safe literal evaluation.
df['lyrics'] = df['lyrics'].apply(ast.literal_eval)

In [6]:
# Load the evaluation subset of the lyrics data from its own CSV file.
df_eval = pd.read_csv('../data/evaluation/lyrics_eval.csv')

In [7]:
# `id` values of the evaluation rows, used to exclude those rows from `df`.
indexes2delete = df_eval['id']

In [8]:
# Hold out the evaluation rows so they are not used for training.
# Rows are matched on the `id` column instead of on index labels, so the
# result does not depend on the index happening to equal `id`, and the cell
# can be re-run safely (already-removed ids are simply not found again).
df = df[~df['id'].isin(indexes2delete)]

## Tokenization

In [9]:
# Flatten all songs into a single list of verses, each verse tokenized into
# a list of lowercased tokens. The progress bar advances once per song.
tokenized_verses = []
for song_verses in tqdm(df['lyrics']):
    tokenized_verses.extend(
        list(tokenize(verse, lowercase=True)) for verse in song_verses
    )

100%|██████████| 55068/55068 [00:29<00:00, 1855.69it/s]


In [11]:
# Spot-check the tokenizer output on one randomly drawn verse.
# No seed is set, so a different verse is shown on every run.
eg_verse = random.choice(tokenized_verses)
eg_verse

['who',
 'do',
 'you',
 'love',
 'is',
 'it',
 'me',
 'baby',
 'is',
 'it',
 'him',
 'now',
 'i',
 'don',
 't',
 'know']

## Doc2Vec

In [12]:
# Wrap every tokenized verse in a TaggedDocument whose single tag is the
# verse's position in `tokenized_verses`.
# tqdm wraps the list itself (not the enumerate object) so it can read the
# length and display a percentage / ETA instead of a bare iteration counter.
documents = [
    TaggedDocument(verse, [i])
    for i, verse in enumerate(tqdm(tokenized_verses))
]

2758742it [00:14, 193666.89it/s]


In [13]:
# Train a Doc2Vec model with the library's default vector size.
# Tokens seen fewer than 2 times are ignored; training runs for 20 epochs.
model = Doc2Vec(epochs=20, min_count=2)

# First pass over the corpus: build the vocabulary and count the documents.
model.build_vocab(documents)

# Training passes, reusing the document count and epoch setting stored on
# the model so they stay consistent with the configuration above.
model.train(
    documents,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

# Persist the trained model for later use.
model.save('../models/doc2vec.model')


In [14]:
# Train a second Doc2Vec model that embeds each document as a single number.
# NOTE(review): vector_size=1 gives one-dimensional document vectors --
# confirm this is intentional. This also rebinds `model`, so the previously
# trained model is only available from its saved file after this cell runs.
model = Doc2Vec(epochs=20, vector_size=1, min_count=2)

# First pass over the corpus: build the vocabulary and count the documents.
model.build_vocab(documents)

# Training passes, reusing the document count and epoch setting stored on
# the model so they stay consistent with the configuration above.
model.train(
    documents,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

# Persist the trained model under a separate file name.
model.save('../models/doc2vec1.model')
