# Lyrics based Word Embeddings using Doc2vec & Word2Vec on MusicXmatch Dataset

### Importing all the Data Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
%matplotlib inline

In [3]:
from collections import Counter
import os    
import re

### Importing NLP Libraries

In [4]:
import nltk

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mehulmadaan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [38]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mehulmadaan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
from nltk.corpus import stopwords
print(set(stopwords.words('english')))

{'very', 'most', 'o', "you've", 'them', 'itself', 'mustn', 'd', 'such', 're', 'ain', 'our', 'before', 'yourselves', 't', 'to', 'we', 'should', "it's", 'whom', 'i', "wouldn't", 'mightn', 'aren', "haven't", 'doesn', 'again', 'about', 'can', 'll', 'all', 'she', 'but', 'they', 'weren', 'he', 'wasn', "needn't", 'those', 'against', 'out', 'shouldn', "weren't", 'myself', 'too', 'wouldn', "she's", 'him', 'you', 'hadn', 'few', 'hers', 'yours', 'a', 'above', 'y', 'ours', 'from', 'himself', 'further', 'until', 'just', 'being', 'of', "won't", 'have', "you're", 'down', 'between', 'only', 'this', 'couldn', 'both', 've', 'their', 'after', 'not', 'didn', 'isn', 'below', "didn't", 'these', 'has', 'his', 'up', 'her', 'it', 'what', 'was', 'under', 'haven', 'me', "shouldn't", 'been', 'herself', 'will', 'more', 'there', "you'll", 'in', 'hasn', 'some', 'does', 'nor', "couldn't", 'if', 'who', 'having', 'your', 'here', "doesn't", 'that', "mightn't", 'needn', 'other', 'on', 'off', 'now', "aren't", 'into', "sho

In [8]:
remove=set(stopwords.words('english'))
REMOVE_STOPWORDS = True

In [24]:
from nltk.tokenize import word_tokenize

### Reading Data from file

In [10]:
#Opening file in Reading mode
with open('/Users/mehulmadaan/Downloads/ProjectDoc2vec/mxm_dataset_train.txt','r') as data:
    lines=data.readlines()  #Reading data line by line
    words = lines[17].replace('%','').split(',')   #The first 17 lines of the dataset are usage info
    all_songs_dict = dict()
    for i,l in list(enumerate(lines))[18:]:        #Adding a counter to the iterable words
        song_info = l.split(',')
        MSDID = song_info[0]                       
        song_bow = [x.split(':') for x in song_info[2:]]
        song_dict = {}
        for word, word_count in song_bow:
            song_dict[int(word)] = int(word_count.replace('\n',''))
        word_lists = [[words[word-1]]*song_dict[word] for word in song_dict.keys()]
        song = [word for word_list in word_lists for word in word_list]
        if REMOVE_STOPWORDS:
            song = [w for w in song if w not in remove]
        all_songs_dict[str(MSDID)] = ' '.join(song).replace('\n','')

#### Over Here, 
#### 'TRZZZYX128F92D32C6' is the Track ID 
#### ' 681124' is the Track ID from MusicXmatch Dataset
##### '1:4', where 1 is the Word Index and 4 is the Word count

In [15]:
l 

'TRZZZYX128F92D32C6,681124,1:4,2:18,4:3,5:6,6:9,7:1,8:5,9:1,10:12,11:2,12:9,13:5,15:4,17:2,18:4,19:4,21:2,22:3,25:3,29:2,30:3,32:5,34:3,35:2,37:2,39:3,43:2,44:7,45:2,49:2,50:2,52:8,54:2,56:1,59:1,63:10,64:1,66:1,71:2,72:1,78:1,79:1,84:2,91:1,92:2,95:1,96:1,103:1,110:1,115:1,127:1,134:4,135:2,136:3,137:1,138:2,140:1,146:2,150:1,152:1,192:1,206:1,207:1,222:3,239:1,248:2,258:1,270:1,274:1,283:1,294:2,300:1,305:1,318:1,337:1,338:1,346:1,347:1,349:1,371:1,398:1,406:1,445:1,451:1,459:1,478:1,487:1,492:1,502:3,516:1,539:1,548:2,549:1,553:1,592:2,617:1,633:1,666:1,681:1,727:1,774:1,775:7,789:1,811:1,844:1,915:1,942:1,950:1,979:1,1008:1,1040:1,1080:9,1142:1,1232:1,1366:1,1409:1,1412:1,1537:1,1545:1,1597:1,1841:2,1861:1,1901:1,1907:1,2063:1,2097:1,2167:1,2198:1,2221:1,2468:1,2498:1,2595:1,2698:1,2975:1,2990:1,2996:1,3256:1,3267:1,3316:2,3355:1,4198:1,4356:1,4738:1,4845:1\n'

In [16]:
song_info           #Information of the First Song

['TRZZZYX128F92D32C6',
 '681124',
 '1:4',
 '2:18',
 '4:3',
 '5:6',
 '6:9',
 '7:1',
 '8:5',
 '9:1',
 '10:12',
 '11:2',
 '12:9',
 '13:5',
 '15:4',
 '17:2',
 '18:4',
 '19:4',
 '21:2',
 '22:3',
 '25:3',
 '29:2',
 '30:3',
 '32:5',
 '34:3',
 '35:2',
 '37:2',
 '39:3',
 '43:2',
 '44:7',
 '45:2',
 '49:2',
 '50:2',
 '52:8',
 '54:2',
 '56:1',
 '59:1',
 '63:10',
 '64:1',
 '66:1',
 '71:2',
 '72:1',
 '78:1',
 '79:1',
 '84:2',
 '91:1',
 '92:2',
 '95:1',
 '96:1',
 '103:1',
 '110:1',
 '115:1',
 '127:1',
 '134:4',
 '135:2',
 '136:3',
 '137:1',
 '138:2',
 '140:1',
 '146:2',
 '150:1',
 '152:1',
 '192:1',
 '206:1',
 '207:1',
 '222:3',
 '239:1',
 '248:2',
 '258:1',
 '270:1',
 '274:1',
 '283:1',
 '294:2',
 '300:1',
 '305:1',
 '318:1',
 '337:1',
 '338:1',
 '346:1',
 '347:1',
 '349:1',
 '371:1',
 '398:1',
 '406:1',
 '445:1',
 '451:1',
 '459:1',
 '478:1',
 '487:1',
 '492:1',
 '502:3',
 '516:1',
 '539:1',
 '548:2',
 '549:1',
 '553:1',
 '592:2',
 '617:1',
 '633:1',
 '666:1',
 '681:1',
 '727:1',
 '774:1',
 '775:7'

In [17]:
MSDID             # Track ID 

'TRZZZYX128F92D32C6'

In [18]:
song_bow         # List of Word Index along with Word Count

[['1', '4'],
 ['2', '18'],
 ['4', '3'],
 ['5', '6'],
 ['6', '9'],
 ['7', '1'],
 ['8', '5'],
 ['9', '1'],
 ['10', '12'],
 ['11', '2'],
 ['12', '9'],
 ['13', '5'],
 ['15', '4'],
 ['17', '2'],
 ['18', '4'],
 ['19', '4'],
 ['21', '2'],
 ['22', '3'],
 ['25', '3'],
 ['29', '2'],
 ['30', '3'],
 ['32', '5'],
 ['34', '3'],
 ['35', '2'],
 ['37', '2'],
 ['39', '3'],
 ['43', '2'],
 ['44', '7'],
 ['45', '2'],
 ['49', '2'],
 ['50', '2'],
 ['52', '8'],
 ['54', '2'],
 ['56', '1'],
 ['59', '1'],
 ['63', '10'],
 ['64', '1'],
 ['66', '1'],
 ['71', '2'],
 ['72', '1'],
 ['78', '1'],
 ['79', '1'],
 ['84', '2'],
 ['91', '1'],
 ['92', '2'],
 ['95', '1'],
 ['96', '1'],
 ['103', '1'],
 ['110', '1'],
 ['115', '1'],
 ['127', '1'],
 ['134', '4'],
 ['135', '2'],
 ['136', '3'],
 ['137', '1'],
 ['138', '2'],
 ['140', '1'],
 ['146', '2'],
 ['150', '1'],
 ['152', '1'],
 ['192', '1'],
 ['206', '1'],
 ['207', '1'],
 ['222', '3'],
 ['239', '1'],
 ['248', '2'],
 ['258', '1'],
 ['270', '1'],
 ['274', '1'],
 ['283', '1'],
 [

In [25]:
song_msd_ids = list(all_songs_dict.keys())              #Track ID of All Songs
song_msd_ids

['TRAAAAV128F421A322',
 'TRAAABD128F429CF47',
 'TRAAAED128E0783FAB',
 'TRAAAEF128F4273421',
 'TRAAAEW128F42930C0',
 'TRAAAFD128F92F423A',
 'TRAAAGF12903CEC202',
 'TRAAAHJ128F931194C',
 'TRAAAHZ128E0799171',
 'TRAAAJG128F9308A25',
 'TRAAAOF128F429C156',
 'TRAAARJ128F9320760',
 'TRAAAUC128F428716F',
 'TRAAAZF12903CCCF6B',
 'TRAABEV12903CC53A4',
 'TRAABHB12903CAFC2F',
 'TRAABHC128F933A3F8',
 'TRAABIG128F9356C56',
 'TRAABJS128F9325C99',
 'TRAABJV128F1460C49',
 'TRAABLR128F423B7E3',
 'TRAABOA128F933684A',
 'TRAABOG128F42955B1',
 'TRAABPG128F14774DD',
 'TRAABVM128F92CA9DC',
 'TRAABXH128F42955D6',
 'TRAACER128F4290F96',
 'TRAACFV128F935E50B',
 'TRAACHN128F1489601',
 'TRAACIE128F428495B',
 'TRAACIR128F42963AC',
 'TRAACJC128F934ABB5',
 'TRAACPH12903CF5F14',
 'TRAACQW128F428854F',
 'TRAACRY12903CAF2C2',
 'TRAACUP128E0789C69',
 'TRAACZN128F93236B1',
 'TRAADAA128F92F7043',
 'TRAADBN128F932D00A',
 'TRAADCQ128F93436C3',
 'TRAADKA12903CD2511',
 'TRAADKW128E079503A',
 'TRAADKZ128F149BDFF',
 'TRAADLH12

In [26]:
all_song_meta_dict = dict()

In [27]:
with open('/Users/mehulmadaan/Downloads/ProjectDoc2vec/mxm_779k_matches.txt','r') as data:
    lines = data.readlines()
    for i in range(18, len(lines)):             #FROM LINE 18 TO END OF FILE
        line = lines[i].split('<SEP>')
        MSDID = line[0]
        artist = line[1]
        title = line[2]
        #Makes a Nested Dictionary for Track ID, which Contains artist and title name
        all_song_meta_dict[str(MSDID)] = {'artist': artist, 'title': title}

In [29]:
all_song_meta_dict         

{'TRMMMKD128F425225D': {'artist': 'Karkkiautomaatti', 'title': 'Tanssi vaan'},
 'TRMMMRX128F93187D9': {'artist': 'Hudson Mohawke',
  'title': 'No One Could Ever'},
 'TRMMMCH128F425532C': {'artist': 'Yerba Brava', 'title': 'Si Vos Querés'},
 'TRMMMXN128F42936A5': {'artist': 'David Montgomery',
  'title': 'Symphony No. 1 G minor "Sinfonie Serieuse"/Allegro con energia'},
 'TRMMMBB12903CB7D21': {'artist': 'Kris Kross', 'title': "2 Da Beat Ch'yall"},
 'TRMMMHY12903CB53F1': {'artist': 'Joseph Locke', 'title': 'Goodbye'},
 'TRMMMNS128F93548E1': {'artist': "3 Gars Su'l Sofa",
  'title': "L'antarctique"},
 'TRMMMXJ12903CBF111': {'artist': 'Jorge Negrete',
  'title': 'El hijo del pueblo'},
 'TRMMMBW128F4260CAE': {'artist': 'Tiger Lou', 'title': 'Pilots'},
 'TRMMMXI128F4285A3F': {'artist': 'Waldemar Bastos', 'title': 'N Gana'},
 'TRMMMKI128F931D80D': {'artist': 'Lena Philipsson', 'title': '006'},
 'TRMMMUT128F42646E8': {'artist': 'Shawn Colvin',
  'title': '(Looking For) The Heart Of Saturday'},

### Creating a DataFrame for the 2 Files and Merging Them

#### DataFrame 1

In [30]:
dframe = {
    'TRACKID': list(all_songs_dict.keys()),
    'Text': [all_songs_dict[x] for x in all_songs_dict.keys()]
    }
msdid_df = pd.DataFrame.from_dict(dframe)
print(msdid_df.shape)
msdid_df.head()

(210519, 2)


Unnamed: 0,TRACKID,Text
0,TRAAAAV128F421A322,like like de got would seem someon understand ...
1,TRAAABD128F429CF47,know know know know know time time time la la ...
2,TRAAAED128E0783FAB,love love love love love love love love love l...
3,TRAAAEF128F4273421,know got got got feel let would would would ey...
4,TRAAAEW128F42930C0,like take would wo someth stay burn burn burn ...


#### DataFrame 2

In [31]:
dframe2 = {
    'TRACKID': msdid_df['TRACKID'],
    'artist': [all_song_meta_dict[x]['artist'] for x in msdid_df['TRACKID']],
    'title': [all_song_meta_dict[x]['title'] for x in msdid_df['TRACKID']]
    }
meta_df = pd.DataFrame.from_dict(dframe2)
print(meta_df.shape)
meta_df.head()

(210519, 3)


Unnamed: 0,TRACKID,artist,title
0,TRAAAAV128F421A322,Western Addiction,A Poor Recipe For Civic Cohesion
1,TRAAABD128F429CF47,The Box Tops,Soul Deep
2,TRAAAED128E0783FAB,Jamie Cullum,It's About Time
3,TRAAAEF128F4273421,Adam Ant,Something Girls
4,TRAAAEW128F42930C0,Broken Spindles,Burn My Body (Album Version)


#### Merging

In [32]:
final_df = pd.merge(msdid_df, meta_df, on='TRACKID', how='left')

In [33]:
final_df.head()

Unnamed: 0,TRACKID,Text,artist,title
0,TRAAAAV128F421A322,like like de got would seem someon understand ...,Western Addiction,A Poor Recipe For Civic Cohesion
1,TRAAABD128F429CF47,know know know know know time time time la la ...,The Box Tops,Soul Deep
2,TRAAAED128E0783FAB,love love love love love love love love love l...,Jamie Cullum,It's About Time
3,TRAAAEF128F4273421,know got got got feel let would would would ey...,Adam Ant,Something Girls
4,TRAAAEW128F42930C0,like take would wo someth stay burn burn burn ...,Broken Spindles,Burn My Body (Album Version)


## Model Training and Generating Word Embeddings

In [34]:
# Get Training Data
data = final_df['Text'].tolist()

In [36]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [39]:
train_corpus = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in list(enumerate(data))]

In [40]:
#Building Model
model = Doc2Vec(vector_size=50, min_count=1, epochs=10, dm=0)
model.build_vocab(train_corpus)

In [41]:
#Training Model
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [42]:
model.save('trained_model_example')

In [43]:
model = Doc2Vec.load('trained_model_example')

In [44]:
emb_df = pd.DataFrame([model.docvecs[f'{i}'] for i in range(len(final_df))])
fe_df = final_df.copy()
for c in emb_df.columns:
    fe_df[c] = emb_df[c]

In [48]:
fe_df.info

<bound method DataFrame.info of                    TRACKID                                               Text  \
0       TRAAAAV128F421A322  like like de got would seem someon understand ...   
1       TRAAABD128F429CF47  know know know know know time time time la la ...   
2       TRAAAED128E0783FAB  love love love love love love love love love l...   
3       TRAAAEF128F4273421  know got got got feel let would would would ey...   
4       TRAAAEW128F42930C0  like take would wo someth stay burn burn burn ...   
...                    ...                                                ...   
210514  TRZZZWS128F429CF87  que que que que que que que de en te te te te ...   
210515  TRZZZXA128F428ED56  time la get eye think give dream wo wo head so...   
210516  TRZZZXV128F4289747  know like like like time get get get never bac...   
210517  TRZZZYV128F92E996D  get get get get get get get get get get get ge...   
210518  TRZZZYX128F92D32C6  know know time time time go go go go go go go ...

In [50]:
fe_df.to_csv('Doc2VecEmbeddings.csv', index=False)

# WORD2VEC

In [58]:
word_lists

[['i', 'i', 'i', 'i'],
 ['the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the',
  'the'],
 ['to', 'to', 'to'],
 ['and', 'and', 'and', 'and', 'and', 'and'],
 ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a'],
 ['me'],
 ['it', 'it', 'it', 'it', 'it'],
 ['not'],
 ['in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in'],
 ['my', 'my'],
 ['is', 'is', 'is', 'is', 'is', 'is', 'is', 'is', 'is'],
 ['of', 'of', 'of', 'of', 'of'],
 ['that', 'that', 'that', 'that'],
 ['on', 'on'],
 ['are', 'are', 'are', 'are'],
 ['we', 'we', 'we', 'we'],
 ['will', 'will'],
 ['all', 'all', 'all'],
 ['be', 'be', 'be'],
 ['know', 'know'],
 ['this', 'this', 'this'],
 ['with', 'with', 'with', 'with', 'with'],
 ['just', 'just', 'just'],
 ['when', 'when'],
 ['now', 'now'],
 ['time', 'time', 'time'],
 ['there', 'there'],
 ['go', 'go', 'go', 'go', 'go', 'go', 'go'],
 ['up', 'up'],
 ['they', 'they'],
 ['out', 'out'],


In [61]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [62]:
model2 = Word2Vec(word_lists, min_count=1,size= 50,workers=3, window =3, sg = 1)

In [78]:
model2['freeway']

  model2['freeway']


array([ 3.3840095e-03, -4.4054864e-03, -9.7944094e-03, -4.3173591e-03,
        1.5595024e-03,  7.4093193e-03,  7.3416973e-03,  6.5208215e-04,
        3.0503003e-03, -6.3301501e-04,  5.6095273e-06,  5.2574901e-03,
       -4.0683565e-03,  7.4931015e-03,  4.7563161e-03, -3.5820177e-03,
       -5.8123134e-03,  1.9687598e-03, -8.4946677e-03,  7.4363686e-03,
        4.9292510e-03,  2.4051864e-03, -5.3005372e-03, -8.9102164e-03,
        3.0138027e-03, -6.7384058e-04, -8.8066002e-03, -9.8001892e-03,
       -7.6274467e-03, -8.3928797e-03,  8.1424210e-03, -8.2012797e-03,
       -6.7775925e-03,  3.8658418e-03, -2.6700406e-03, -7.6875216e-03,
       -3.9550965e-03,  2.8430081e-03,  9.2689022e-03,  4.8152180e-03,
       -2.3634830e-03,  5.6392835e-03,  5.0677639e-04,  6.7160401e-04,
        1.9137472e-03, -9.8744798e-03, -8.1901914e-03, -2.5042188e-03,
        6.1770319e-03,  1.3296956e-03], dtype=float32)

In [79]:
model2['zoo']

  model2['zoo']


array([ 8.5522974e-04, -6.3166735e-03, -6.9513051e-03,  3.7799885e-03,
       -1.5991951e-03,  5.5573643e-03,  5.9341979e-03,  7.6739127e-03,
        9.4716800e-03, -8.3372677e-03, -3.6957236e-03,  6.0644546e-03,
        3.2513705e-03, -4.2479937e-03,  3.7034019e-03,  9.7203068e-03,
        6.1284322e-03,  8.8971499e-03,  6.3078855e-03,  2.0149725e-03,
        6.9063758e-03,  8.4457465e-04, -9.2334524e-03, -1.0857463e-03,
       -4.7662919e-03,  3.8335992e-03, -7.7826860e-03,  9.4485823e-03,
        3.5316560e-03,  5.7974518e-03, -7.2429758e-03, -2.4594979e-03,
        2.1971539e-03, -4.8305192e-03, -4.5652422e-03, -6.9431369e-03,
       -1.1417641e-03,  1.1282571e-04, -1.1685616e-05,  5.6417966e-03,
       -7.4185836e-03, -5.3891032e-03, -6.5380158e-03, -5.7706367e-03,
       -3.2658593e-04, -7.5059324e-03, -8.9857308e-03, -9.1335941e-03,
        5.7804799e-03,  5.5380967e-03], dtype=float32)