In [2]:
import nltk
import pandas as pd
import heapq
import pprint

from nltk.tokenize import wordpunct_tokenize, blankline_tokenize, line_tokenize
from itertools import combinations
from nltk.corpus import stopwords
from time import time 
from gensim.models import Word2Vec, KeyedVectors
import multiprocessing

# # tensorflow
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Pytorch
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# stanza
import stanza as st

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# From https://www.depends-on-the-definition.com/guide-sequence-tagging-neural-networks-python/
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [3]:
data = pd.read_csv('lyrics_song_genres_15k.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artist         15000 non-null  object
 1   song_name      15000 non-null  object
 2   closest_genre  15000 non-null  object
 3   lyric          15000 non-null  object
 4   length_lyric   15000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 586.1+ KB


In [4]:
pos_tagger =  st.Pipeline(lang='en')

2021-02-25 12:20:04 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-25 12:20:04 INFO: Use device: gpu
2021-02-25 12:20:04 INFO: Loading: tokenize
2021-02-25 12:20:07 INFO: Loading: pos
2021-02-25 12:20:08 INFO: Loading: lemma
2021-02-25 12:20:08 INFO: Loading: depparse
2021-02-25 12:20:08 INFO: Loading: sentiment
2021-02-25 12:20:09 INFO: Loading: ner
2021-02-25 12:20:10 INFO: Done loading processors!


In [5]:
data.lyric[890]

"I'm sorry for the things I've done\nI know that I'm the foolish one\nNow that I see who's to blame\nI'm so ashamed, I'm sorry\n\nI'm sorry for the things I've said\nJust like a child I lost my head\nI should have known from the start\nI'd break your heart, I'm sorry\n\nPlease be kind and I know you'll find\nIts so easy to forgive\nDarling, wait for its not too late\nGive our love a chance to live\n\nI know the heartaches you've been through\nI know for I've had heartaches, too\nThere's nothing more I can do\nBut say to you I'm sorry\n\nDarling, I'm truly sorry"

In [6]:
line_tokenize(data.lyric[0])

["it's a junkie dream makes you so uptight",
 "yeah it's halloween tonight and every night",
 'see you scratch (see it on) your skin',
 'your sandpaper throat',
 "you're a symphony, man, with one fucking note",
 'how they beat you up week after week',
 "and when you grow up you're going to be a freak",
 "want a violent girl who's not scared of anything",
 'help me kill my time',
 "'cause I'll never be fine",
 'help me kill my time',
 'you went down to look at old dallas town',
 'where you must be sick just to hang around',
 'seen it on tv how to kill your man',
 "then like gacy's scene a canvas in your hand",
 "you better call your mom she's out looking for you",
 'in the jail and the army and the hospital too',
 "but those people there couldn't do anything for you",
 'help me kill my time',
 "'cause I'll never be fine",
 'help me kill my time',
 'help me kill my time',
 'help me kill my time',
 "'cause I'll never be fine",
 'help me kill my time']

In [7]:
len(line_tokenize(data.lyric[0]))

25

In [8]:
lines = '. '.join(line_tokenize(data.lyric[0]))

In [9]:
lines = '. '.join(line_tokenize(data.lyric[0]))
doc = pos_tagger(lines)
print(*[f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {word.feats if word.feats else "_"}' for sent in doc.sentences for word in sent.words], sep='\n')

word: it	upos: PRON	xpos: PRP	feats: Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
word: 's	upos: AUX	xpos: VBZ	feats: Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
word: a	upos: DET	xpos: DT	feats: Definite=Ind|PronType=Art
word: junkie	upos: NOUN	xpos: NN	feats: Number=Sing
word: dream	upos: NOUN	xpos: NN	feats: Number=Sing
word: makes	upos: VERB	xpos: VBZ	feats: Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
word: you	upos: PRON	xpos: PRP	feats: Case=Acc|Person=2|PronType=Prs
word: so	upos: ADV	xpos: RB	feats: _
word: uptight	upos: ADJ	xpos: JJ	feats: Degree=Pos
word: .	upos: PUNCT	xpos: .	feats: _
word: yeah	upos: INTJ	xpos: UH	feats: _
word: it	upos: PRON	xpos: PRP	feats: Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
word: 's	upos: AUX	xpos: VBZ	feats: Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
word: halloween	upos: NOUN	xpos: NN	feats: Number=Sing
word: tonight	upos: NOUN	xpos: NN	feats: Number=Sing
word: and	upos: CCONJ	xpos: CC	feats:

In [10]:
len(doc.sentences)

25

In [11]:
# Get the max length of the sentence

for lyrics in data.lyric:
    print(lyrics)
    break

it's a junkie dream makes you so uptight
yeah it's halloween tonight and every night
see you scratch (see it on) your skin
your sandpaper throat
you're a symphony, man, with one fucking note
how they beat you up week after week
and when you grow up you're going to be a freak
want a violent girl who's not scared of anything
help me kill my time
'cause I'll never be fine
help me kill my time
you went down to look at old dallas town
where you must be sick just to hang around
seen it on tv how to kill your man
then like gacy's scene a canvas in your hand
you better call your mom she's out looking for you
in the jail and the army and the hospital too
but those people there couldn't do anything for you
help me kill my time
'cause I'll never be fine
help me kill my time
help me kill my time
help me kill my time
'cause I'll never be fine
help me kill my time


In [12]:
doc.sentences[0].print_dependencies()

('it', 6, 'nsubj')
("'s", 5, 'cop')
('a', 5, 'det')
('junkie', 5, 'compound')
('dream', 6, 'nsubj')
('makes', 0, 'root')
('you', 6, 'obj')
('so', 9, 'advmod')
('uptight', 6, 'xcomp')
('.', 6, 'punct')


In [13]:
len(temp.sentences)

NameError: name 'temp' is not defined

In [None]:
temp_ln = line_tokenize(data.lyric[0])
index = max(range(len(temp_ln)), key=lambda i: len(temp_ln[i]))

In [3]:
doc = line_tokenize(data.lyric[0])
index = max(range(len(doc)), key=lambda i: len(word_tokenize(doc[i])))
curr_max_length = len(word_tokenize(doc[index]))

NameError: name 'data' is not defined

In [None]:
len(temp_ln[index])

In [None]:
index

In [None]:
temp = pos_tagger(line_tokenize(data.lyric[0])[0])

In [40]:
[print(k) for k in dir(temp.sentences[0]) if not(k.startswith('__'))]

_dependencies
_doc
_ents
_process_tokens
_sentiment
_text
_tokens
_words
add_property
build_dependencies
build_ents
dependencies
dependencies_string
doc
entities
ents
print_dependencies
print_tokens
print_words
sentiment
text
to_dict
tokens
tokens_string
words
words_string


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [52]:
help(temp.sentences[0].print_dependencies)

Help on method print_dependencies in module stanza.models.common.doc:

print_dependencies(file=None) method of stanza.models.common.doc.Sentence instance
    Print the dependencies for this sentence.



In [65]:
temp.sentences[0].words_string()

"<Word id=1;text=it;lemma=it;upos=PRON;xpos=PRP;feats=Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs;head=5;deprel=nsubj>\n<Word id=2;text='s;lemma=be;upos=AUX;xpos=VBZ;feats=Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin;head=5;deprel=cop>\n<Word id=3;text=a;lemma=a;upos=DET;xpos=DT;feats=Definite=Ind|PronType=Art;head=5;deprel=det>\n<Word id=4;text=junkie;lemma=junkie;upos=NOUN;xpos=NN;feats=Number=Sing;head=5;deprel=compound>\n<Word id=5;text=dream;lemma=dream;upos=NOUN;xpos=NN;feats=Number=Sing;head=0;deprel=root>\n<Word id=6;text=makes;lemma=make;upos=VERB;xpos=VBZ;feats=Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin;head=5;deprel=parataxis>\n<Word id=7;text=you;lemma=you;upos=PRON;xpos=PRP;feats=Case=Acc|Person=2|PronType=Prs;head=6;deprel=obj>\n<Word id=8;text=so;lemma=so;upos=ADV;xpos=RB;head=9;deprel=advmod>\n<Word id=9;text=uptight;lemma=uptight;upos=ADJ;xpos=JJ;feats=Degree=Pos;head=6;deprel=xcomp>"

In [137]:
song15kmodel = Word2Vec.load('fifteenklyricswv_w5ns5_notcleaned_nopad.model')
song15kmodel.wv.similar_by_word('man')

[('holidays', 0.23732487857341766),
 ('fabrics', 0.21847498416900635),
 ('exhibition', 0.21591143310070038),
 ('addict', 0.21228137612342834),
 ('treats', 0.20989646017551422),
 ('uhuh', 0.20820219814777374),
 ('eleven', 0.20598238706588745),
 ('mango', 0.20177169144153595),
 ('impatient', 0.20071011781692505),
 ('awayoh', 0.20048710703849792)]

In [140]:
song15kmodel = Word2Vec.load('fifteenklyricswv_w5ns5_notcleaned_nopad.model')
song15kmodel.wv.similar_by_word('bird', topn=20)

[('loading', 0.2527003288269043),
 ('peel', 0.23208048939704895),
 ('knows', 0.21555384993553162),
 ('suppers', 0.20017489790916443),
 ('tiger', 0.19861142337322235),
 ('hospital', 0.1912660002708435),
 ('legal', 0.18773692846298218),
 ('keyhole', 0.18468114733695984),
 ('latest', 0.183905690908432),
 ('vuelve', 0.18309897184371948),
 ('report', 0.18307459354400635),
 ('pouder', 0.17987167835235596),
 ('airwaves', 0.17927488684654236),
 ('ooooooooooo', 0.17830637097358704),
 ('bonny', 0.17614570260047913),
 ('fki', 0.17611578106880188),
 ('while', 0.17574569582939148),
 ('snitchin', 0.17409491539001465),
 ('martyr', 0.1732156127691269),
 ('protein', 0.17196537554264069)]

In [143]:
# Pretrained word2vec
import gensim.downloader as api
corpus = api.load('fasttext-wiki-news-subwords-300', return_path=True)
pretrainedwvmodel = KeyedVectors.load_word2vec_format(corpus)

In [4]:
pretrainedwvmodel.wv.similar_by_word("its", topn=20)

NameError: name 'pretrainedwvmodel' is not defined

In [141]:
song15kmodel = Word2Vec.load('fifteenklyricswv_w5ns5_notcleaned_nopad.model')
song15kmodel.wv.similar_by_word('bird', topn=20)

[('rid', 0.21196430921554565),
 ('godiva', 0.2083100825548172),
 ('bikinis', 0.19742251932621002),
 ('presidential', 0.1966007500886917),
 ('planting', 0.19571252167224884),
 ('soho', 0.19409094750881195),
 ('aloud', 0.18977567553520203),
 ('fool', 0.1885804980993271),
 ('rudolph', 0.18847893178462982),
 ('slums', 0.1880977302789688),
 ('special', 0.18677836656570435),
 ('niece', 0.18477065861225128),
 ('mouths', 0.18386365473270416),
 ('refrain', 0.1827993392944336),
 ('cookies', 0.18100234866142273),
 ('cracker', 0.17837102711200714),
 ('licking', 0.17781764268875122),
 ('intuition', 0.17774252593517303),
 ('hallways', 0.1756049394607544),
 ('longhair', 0.17420703172683716)]

In [142]:
song15kmodel = Word2Vec.load('fifteenklyricswv_w5ns5_notcleaned_nopad_2.model')
song15kmodel.wv.similar_by_word('bird', topn=20)

[('chino', 0.19984319806098938),
 ('monster', 0.19773530960083008),
 ('physical', 0.19449383020401),
 ('rival', 0.19220104813575745),
 ('baer', 0.18757012486457825),
 ('simultaneous', 0.18317291140556335),
 ('cielo', 0.179951012134552),
 ('scattered', 0.1793053299188614),
 ('till', 0.1791701316833496),
 ('polaroid', 0.1772567331790924),
 ('domino', 0.17568251490592957),
 ('crafty', 0.17424935102462769),
 ('scandalous', 0.17367053031921387),
 ('chat', 0.17321452498435974),
 ('rockin', 0.1712748408317566),
 ('malt', 0.1703069508075714),
 ('sussed', 0.17029058933258057),
 ('monday', 0.16919627785682678),
 ('legend', 0.1690933257341385),
 ('walker', 0.16702818870544434)]

In [115]:
song15kmodel.wv.similar_by_word('paddingkosong')

[('wrists', 0.2525196075439453),
 ('theyre', 0.196501687169075),
 ('blackburn', 0.19630208611488342),
 ('their', 0.19494585692882538),
 ('helter', 0.19020254909992218),
 ('bogle', 0.1898348033428192),
 ('coloured', 0.18793581426143646),
 ('revealing', 0.1860581636428833),
 ('scaring', 0.18553043901920319),
 ('coffee', 0.18407416343688965)]

In [124]:
song15kmodel.wv.index2word[0]

'i'

In [153]:
np.shape(pretrainedwvmodel.wv.vectors)
pretrained_embedding_matrix = pretrainedwvmodel.wv.vectors
pretrained_embedding_matrix.append(np.zeros((300,))) # Padding

In [155]:
np.shape(pretrained_embedding_matrix[0])

(300,)

In [157]:
np.zeros((300,))

(300,)

In [126]:
embedding_matrix = torch.FloatTensor(song15kmodel.wv.vectors)
embedding_matrix.shape

torch.Size([17678, 300])

In [128]:
type(song15kmodel.wv.vectors)

numpy.ndarray

In [130]:
len(song15kmodel.wv.vectors[0])

300

In [127]:
embedding_matrix

tensor([[ 5.6029e-04,  6.7484e-04, -1.1981e-03,  ...,  1.3301e-03,
          3.6637e-04,  4.2053e-04],
        [ 1.2134e-03, -5.0414e-04, -1.1669e-03,  ..., -1.2215e-03,
          1.0278e-03,  1.1553e-03],
        [-4.1564e-04,  1.5681e-03, -3.5766e-05,  ..., -1.4893e-03,
         -4.5380e-04, -3.9659e-05],
        ...,
        [-3.4245e-04,  1.0132e-03, -8.8864e-04,  ..., -7.8351e-04,
          1.0947e-03,  1.2806e-03],
        [-8.4348e-04, -2.5223e-04,  3.0563e-04,  ...,  6.7783e-04,
          1.5668e-03, -7.5886e-04],
        [ 7.4834e-04,  8.6166e-04, -1.4375e-03,  ...,  8.2994e-04,
          2.1054e-04,  3.0171e-04]])

In [113]:
def sentence_encoder(text, vocab2index):
    tokens = text.lower().split()
    temp = np.full(len(vocab2index), len(vocab2index)+1)
    sentencoded = np.array([vocab2index.get(word, len(vocab2index)+1) for word in tokens])
    length = len(sentencoded)
    temp[:length] = sentencoded
    return temp

In [25]:
k = []
l = [1,2,3,4]
k + l

[1, 2, 3, 4]

In [28]:
dir(doc.sentences[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_dependencies',
 '_doc',
 '_ents',
 '_process_tokens',
 '_sentiment',
 '_text',
 '_tokens',
 '_words',
 'add_property',
 'build_dependencies',
 'build_ents',
 'dependencies',
 'dependencies_string',
 'doc',
 'entities',
 'ents',
 'print_dependencies',
 'print_tokens',
 'print_words',
 'sentiment',
 'text',
 'to_dict',
 'tokens',
 'tokens_string',
 'words',
 'words_string']

In [37]:
for i, k in data.iterrows():
    print(i)
    break

0


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artist         15000 non-null  object
 1   song_name      15000 non-null  object
 2   closest_genre  15000 non-null  object
 3   lyric          15000 non-null  object
 4   length_lyric   15000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 586.1+ KB


In [42]:
lyrics_pos_tagger = {
    'artist':[],
    'song_name':[],
    'song_id':[],
    'word':[],
    'XPOS':[],
    'UPOS':[],
}

max_length_sent = 0
for song_id, song_info in data.iterrows():
    lines = '. '.join(line_tokenize(song_info.lyric))
    doc = pos_tagger(lines)
    for sent in doc.sentences:
        curr_max_length = 0
        for word in sent.words:
            curr_max_length += 1
            lyrics_pos_tagger['artist'].append(song_info.artist)
            lyrics_pos_tagger['song_name'].append(song_info.song_name)
            lyrics_pos_tagger['song_id'].append(song_id)
            lyrics_pos_tagger['word'].append(word.text)
            lyrics_pos_tagger['XPOS'].append(word.xpos)
            lyrics_pos_tagger['UPOS'].append(word.upos)
        max_length_sent = max(max_length_sent, curr_max_length)


KeyboardInterrupt: 