# Lyrics project

## Data acquisition

In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_json("file://localhost/gda/data/songs_lyrics.json").reset_index(drop=True)

df

URLError: <urlopen error [WinError 3] The system cannot find the path specified: '\\gda\\data\\songs_lyrics.json'>

## Making the dataframe balanced

In [None]:
df["artist"].value_counts()

In [None]:
# Getting all unique artist names
artists = df["artist"].unique()

In [None]:
# Getting minimum count of lyrics from all artists

df["artist"].value_counts().min()

In [None]:
# Creating balanced DataFrame
df_balanced = pd.DataFrame()

# Computing which artist does have lowest count of available lyrics
artist_min = df["artist"].value_counts().min()

# Getting the first <artist_min> rows from all artists into balanced DataFrame
df_balanced = df_balanced.append(
    [
        df.loc[df["artist"]==artist].iloc[0:artist_min]
        for artist in artists
        ]
    )

In [None]:
df_balanced["artist"].value_counts()

Queen            91
Eminem           91
Justin-Bieber    91
Deicide          91
Name: artist, dtype: int64

## Tokenization

In [1]:
import spacy
from spacy.lang.en import English

# if importing fails, uncomment and run lines below to install the necessary stuff

#!pip install -U spacy
#!python -m spacy download en_core_web_sm

# Create the nlp object
nlp = spacy.load("en_core_web_sm")

2021-09-30 06:49:51.227084: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-30 06:49:51.227160: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
# Getting info about currently loaded pipeline
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fb86ec57f50>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fb86ec67f50>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7fb86fb44fa0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fb86ec50550>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fb86ecce960>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fb86fb44de0>)]


In [None]:
# This is my attempt at creating my own component for the NLP pipeline

#@Language.component("cleaner")
#def cleaner(doc):
#
#    token_ for token in doc
#
#    return doc

In [None]:
lyrics = {
    "tokens"        : [],
    "tokens_clean"  : [],
    "tokens_lemmed" : []
}

In [None]:
type(lyrics["tokens_clean"])

list

In [None]:
[lyrics["tokens"].append([token for token in doc]) 
    for doc in nlp.pipe(df_balanced["song_text"])]
[lyrics["tokens_clean"].append([token.lower_
    for token in doc if (not(token.is_punct) and not(token.is_stop) and not(token.is_space))]
    )
    for doc in nlp.pipe(df_balanced["song_text"])]
           
        

print(lyrics["tokens"][0])
print(lyrics["tokens_clean"][0])



[Servants, of, death, ,, enchanter, of, pain, 
, From, the, land, of, no, return, ,, you, 'll, kill, again, 
, Smear, the, blood, on, naked, corpse, 
, Manson, ., 

, Lunatic, of, God, 's, creation, 
, No, resist, 
, Hear, the, voices, of, devastation, 

, There, is, darkness, in, his, eyes, 
, And, you, wo, n't, see, it, ,, before, you, die, 
, Feel, the, knife, of, the, Lord, Divine, 's, 
, Creation, ., 

, Lunatic, of, God, 's, creation, 
, No, resist, 
, Hear, the, voices, of, devastation]
['servants', 'of', 'death', ',', 'enchanter', 'of', 'pain', '\n', 'from', 'the', 'land', 'of', 'no', 'return', ',', 'you', "'ll", 'kill', 'again', '\n', 'smear', 'the', 'blood', 'on', 'naked', 'corpse', '\n', 'manson', '.', '\n\n', 'lunatic', 'of', 'god', "'s", 'creation', '\n', 'no', 'resist', '\n', 'hear', 'the', 'voices', 'of', 'devastation', '\n\n', 'there', 'is', 'darkness', 'in', 'his', 'eyes', '\n', 'and', 'you', 'wo', "n't", 'see', 'it', ',', 'before', 'you', 'die', '\n', 'feel', 'the', '

In [None]:
lyrics["tokens"][0]

[Servants,
 of,
 death,
 ,,
 enchanter,
 of,
 pain,
 ,
 From,
 the,
 land,
 of,
 no,
 return,
 ,,
 you,
 'll,
 kill,
 again,
 ,
 Smear,
 the,
 blood,
 on,
 naked,
 corpse,
 ,
 Manson,
 .,
 
 ,
 Lunatic,
 of,
 God,
 's,
 creation,
 ,
 No,
 resist,
 ,
 Hear,
 the,
 voices,
 of,
 devastation,
 
 ,
 There,
 is,
 darkness,
 in,
 his,
 eyes,
 ,
 And,
 you,
 wo,
 n't,
 see,
 it,
 ,,
 before,
 you,
 die,
 ,
 Feel,
 the,
 knife,
 of,
 the,
 Lord,
 Divine,
 's,
 ,
 Creation,
 .,
 
 ,
 Lunatic,
 of,
 God,
 's,
 creation,
 ,
 No,
 resist,
 ,
 Hear,
 the,
 voices,
 of,
 devastation]

In [None]:
for doc in nlp.pipe(df_balanced["song_text"]):
    print(doc)
    for token in doc:
        lyrics["tokens"].append(token)
        print(token)
        if not(token.is_punct) and not(token.is_stop) and not(token.is_space):
            lyrics["tokens_clean"].append(token.lower())
            lyrics["tokens_lemmed"].append(token.lemma_.lower())

Servants of death, enchanter of pain
From the land of no return, you'll kill again
Smear the blood on naked corpse
Manson.

Lunatic of God's creation
No resist
Hear the voices of devastation

There is darkness in his eyes
And you won't see it, before you die
Feel the knife of the Lord Divine's
Creation.

Lunatic of God's creation
No resist
Hear the voices of devastation
Servants


TypeError: 'int' object is not callable

In [None]:
list(lyrics["tokens"])

[Servants]

In [None]:
lyrics

{'tokens': [Servants], 'tokens_clean': [], 'tokens_lemmed': []}

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cfd8b0ae-bede-4fbf-8370-46f1acdfcc89' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>