In [6]:
import re
import pickle
import string
from urllib.parse import unquote, quote_plus
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook-wide matplotlib styling, applied in one call: white figure
# background, grid enabled, and enlarged label/tick/title/legend fonts.
plt.rcParams.update({
    'figure.facecolor': 'w',
    'axes.labelsize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'figure.titlesize': 20,
    'axes.titlesize': 20,
    'axes.grid': True,
    'legend.fontsize': 14,
})
%matplotlib inline

import nltk
from nltk.chunk import ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import words as nltk_words
from nltk.tag import StanfordNERTagger

from scipy.stats import skew as spskew
from scipy.stats import kurtosis as spkurt
from sklearn.preprocessing import minmax_scale, scale


In [2]:
# Load the source table read-only from the HDF5 store. Later cells read its
# `band`, `lyrics`, `genres` and `reviews` columns.
df = pd.read_hdf(
    'song-lyrics-genres-reviews.h5',
    key='df',
    mode='r',
)

# Vocabulary

In [3]:
# Reference vocabulary: the entries of the NLTK `words` corpus, held in a set
# so that the per-token membership checks in later cells are fast.
english_words = {*nltk_words.words()}

In [4]:
# Build one row per band: pool the lyrics of all of its songs, strip
# punctuation, keep only tokens found in `english_words`, then derive
# vocabulary statistics and keep the larger / more-reviewed bands.

# Bands must have MORE than this many recognised words to be kept.
MIN_BAND_WORDS = 1000

# Matches every run of characters that is not a word character, digit,
# apostrophe or whitespace, i.e. punctuation other than apostrophes.
punctuation_pattern = re.compile(r"[^\w\d'\s]+")

bands = sorted(set(df.band))

# Collect plain records and build the frame once, instead of writing rows one
# at a time into a pre-allocated all-object frame. groupby yields the bands in
# sorted order, so the resulting 0..n-1 index matches positions in `bands`.
band_records = []
for band, df_band in df.groupby('band', sort=True):
    # Band-level values are read from the band's first row
    # (assumes they are repeated on every song row -- TODO confirm).
    genres = df_band.genres.iloc[0]
    reviews = df_band.reviews.values[0]

    # Each song is a sequence of lyric lines; flatten them into one list.
    lyrics_raw = [line for song in df_band.lyrics.values for line in song]

    lyrics_processed = ' '.join(
        punctuation_pattern.sub('', word)
        for line in lyrics_raw
        for word in line.split()
    )

    # Membership is tested on the lower-cased token, but the token itself is
    # stored with its original casing.
    words = [word for word in lyrics_processed.split() if word.lower() in english_words]

    band_records.append(dict(
        band=band,
        lyrics_raw=lyrics_raw,
        lyrics=lyrics_processed,
        words=words,
        genres=genres,
        reviews=reviews,
    ))

df_bands = pd.DataFrame(
    band_records,
    columns=['band', 'lyrics_raw', 'lyrics', 'words', 'genres', 'reviews'],
)

# .copy() makes the filtered frame independent of its parent, so the column
# assignments below do not trigger SettingWithCopyWarning.
df_bands = df_bands[df_bands.words.apply(len) > MIN_BAND_WORDS].copy()
df_bands['word_count'] = df_bands.words.apply(len)
# NOTE(review): uniqueness is case-sensitive ("The" and "the" count as two
# distinct words) -- confirm this is intended.
df_bands['unique_count'] = df_bands.words.apply(lambda x: len(set(x)))
df_bands['unique_pct'] = df_bands.unique_count / df_bands.word_count

# Keep only bands strictly above the median in both word count and reviews.
df_bands = df_bands[
    (df_bands.word_count > df_bands.word_count.median()) &
    (df_bands.reviews > df_bands.reviews.median())
].copy()

In [173]:
# df_bands.words[df_bands.band == 'helloween'].values

In [6]:
# Bands ordered from the largest to the smallest `word_count`.
df_bands.sort_values('word_count').iloc[::-1]

Unnamed: 0,band,lyrics_raw,lyrics,words,genres,reviews,word_count,unique_count,unique_pct
312,helloween,"[You're hanging around and got nuthin' to do, ...",You're hanging around and got nuthin' to do Yo...,"[hanging, around, and, got, to, do, You, get, ...","[power, speed]",189,47020,3306,0.070311
531,rage,[Oh no! Somebody said they're marching out ton...,Oh no Somebody said they're marching out tonig...,"[Oh, no, Somebody, said, out, tonight, high, t...","[heavy, power, speed]",86,43490,3104,0.071373
496,overkill,"[Riding the wind on a stormy night, Rides a mo...",Riding the wind on a stormy night Rides a moth...,"[Riding, the, wind, on, a, stormy, night, a, s...","[groove, thrash]",199,40474,3595,0.088822
564,saxon,"[Far below the valleys, Hidden deep beneath th...",Far below the valleys Hidden deep beneath the ...,"[Far, below, the, Hidden, deep, beneath, the, ...","[heavy, nwobhm]",116,39693,2881,0.072582
48,anthrax,"[Riding hard, high in the saddle, Winged steed...",Riding hard high in the saddle Winged steed of...,"[Riding, hard, high, in, the, saddle, Winged, ...","[groove, speed, thrash]",142,39350,3412,0.086709
...,...,...,...,...,...,...,...,...,...
231,enforcer,"[Looking out into the silent night, And see th...",Looking out into the silent night And see the ...,"[Looking, out, into, the, silent, night, And, ...","[heavy, speed]",35,6599,1316,0.199424
443,mortiis,"[Han var født før egen tid,, Men han vandret a...",Han var født før egen tid Men han vandret alik...,"[Han, tid, Men, han, land, han, sin, i, for, g...","[ambient, darkwave, industrial, rock]",21,6539,1136,0.173727
306,havok,"[I feel so trapped in this building of hell, W...",I feel so trapped in this building of hell Why...,"[I, feel, so, trapped, in, this, building, of,...",[thrash],32,6523,1438,0.220451
415,melechesh,[All will sigh when they realize we dwell in t...,All will sigh when they realize we dwell in th...,"[All, will, sigh, when, they, realize, we, dwe...","[black, death, eastern, folk, middle]",36,6521,1578,0.241987


In [7]:
# Bands ordered from the largest to the smallest `unique_count`.
df_bands.sort_values('unique_count').iloc[::-1]

Unnamed: 0,band,lyrics_raw,lyrics,words,genres,reviews,word_count,unique_count,unique_pct
583,skyclad,[O come ye young of Hamlyn--you who know my tu...,O come ye young of Hamlynyou who know my tune ...,"[O, come, ye, young, of, who, know, my, tune, ...",[folk],61,33743,4583,0.135821
590,sodom,"[It's time to die, Death stands behind the doo...",It's time to die Death stands behind the door ...,"[time, to, die, Death, behind, the, door, Sata...","[black, speed, thrash]",164,27507,4431,0.161086
615,tankard,"[I'm just hanging around tonight, just watchin...",I'm just hanging around tonight just watching ...,"[just, hanging, around, tonight, just, watchin...",[thrash],91,32538,3960,0.121704
414,megadeth,"[Your bodies empty now, As I hold you, Now you...",Your bodies empty now As I hold you Now your g...,"[Your, empty, now, As, I, hold, you, Now, your...","[heavy, rock, speed, thrash]",339,31771,3858,0.121431
252,exhumed,"[Gurgling oblique aponeurosis is split, As I i...",Gurgling oblique aponeurosis is split As I inf...,"[Gurgling, oblique, aponeurosis, is, split, As...","[death, grindcore]",33,17138,3688,0.215194
...,...,...,...,...,...,...,...,...,...
486,obituary,"[Rot Alone. Destiny,, killing the souls of liv...",Rot Alone Destiny killing the souls of lives a...,"[Rot, Alone, Destiny, killing, the, of, at, yo...",[death],105,7582,1231,0.162358
598,stormwarrior,"[Thunder roars, lightning strikes, A clash of ...",Thunder roars lightning strikes A clash of ste...,"[Thunder, lightning, A, clash, of, the, collid...","[power, speed]",20,9410,1175,0.124867
122,carnifex,"[I'll bury this for just one more day, Hide it...",I'll bury this for just one more day Hide it d...,"[bury, this, for, just, one, more, day, Hide, ...",[deathcore],36,9591,1161,0.121051
443,mortiis,"[Han var født før egen tid,, Men han vandret a...",Han var født før egen tid Men han vandret alik...,"[Han, tid, Men, han, land, han, sin, i, for, g...","[ambient, darkwave, industrial, rock]",21,6539,1136,0.173727


In [8]:
# Bands ordered from the largest to the smallest `unique_pct`.
df_bands.sort_values('unique_pct').iloc[::-1]

Unnamed: 0,band,lyrics_raw,lyrics,words,genres,reviews,word_count,unique_count,unique_pct
545,revocation,"[Righteousness reaching out cunning hands,, Mo...",Righteousness reaching out cunning hands Moldi...,"[Righteousness, reaching, out, cunning, Moldin...","[death, technical, thrash]",26,7802,2480,0.317867
419,meshuggah,"[Hear the mourners, The earth is gasping for a...",Hear the mourners The earth is gasping for air...,"[Hear, the, The, earth, is, gasping, for, air,...","[djent, groove, technical, thrash]",121,11265,3382,0.300222
337,incantation,"[How feeble thy man hast come forth unto us., ...",How feeble thy man hast come forth unto us To ...,"[How, feeble, thy, man, come, forth, unto, us,...",[death],45,6660,1945,0.292042
61,asphyx,"[Hordes of disgust, Hungrs, hasty, roaming at ...",Hordes of disgust Hungrs hasty roaming at nigh...,"[of, disgust, hasty, roaming, at, night, Red, ...","[death, doom]",55,8911,2578,0.289305
152,cryptopsy,"[Oh what a gal!, She seems such a perfect vict...",Oh what a gal She seems such a perfect victim ...,"[Oh, what, a, gal, She, such, a, perfect, vict...","[brutal, death, deathcore, technical]",123,10384,2915,0.280720
...,...,...,...,...,...,...,...,...,...
564,saxon,"[Far below the valleys, Hidden deep beneath th...",Far below the valleys Hidden deep beneath the ...,"[Far, below, the, Hidden, deep, beneath, the, ...","[heavy, nwobhm]",116,39693,2881,0.072582
531,rage,[Oh no! Somebody said they're marching out ton...,Oh no Somebody said they're marching out tonig...,"[Oh, no, Somebody, said, out, tonight, high, t...","[heavy, power, speed]",86,43490,3104,0.071373
312,helloween,"[You're hanging around and got nuthin' to do, ...",You're hanging around and got nuthin' to do Yo...,"[hanging, around, and, got, to, do, You, get, ...","[power, speed]",189,47020,3306,0.070311
278,galneryus,"[In the future,, the human race is dropped in ...",In the future the human race is dropped in the...,"[In, the, future, the, human, race, is, in, th...","[neoclassical, power]",28,28030,1885,0.067249


In [5]:
# Build one row per sufficiently common genre, pooling the recognised words of
# every band whose genre list contains that genre.

# A genre is kept only if it occurs MORE than this many times across all
# bands' genre lists.
MIN_GENRE_OCCURRENCES = 10

# Count genre occurrences in a single pass. The previous version re-ran
# `df_bands.genres.sum()` (repeated list concatenation) for every candidate.
genre_counts = pd.Series(
    [genre for band_genres in df_bands.genres for genre in band_genres],
    dtype=object,
).value_counts()
genres = sorted(
    genre for genre, count in genre_counts.items() if count > MIN_GENRE_OCCURRENCES
)

# Collect records and build the frame once; rows keep the 0..n-1 index in
# alphabetical genre order, as before.
genre_records = []
for genre in genres:
    # Words are concatenated in `df_bands` row order. A band with several
    # genres contributes its words to each of them.
    words = [
        word
        for band_genres, band_words in zip(df_bands.genres, df_bands.words)
        if genre in band_genres
        for word in band_words
    ]
    genre_records.append(dict(genre=genre, words=words))

df_genres = pd.DataFrame(genre_records, columns=['genre', 'words'])
df_genres['word_count'] = df_genres.words.apply(len)
df_genres['unique_count'] = df_genres.words.apply(lambda x: len(set(x)))
df_genres['unique_pct'] = df_genres.unique_count / df_genres.word_count

In [10]:
# Genres ordered from the largest to the smallest `unique_pct`.
df_genres.sort_values('unique_pct').iloc[::-1]

Unnamed: 0,genre,words,word_count,unique_count,unique_pct
12,technical,"[Anger, sown, once, in, not, buried, past, Hat...",144378,12215,0.084604
3,folk,"[Many, ago, in, a, faraway, land, We, met, an,...",202455,12087,0.059702
2,doom,"[The, sound, of, thousand, The, of, thousand, ...",160734,9065,0.056398
4,groove,"[it, frightening, you, it, wonderful, Living, ...",277629,12671,0.04564
11,symphonic,"[I, foresee, of, chaos, But, my, ordeal, yet, ...",240765,10653,0.044246
6,melodic,"[The, sound, of, thousand, The, of, thousand, ...",399016,14616,0.03663
8,progressive,"[I, foresee, of, chaos, But, my, ordeal, yet, ...",414412,15135,0.036522
0,black,"[The, holy, trinity, is, rotten, All, the, are...",521338,18757,0.035979
10,speed,"[it, frightening, you, it, wonderful, Living, ...",404766,14036,0.034677
9,rock,"[The, sound, of, thousand, The, of, thousand, ...",399195,12531,0.031391


# Named entity recognition

In [7]:
# For every genre, run NLTK's named-entity chunker over the first
# MAX_GENRE_WORDS pooled words and print the most frequent entities.

MAX_GENRE_WORDS = 100000  # only this many leading words per genre are analysed
TOP_ENTITIES = 5          # number of (entity, count) pairs printed per genre

for genre in genres:
    # Look the genre's word list up once and take its leading slice. This is
    # equivalent to concatenating consecutive 100-word chunks up to the cap.
    genre_words = df_genres.words[df_genres.genre == genre].iloc[0][:MAX_GENRE_WORDS]

    genre_tokens = word_tokenize(' '.join(genre_words))
    genre_pos_tags = nltk.pos_tag(genre_tokens)
    ne_tree = ne_chunk(genre_pos_tags)

    # Children exposing `.leaves()` are the chunked subtrees; only the first
    # token of each one is kept, so multi-word entities are truncated.
    entities = [x.leaves()[0][0] for x in ne_tree if hasattr(x, 'leaves')]

    # Single-pass tally, replacing a `list.count` call per distinct entity.
    tally = {}
    for entity in entities:
        tally[entity] = tally.get(entity, 0) + 1

    # Most frequent first; ties are broken in reverse alphabetical order,
    # matching the ordering this cell has always printed.
    entity_counts = sorted(tally.items(), key=lambda item: (item[1], item[0]), reverse=True)

    print(genre)
    print(entity_counts[:TOP_ENTITIES])

black
[('Thy', 49), ('Satan', 48), ('Earth', 34), ('Hell', 33), ('Blood', 30)]
death
[('Into', 38), ('Death', 24), ('Blood', 24), ('Hell', 17), ('Sun', 16)]
doom
[('Black', 60), ('Into', 40), ('Blood', 32), ('Come', 29), ('Death', 25)]
folk
[('Och', 38), ('Far', 38), ('De', 29), ('No', 24), ('Hey', 24)]
groove
[('Alison', 33), ('Law', 28), ('Hell', 27), ('Fight', 27), ('Death', 27)]
heavy
[('Metal', 53), ('Generation', 36), ('Blood', 36), ('Hell', 34), ('Hard', 28)]
melodic
[('Into', 32), ('Come', 26), ('Blood', 19), ('Will', 18), ('Trapped', 18)]
power
[('Metal', 40), ('No', 37), ('Come', 35), ('Blood', 35), ('Will', 31)]
progressive
[('Into', 40), ('Will', 33), ('Come', 26), ('THE', 25), ('Carry', 17)]
rock
[('Into', 30), ('Baby', 27), ('THE', 25), ('Love', 24), ('Blood', 23)]
speed
[('Metal', 51), ('Hell', 43), ('Black', 33), ('Alison', 33), ('Death', 32)]
symphonic
[('Will', 38), ('Dark', 36), ('THE', 29), ('Darkness', 26), ('Tell', 21)]
technical
[('Alison', 33), ('THE', 30), ('Wh

In [34]:
# Short lyric excerpt used as a small, hand-picked input for the NLTK
# tokenizer and part-of-speech tagger.
sentence = """
Fear can be seen on their tearful faces
Followers of God will meet their doom
Our only justice is the sword
Our only sentence is death
Vultures follow us to finish the work
Terror is spread to begin the Armageddon...
"""
# Split the excerpt into tokens, then pair every token with its POS tag.
sentence_tokens = word_tokenize(sentence)
sentence_pos_tags = nltk.pos_tag(sentence_tokens)