In [1]:
import re
import pickle
from urllib.parse import unquote, quote_plus
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: white figure background, larger
# label/tick/title/legend fonts, and grid lines enabled on every axes.
plt.rcParams.update({
    'figure.facecolor': 'w',
    'axes.labelsize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'figure.titlesize': 20,
    'axes.titlesize': 20,
    'axes.grid': True,
    'legend.fontsize': 14,
})
%matplotlib inline

from scipy.stats import skew as spskew
from scipy.stats import kurtosis as spkurt
from sklearn.preprocessing import minmax_scale, scale

In [2]:
# Per-song table, opened read-only. The cells below read its `band`,
# `lyrics` and `genres` columns.
LYRICS_H5 = 'song-lyrics-genres.h5'
df = pd.read_hdf(LYRICS_H5, key='df', mode='r')

In [92]:
# Collapse the per-song table into one row per band: every song's lyrics are
# whitespace-split and concatenated into a single word list, and the band's
# genres are taken from its first song row.
MIN_WORDS = 1000  # a band must have MORE than this many words to be kept

band_records = []
# One groupby pass replaces re-filtering the whole song table once per band.
# sort=True yields the bands in sorted order, so the resulting row labels are
# 0..n-1 in sorted band order.
for band, df_band in df.groupby('band', sort=True):
    lyrics = [word for song in df_band.lyrics.values for word in song.split()]
    band_records.append(dict(band=band, lyrics=lyrics, genres=df_band.genres.iloc[0]))
# Build the frame once from the collected records instead of assigning rows
# one by one into a pre-allocated frame.
df_bands = pd.DataFrame(band_records, columns=['band', 'lyrics', 'genres'])

# .copy() detaches the filtered rows so the column assignments below write to
# an independent frame rather than to a slice of the unfiltered one.
df_bands = df_bands[df_bands.lyrics.apply(len) > MIN_WORDS].copy()
# Number of distinct words per band.
df_bands['unique'] = df_bands.lyrics.apply(lambda x: len(set(x)))
# Distinct words divided by total words per band.
df_bands['unique_pct'] = df_bands.lyrics.apply(lambda x: len(set(x)) / len(x))

In [93]:
# Bands ranked by distinct-word count, largest first (ascending sort, then reversed).
df_bands.sort_values(by='unique').iloc[::-1]

Unnamed: 0,band,lyrics,genres,unique,unique_pct
428,misanthrope,"[changing, abstract, body, try, make, communio...","[death, melodic, progressive]",6745,0.437362
679,vintersorg,"[trolskt, vargaland, insvept, höstlig, mantel,...","[folk, progressive, viking]",6655,0.569875
586,skyclad,"[young, hamlyn, know, tune, well, beckons, mus...",[folk],5628,0.283941
676,viikate,"[jälleen, tullut, aika, todeta, tola, koivuhal...","[heavy, melodic, rautalanka]",5615,0.412352
372,korpiklaani,"[underground, never, seen, sun, really, know, ...",[folk],5554,0.464342
...,...,...,...,...,...
545,repulsion,"[desecration, earth, massive, death, breathes,...","[death, grindcore]",534,0.526108
207,dolorian,"[pale, shadows, front, colours, cannot, percei...","[ambient, black, doom, ritual]",463,0.402609
547,revenge,"[gonna, make, feel, right, time, gonna, make, ...","[black, death]",457,0.327599
668,vectom,"[take, golden, future, fighting, justice, let,...",[speed],364,0.345024


In [94]:
# Bands ranked by distinct/total word ratio, largest first (ascending sort, then reversed).
df_bands.sort_values(by='unique_pct').iloc[::-1]

Unnamed: 0,band,lyrics,genres,unique,unique_pct
397,lugubrum,"[uitgestrekte, armen, bevroren, pose, krampach...","[black, experimental]",1476,0.852686
562,sarke,"[moon, leading, path, hand, knife, bone, rope,...","[black, thrash]",758,0.749012
362,katalepsy,"[wounds, compound, fractures, internal, organs...","[brutal, death, groove, technical]",751,0.733398
298,gris,"[rain, rode, ethereal, forests, eternal, cold,...",[black],846,0.724315
366,kauan,"[вместо, слёз, глотаю, собственную, кровь, вме...","[doom, folk, post-rock]",1224,0.717889
...,...,...,...,...,...
566,savatage,"[heard, voices, sing, storm, island, closing, ...","[heavy, power, progressive, rock]",2776,0.137815
567,saxon,"[far, valleys, hidden, deep, beneath, snow, gu...","[heavy, nwobhm]",2988,0.136370
212,dragonforce,"[cold, dark, winter, night, hidden, stormy, li...",[power],1567,0.133612
206,dokken,"[wake, morning, telephone, try, talk, say, hom...","[hard, heavy, rock]",1692,0.128983


In [35]:
# Pool band lyrics by genre tag: one row per sufficiently common genre, holding
# the concatenated word lists of every band whose genre list contains that tag.
MIN_GENRE_COUNT = 20  # a tag must occur MORE than this many times across bands

# Tally every tag occurrence in a single pass instead of re-concatenating all
# genre lists once per candidate genre. dtype=object keeps an empty tally valid.
genre_counts = pd.Series(
    [genre for band_genres in df_bands.genres for genre in band_genres],
    dtype=object,
).value_counts()
genres = sorted(genre_counts[genre_counts > MIN_GENRE_COUNT].index)

genre_records = []
for genre in genres:
    # A band tagged with several genres contributes its words to each of them.
    lyrics = [
        word
        for band_genres, band_lyrics in zip(df_bands.genres, df_bands.lyrics)
        if genre in band_genres
        for word in band_lyrics
    ]
    genre_records.append(dict(genre=genre, lyrics=lyrics))
# Build the frame once from the collected records; row labels are 0..n-1 in
# sorted genre order.
df_genres = pd.DataFrame(genre_records, columns=['genre', 'lyrics'])

In [36]:
# Per-genre ratio of distinct words to total words. Note this is a fraction,
# not a count, despite the column name.
df_genres['unique'] = df_genres.lyrics.map(lambda words: len(set(words)) / len(words))

In [37]:
# Genres ranked by distinct/total word ratio, largest first (ascending sort, then reversed).
df_genres.sort_values(by='unique').iloc[::-1]

Unnamed: 0,genre,lyrics,unique
4,folk,"[realized, life, meaning, purpose, quarry, ans...",0.192677
1,brutal,"[sordid, words, fallen, deaf, ears, mind, igno...",0.168402
14,technical,"[sordid, words, fallen, deaf, ears, mind, igno...",0.117733
0,black,"[human, animal, sadism, sexual, excitement, st...",0.116715
5,gothic,"[sfintit, roua, suferintii, lumea, preschimbã,...",0.113319
3,doom,"[ääni, tuulen, varjot, puiden, yksinäni, uneks...",0.112683
13,symphonic,"[foresee, ages, chaos, ordeal, yet, started, o...",0.110597
10,progressive,"[foresee, ages, chaos, ordeal, yet, started, o...",0.091516
8,melodic,"[skell, society, perceived, filtered, eyes, ju...",0.090355
6,groove,"[alice, frightening, alice, scared, alice, won...",0.077588
