In [1]:
import os
import re
import h5py
import numpy as np
import pandas as pd
import pickle as pkl
import multiprocessing
from scraper.metallum import Band, Album, Song

import nltk
from nltk.corpus import stopwords
stopdir = 'C:/Users/philippe/AppData/Roaming/nltk_data/corpora/stopwords'
languages = [x for x in os.listdir(stopdir) if x != 'README']
stop_words = set()
for language in languages:
    stop_words.update(set(stopwords.words(language)))
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

# All bands in common

In [12]:
LYRICS_DIR = 'darklyrics-data/'
BANDS_DIR = 'bands/'

band_filenames = [f for f in os.listdir(BANDS_DIR) if '.pkl' in f]
bands = [pkl.load(open(os.path.join(BANDS_DIR, f), 'rb')) for f in band_filenames]
genres = {b.name: b.genres for b in bands}

lyrics_filenames = [f for f in os.listdir(LYRICS_DIR) if '.pkl' in f]
lyrics = {filename.replace('.pkl', ''): pkl.load(open(os.path.join(LYRICS_DIR, filename), 'rb')) for filename in lyrics_filenames}

band_names_metallum = [b.name for b in bands]
band_names_darklyrics = [b for b, _ in lyrics.items()]

band_names = [x for x in band_names_metallum if (x.lower() in band_names_darklyrics)]

rows = []
vocabulary = set()
for band_name in band_names:
    band_lyrics = lyrics[band_name.lower()]
    band_genre = genres[band_name]
    for album, album_lyrics in band_lyrics.items():
        for song, song_lyrics in album_lyrics.items():
            row = [band_name, album, song, song_lyrics, band_genre]
            rows.append(row)
df = pd.DataFrame(rows, columns=['band', 'album', 'song', 'lyrics', 'genres'])
df = df.drop_duplicates(subset=['band', 'song'])
print(df.shape)
df.head()

ModuleNotFoundError: No module named 'metallum2'

In [114]:
df.to_hdf('combined-data/song-lyrics-genres-raw.h5', key='df', mode='w')

# Bands with MA reviews

In [169]:
LYRICS_DIR = 'darklyrics-data/'
BANDS_DIR = 'bands/'

band_filenames = [f for f in os.listdir(BANDS_DIR) if '.pkl' in f]
bands = [pkl.load(open(os.path.join(BANDS_DIR, f), 'rb')) for f in band_filenames]
genres = {b.name: b.genres for b in bands}

lyrics_filenames = [f for f in os.listdir(LYRICS_DIR) if '.pkl' in f]
lyrics = {filename.replace('.pkl', ''): pkl.load(open(os.path.join(LYRICS_DIR, filename), 'rb')) for filename in lyrics_filenames}

band_names_metallum = [b.name for b in bands]
band_names_darklyrics = [b for b, _ in lyrics.items()]

band_names = [x for x in band_names_metallum if x.lower() in band_names_darklyrics]

reviews = {b.name: b.albums.numrev.sum() for b in bands if (b.albums is not None) and (b.name in band_names)}

rows = []
vocabulary = set()
for band_name in band_names:
    band_lyrics = lyrics[band_name.lower()]
    band_genre = genres[band_name]
    band_reviews = reviews.get(band_name)
    if band_reviews is not None:
        for album, album_lyrics in band_lyrics.items():
            for song, song_lyrics in album_lyrics.items():
                row = [band_name, album, song, song_lyrics, band_genre, band_reviews]
                rows.append(row)
df = pd.DataFrame(rows, columns=['band', 'album', 'song', 'lyrics', 'genres', 'reviews'])
df = df.drop_duplicates(subset=['band', 'song'])
print(df.shape)
df.head()

(49864, 6)


Unnamed: 0,band,album,song,lyrics,genres,reviews
0,Aarni,Aarni,1. Myrrys,"[Ääni tuulen, varjot puiden, Yksinäni uneksin,...","[avant-garde, doom]",8
1,Aarni,Aarni,2. The Black Keyes (Of R'lyeh),[Five hundred years has gone since I summoned ...,"[avant-garde, doom]",8
2,Aarni,Aarni,3. Metsänpeitto Sample 1 - 2,"[Metsäpolulta Metsän polulle., Metsänpeitossa....","[avant-garde, doom]",8
3,Aarni,Aarni,4. Persona Mortuae Cutis,"['Yog-Sothoth est clavis portaque', Cum primis...","[avant-garde, doom]",8
4,Aarni,AARNI / UMBRA NIHIL,1. Ubbo-Sathla,[...For Ubbo-Sathla is the source and the end....,"[avant-garde, doom]",8


In [170]:
df.to_hdf('combined-data/song-lyrics-genres-reviews-raw.h5', key='df', mode='w')

# String parsing

In [20]:
def process_song_name(name):
    words = []
    for w in name.split():
        s = re.search("(\w+)|(\w+([-'])(\w+)?[']?(\w+))", re.sub('\d+\.', '', w).lower())
        if s:
            words.append(s.group())
    return ' '.join(words)

def process_song_lyrics(lyrics):
    words = []
    for line in lyrics:
        for w in line.split():
            s = re.search("(\w+)|(\w+([-'])(\w+)?[']?(\w+))", w.lower())
            if s:
                w = s.group()
                if w not in stop_words and len(w) > 2:
                    words.append(w)
    return ' '.join(words)

### All bands

In [35]:
df = pd.read_hdf('combined-data/song-lyrics-genres-raw.h5', key='df')
df['lyrics'] = df['lyrics'].apply(lambda x: '\n'.join(x))

# List of all genre labels
unique_genres = []
for lst in df['genres'].tolist():
    for x in lst:
        if (x not in stop_words) and (not re.match('\-.*', x)) and (x not in unique_genres):
            unique_genres.append(x)
unique_genres.sort()

# One-hot encoding of genres
df_genres_full = pd.DataFrame(index=df.index, columns=unique_genres)
for genre in unique_genres:
    df_genres_full[genre] = df['genres'].apply(lambda x: 1 if genre in x else 0)
df_genres_full['band'] = df['band'].copy()
df_genres_bands = df_genres_full.groupby('band').max()
df_genres_full.drop('band', axis=1, inplace=True)
top_genres = df_genres_full.columns[df_genres_full.mean(axis=0) > 0.15]
df_genres = df_genres_full[top_genres].copy()

# Combine into a dataframe with just lyrics and genres
df = pd.concat((df.lyrics, df_genres), 1)
df.to_hdf('combined-data/lyrics-genres.h5', key='df', mode='w')

### Bands with MA reviews

In [178]:
df_raw = pd.read_hdf('combined-data/song-lyrics-genres-reviews-raw.h5', key='df', mode='r')
df = df_raw.copy()
df.band = df.band.str.lower()
df.album = df.album.str.lower()
df.song = df.song.apply(process_song_name)
# df.lyrics = df.lyrics.apply(process_song_lyrics)
df.head()

Unnamed: 0,band,album,song,lyrics,genres,reviews
0,aarni,aarni,myrrys,"[Ääni tuulen, varjot puiden, Yksinäni uneksin,...","[avant-garde, doom]",8
1,aarni,aarni,the black keyes of r,[Five hundred years has gone since I summoned ...,"[avant-garde, doom]",8
2,aarni,aarni,metsänpeitto sample 1 2,"[Metsäpolulta Metsän polulle., Metsänpeitossa....","[avant-garde, doom]",8
3,aarni,aarni,persona mortuae cutis,"['Yog-Sothoth est clavis portaque', Cum primis...","[avant-garde, doom]",8
4,aarni,aarni / umbra nihil,ubbo,[...For Ubbo-Sathla is the source and the end....,"[avant-garde, doom]",8


In [180]:
df.to_hdf('combined-data/song-lyrics-genres-reviews.h5', key='df', mode='w')