In [2]:
import os
import pickle as pkl
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from metallum import Band, Album, Song
from nltk.corpus import stopwords
# Pool the NLTK stop-word lists of every language listed below into one set,
# so a single membership test filters stop words regardless of lyric language.
languages = ['english', 'german', 'french', 'finnish', 'swedish', 'norwegian', 'danish', 'russian']
stop_words = set()
for language in languages:
    # set.update accepts any iterable, so no intermediate set is needed.
    stop_words.update(stopwords.words(language))

# Lower-cased NLTK English word list, used as a vocabulary whitelist.
# Requires the top-level `import nltk` (the name was previously unresolved
# on a fresh kernel because only `nltk.corpus.stopwords` was imported).
english_vocab = {w.lower() for w in nltk.corpus.words.words()}

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [10]:
# Directory scanned for per-band lyrics pickles (`*.pkl`); the file stem is used as the band key.
LYRICS_DIR = 'lyrics/'
# Directory scanned for pickled band objects (`*.pkl`) that expose `.name` and `.genres`.
BANDS_DIR = 'bands/'

In [59]:
# Build a small sample corpus: one space-joined, lower-cased document per song,
# with parallel lists recording the band and song each document came from.

# Token pattern. The hyphenated / apostrophised form must come FIRST: regex
# alternation is ordered, and with a bare `\w+` in front the longer
# alternative could never match (so "don't" was truncated to "don").
word_re = re.compile(r"\w+[-'](?:\w+)?'?\w+|\w+")

# Sort before slicing so the 10-file sample does not depend on the arbitrary
# order returned by os.listdir; match the extension as a suffix, not a substring.
filenames = sorted(f for f in os.listdir(LYRICS_DIR) if f.endswith('.pkl'))[:10]

# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
lyrics = {}
for filename in filenames:
    with open(os.path.join(LYRICS_DIR, filename), 'rb') as fh:
        lyrics[filename.replace('.pkl', '')] = pkl.load(fh)

corpus = []
band_names = []
song_names = []
for band, band_lyrics in lyrics.items():
    for album, album_lyrics in band_lyrics.items():
        for song, song_lyrics in album_lyrics.items():
            words = []
            for line in song_lyrics:
                for w in line.split():
                    s = word_re.search(w.lower())
                    if s:
                        # No vocabulary / stop-word filtering here: this cell
                        # keeps every token so the raw lyrics can be inspected.
                        words.append(s.group())
            document = ' '.join(words)
            # Skip songs that yielded no tokens at all.
            if document:
                corpus.append(document)
                band_names.append(band)
                song_names.append(song)

In [60]:
# Length, in whitespace-separated tokens, of the longest document in the corpus.
max(len(document.split()) for document in corpus)

482

In [66]:
# Words of the first document that are purely alphabetic but absent from the
# NLTK English word list.
english_vocab = {word.lower() for word in nltk.corpus.words.words()}
text_vocab = {token.lower() for token in corpus[0].split() if token.lower().isalpha()}
unusual = text_vocab - english_vocab

In [61]:
# Print every document longer than 200 tokens, preceded by its band and song.
for band_name, song_name, text in zip(band_names, song_names, corpus):
    if len(text.split()) <= 200:
        continue
    print(band_name, song_name)
    print(text)
    print()

aarni 4. Liber Umbrarum Vel Coniunctio
timeo trepido in basilicis atris erro nescio nememini quo modo huc adveni librum iuvenis quiete domi legebam sed qui estis spiritus advenientes lemures aut spiritus non sumus nonne nos cognoscis liber sanctus sum draconigenus sum et feminea aeterna sum certe vos cognosco in somnis mihi torquetis mehercule abite mali spiritus redite ad infernos magnopere me temptatis cur mi apparuistis quid vobis mecum rei est tibi apparuimus ut completus fias consanguinei sumus tu et nos intellectu careo venitisne ex animo meo ita vero de te sumus partes assume nos et invenies nunc intellego coniuncti sumus timorous i tremble lost in murky hallways i know not nor recollect how i came to be here i was perusing the book of the youth in the quiet of my home but who are you approaching spirits lemures or spirits we are not dost thou not recognise us i am the sacred child i am dragon i am the eternally feminine verily i recognise you the tormentors from my dreams by he

In [20]:
# Token pattern shared by every call. The hyphenated / apostrophised form is
# listed first because regex alternation is ordered: with a bare `\w+` in
# front, the longer alternative was unreachable and "don't" became "don".
_WORD_RE = re.compile(r"\w+[-'](?:\w+)?'?\w+|\w+")


def getwords(filename):
    """Return the filtered word list of one band's pickled lyrics.

    The pickle is read as a mapping ``album -> {song -> iterable of lines}``.
    Each whitespace-separated token is lower-cased, reduced to its first
    word-like match, and kept only if it is in ``english_vocab`` and not in
    ``stop_words`` (both module-level sets).

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    list of str
        Surviving words in lyric order, duplicates preserved.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only pass trusted files.
    """
    with open(filename, 'rb') as f:
        lyrics = pkl.load(f)
    words = []
    for album_lyrics in lyrics.values():
        for song_lyrics in album_lyrics.values():
            for line in song_lyrics:
                for token in line.split():
                    match = _WORD_RE.search(token.lower())
                    if match is None:
                        # Token had no word characters at all (e.g. "--").
                        continue
                    word = match.group()
                    if word in english_vocab and word not in stop_words:
                        words.append(word)
    return words


def getworddict(filenames, lyrics_dir=LYRICS_DIR):
    """Map each band key to the word list extracted from its lyrics pickle.

    The key is the file name with every ``'.pkl'`` occurrence removed; the
    value is ``getwords`` applied to the file inside ``lyrics_dir``.
    """
    return {
        name.replace('.pkl', ''): getwords(os.path.join(lyrics_dir, name))
        for name in filenames
    }


def vectorize(corpus, vocabulary, stop_words=stop_words):
    """Fit a ``CountVectorizer`` with a fixed vocabulary on ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents passed to ``CountVectorizer.fit``.
    vocabulary : iterable or mapping
        Passed straight through as the vectorizer's ``vocabulary``.
    stop_words : 'english', collection of str, or None
        Stop words to drop; defaults to the module-level ``stop_words`` set.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer.
    """
    # scikit-learn documents `stop_words` as 'english', a list, or None, and
    # newer releases reject other containers such as a set. Normalise to a
    # sorted list, which every release accepts and which is deterministic.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = sorted(stop_words)
    vectorizer = CountVectorizer(stop_words=stop_words, vocabulary=vocabulary)
    vectorizer.fit(corpus)
    return vectorizer


def lyrics_to_df(vectorizer, filenames, lyrics_dir=LYRICS_DIR):
    """Build a band-by-word count table using an already fitted vectorizer.

    NOTE(review): a second function named ``lyrics_to_df`` with a different
    signature is defined later in this same cell and shadows this one once
    the cell has run. Rename one of them if both variants are needed.

    Parameters
    ----------
    vectorizer : CountVectorizer
        Fitted vectorizer; its ``vocabulary`` attribute provides the columns.
    filenames : list of str
        Lyrics pickle file names; ``'.pkl'`` is stripped to form the index.
    lyrics_dir : str
        Directory containing the pickle files.

    Returns
    -------
    pandas.DataFrame
        One row per file, one column per vocabulary entry.
    """
    bands = [f.replace('.pkl', '') for f in filenames]
    # Honour the `lyrics_dir` argument; the global LYRICS_DIR was used here
    # before, silently ignoring whatever directory the caller passed in.
    corpus = [' '.join(getwords(os.path.join(lyrics_dir, f))) for f in filenames]
    X = vectorizer.transform(corpus).toarray()
    df = pd.DataFrame(X, index=bands, columns=vectorizer.vocabulary)
    return df


def lyrics_to_df(words_dict, band_names):
    """Build a band-by-word term-frequency table.

    Parameters
    ----------
    words_dict : dict
        Maps a band key to its list of words.
    band_names : list
        Band keys to include, in row order; each must be a key of
        ``words_dict`` (a missing key raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``band_names``; columns are the sorted union of all words
        in ``words_dict``; cells hold integer occurrence counts.
    """
    vocabulary = sorted({w for words in words_dict.values() for w in words})
    # Collect all rows first and construct the frame once: assigning rows one
    # at a time into an empty frame is slow and leaves the columns as object
    # dtype instead of integers.
    rows = []
    for band in band_names:
        counts = Counter(words_dict[band])
        # Counter returns 0 for words this band never uses.
        rows.append([counts[v] for v in vocabulary])
    return pd.DataFrame(rows, index=band_names, columns=vocabulary)


def bands_to_df(bands, min_count=20):
    """One-hot encode the frequent genres of each band.

    Parameters
    ----------
    bands : iterable
        Objects exposing ``.name`` (str) and ``.genres`` (iterable of str).
    min_count : int, default 20
        A genre is kept only if it occurs MORE than ``min_count`` times
        across all bands.

    Returns
    -------
    pandas.DataFrame
        One row per band, indexed by lower-cased name, with one 0/1 integer
        column ``genre_<name>`` per retained genre (sorted by genre name).
    """
    names = [b.name.lower() for b in bands]
    # Read each band's genres exactly once and keep them per band.
    band_genres = [list(b.genres) for b in bands]

    # Single counting pass instead of list.count() per genre (quadratic).
    genre_counts = Counter(g for b_genres in band_genres for g in b_genres)
    genres = sorted(g for g, n in genre_counts.items() if n > min_count)

    # Build rows positionally. Assigning through `df.loc[name]` would write
    # to every row sharing that name, so bands with identical names used to
    # end up with the genres of whichever one came last.
    rows = []
    for b_genres in band_genres:
        present = set(b_genres)
        rows.append([int(g in present) for g in genres])
    return pd.DataFrame(rows, index=names, columns=['genre_' + g for g in genres])

In [21]:
# Vectorise the full lyrics collection: one document per band, fixed
# vocabulary taken from the filtered words, and persist the fitted vectorizer.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt; the
# dense `.toarray()` over the whole vocabulary is presumably the costly step —
# confirm before re-running on the full data.

# Match the extension as a suffix rather than anywhere in the name.
filenames = [f for f in os.listdir(LYRICS_DIR) if f.endswith('.pkl')]
words_dict = getworddict(filenames)
bands_with_lyrics = [f.replace('.pkl', '') for f in filenames]
documents = list(words_dict.values())
corpus = [' '.join(d) for d in documents]
vocabulary = sorted(set([w for words in documents for w in words]))

# scikit-learn documents `stop_words` as 'english', a list, or None; pass a
# sorted list instead of the set so every release accepts it.
vectorizer = CountVectorizer(stop_words=sorted(stop_words), vocabulary=vocabulary)
vectorizer.fit(corpus)

# Close the file deterministically so the pickle is fully flushed to disk.
with open('vectorizer.pkl', 'wb') as fh:
    pkl.dump(vectorizer, fh)

df = pd.DataFrame(vectorizer.transform(corpus).toarray(), index=bands_with_lyrics, columns=vectorizer.vocabulary)

KeyboardInterrupt: 

In [4]:
# filenames = [f for f in os.listdir(LYRICS_DIR) if '.pkl' in f]
# bands_with_lyrics = [f.replace('.pkl', '') for f in filenames]
# words_dict = getworddict(filenames)
# df = lyrics_to_df(words_dict, bands_with_lyrics)
# vocabulary = list(df.columns)

### Test

In [13]:
# Smoke test: reload the persisted vectorizer and transform one made-up lyric.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('vectorizer.pkl', 'rb') as fh:
    # Context manager closes the handle instead of leaking it.
    vectorizer = pkl.load(fh)
example = [
    'blah blah blah these are fake metal lyrics... death death death burn'
]
df_example = pd.DataFrame(vectorizer.transform(example).toarray(), columns=vectorizer.vocabulary)
df_example

Unnamed: 0,aa,aal,aam,aardvark,aaron,aaru,ab,aba,abacinate,abacus,...,zoophile,zoophilism,zoopsia,zounds,zulu,zuni,zwitter,zygoma,zygote,zymotic
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Load every pickled band object, keep those that also have lyrics, and
# one-hot encode their genres.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
band_filenames = [f for f in os.listdir(BANDS_DIR) if f.endswith('.pkl')]
bands = []
for band_filename in band_filenames:
    # Open each file in a context manager so handles are not leaked.
    with open(os.path.join(BANDS_DIR, band_filename), 'rb') as fh:
        bands.append(pkl.load(fh))

# Set membership is O(1) per band; the list lookup was O(n) per band.
_lyric_band_names = set(bands_with_lyrics)
bands = [b for b in bands if b.name.lower() in _lyric_band_names]
df_genres = bands_to_df(bands)

In [16]:
# Put the word counts of the bands that have genre data next to their genre
# indicator columns, then preview the first ten rows.
df_combined = pd.concat(
    [df.loc[df_genres.index], df_genres],
    axis=1,
)
df_combined.head(10)

Unnamed: 0,aa,aal,aam,aardvark,aaron,aaru,ab,aba,abacinate,abacus,...,genre_groove,genre_heavy,genre_melodic,genre_power,genre_progressive,genre_rock,genre_speed,genre_symphonic,genre_technical,genre_thrash
aarni,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abigail,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abigor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abnormality,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
aborted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aborym,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abscess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
absurd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
absu,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abyssal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Persist the combined word-count / genre table to data.csv in the working directory.
df_combined.to_csv('data.csv')