In [10]:
import os
import re
import numpy as np
import pandas as pd
import pickle as pkl

import nltk
from nltk.corpus import stopwords
# Languages whose NLTK stop-word lists are pooled into a single filter set.
languages = [
    'english', 'german', 'french', 'finnish',
    'swedish', 'norwegian', 'danish', 'russian',
]
# Union of the stop words of every language listed above.
stop_words = set()
for language in languages:
    stop_words |= set(stopwords.words(language))
# Lower-cased entries of the NLTK `words` corpus; getwords() keeps only tokens found here.
english_vocab = {entry.lower() for entry in nltk.corpus.words.words()}

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Directory scanned for the per-band lyrics pickles ('.pkl' files) that getwords() loads.
LYRICS_DIR = 'lyrics/'
# Directory scanned for the per-band pickles whose objects expose `.name` and `.genres`.
BANDS_DIR = 'bands/'

In [13]:
def getwords(filename):
    """Load one pickled lyrics file and return its filtered English words.

    The pickle is read as a nested mapping ``{album: {song: [line, ...]}}``.
    Each line is split on whitespace; every chunk is lower-cased and reduced
    to its first word-like token. A token may contain an inner hyphen or
    apostrophe (``well-known``, ``don't``, ``rock'n'roll``). A token is kept
    only if it is in ``english_vocab`` and not in ``stop_words``.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    list of str
        The kept tokens in reading order, duplicates included.
    """
    # Alternation is tried left to right, so the compound form must come
    # before the bare ``\w+``; otherwise ``\w+`` always wins and "don't" is
    # truncated to "don". Raw string avoids invalid-escape warnings.
    token_re = re.compile(r"\w+[-'](?:\w+)?'?\w+|\w+")
    # NOTE(security): pickle.load can execute arbitrary code -- only open
    # lyrics files that come from a trusted source.
    with open(filename, 'rb') as f:
        lyrics = pkl.load(f)
    words = []
    for album_lyrics in lyrics.values():
        for song_lyrics in album_lyrics.values():
            for line in song_lyrics:
                for chunk in line.split():
                    match = token_re.search(chunk.lower())
                    if match is None:
                        # Chunk is pure punctuation, nothing to keep.
                        continue
                    token = match.group()
                    if token in english_vocab and token not in stop_words:
                        words.append(token)
    return words


def getworddict(filenames, lyrics_dir=LYRICS_DIR):
    """Map each lyrics file to the filtered words it contains.

    Parameters
    ----------
    filenames : iterable of str
        File names (not full paths) located inside ``lyrics_dir``.
    lyrics_dir : str
        Directory that is joined in front of every file name.

    Returns
    -------
    dict
        ``{file name with every '.pkl' removed: list returned by getwords()}``.
    """
    return {
        name.replace('.pkl', ''): getwords(os.path.join(lyrics_dir, name))
        for name in filenames
    }


def vectorize(corpus, vocabulary, stop_words=stop_words):
    """Create a CountVectorizer with a fixed vocabulary and fit it on ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents handed to ``CountVectorizer.fit``.
    vocabulary : iterable of str or mapping
        Forwarded unchanged as the vectorizer's ``vocabulary`` argument.
    stop_words : str, collection of str or None
        Stop words to ignore. Defaults to the module-level ``stop_words`` set.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer.
    """
    # scikit-learn validates ``stop_words`` as a string, a list or None;
    # recent releases reject a set (the type of the module-level default)
    # when fit() runs. Convert any other collection to a sorted list.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = sorted(stop_words)
    vectorizer = CountVectorizer(stop_words=stop_words, vocabulary=vocabulary)
    vectorizer.fit(corpus)
    return vectorizer


def lyrics_to_df(vectorizer, filenames, lyrics_dir=LYRICS_DIR):
    """Build a band-by-term count table from pickled lyrics files.

    NOTE(review): a second ``lyrics_to_df(words_dict, band_names)`` is defined
    further down in this notebook and replaces this function once that
    definition has been executed.

    Parameters
    ----------
    vectorizer : CountVectorizer
        Vectorizer whose ``transform`` turns the documents into counts.
    filenames : iterable of str
        Lyrics file names (not full paths) inside ``lyrics_dir``.
    lyrics_dir : str
        Directory that contains the lyrics files.

    Returns
    -------
    pandas.DataFrame
        One row per file (labelled with the file name minus '.pkl') and one
        column per vocabulary term.
    """
    filenames = list(filenames)  # iterated twice below
    bands = [f.replace('.pkl', '') for f in filenames]
    # Use the lyrics_dir argument; the global LYRICS_DIR used to be read here,
    # which silently ignored the caller's directory.
    corpus = [' '.join(getwords(os.path.join(lyrics_dir, f))) for f in filenames]
    X = vectorizer.transform(corpus).toarray()
    # Label columns from the term -> column-index mapping so they line up with
    # X even when the constructor's ``vocabulary`` was a mapping or None.
    columns = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    df = pd.DataFrame(X, index=bands, columns=columns)
    return df


def lyrics_to_df(words_dict, band_names):
    """Build a band-by-term frequency table from already extracted words.

    Parameters
    ----------
    words_dict : dict
        ``{band name: list of words}``; the vocabulary is the sorted set of
        all words over all entries.
    band_names : iterable of str
        Bands to include, in row order. Each must be a key of ``words_dict``.

    Returns
    -------
    pandas.DataFrame
        Integer counts with ``band_names`` as index and the vocabulary as
        columns.

    Raises
    ------
    KeyError
        If a name in ``band_names`` is missing from ``words_dict``.
    """
    # FreqDist is a Counter subclass; plain Counter gives the same counts.
    from collections import Counter

    band_names = list(band_names)
    vocabulary = sorted({w for words in words_dict.values() for w in words})
    rows = []
    for band in band_names:
        tf = Counter(words_dict[band])
        rows.append([tf.get(term, 0) for term in vocabulary])
    # Build the whole table at once: assigning rows into an empty frame is
    # slow and leaves the columns as object dtype instead of integers.
    # reshape keeps a proper (n_bands, n_terms) shape when either is zero.
    counts = np.array(rows, dtype=int).reshape(len(band_names), len(vocabulary))
    return pd.DataFrame(counts, index=band_names, columns=vocabulary)


def bands_to_df(bands, min_count=20):
    """Build a band-by-genre indicator table.

    Parameters
    ----------
    bands : iterable
        Objects exposing ``.name`` (str) and ``.genres`` (iterable of str).
    min_count : int, optional
        A genre becomes a column only if it occurs strictly more than
        ``min_count`` times over all bands. The default of 20 matches the
        previously hard-coded threshold.

    Returns
    -------
    pandas.DataFrame
        Index is the lower-cased band names, columns are ``'genre_<name>'``
        for every retained genre in sorted order, values are 0/1 integers.
    """
    from collections import Counter

    bands = list(bands)
    names = [b.name.lower() for b in bands]
    # One counting pass instead of calling list.count() for every element.
    genre_counts = Counter(g for b in bands for g in b.genres)
    genres = sorted(g for g, n in genre_counts.items() if n > min_count)
    rows = []
    for b in bands:
        own = set(b.genres)
        rows.append([int(g in own) for g in genres])
    # Build the table positionally: label-based row assignment would let
    # bands sharing a lower-cased name overwrite each other's flags.
    # reshape keeps a proper (n_bands, n_genres) shape when either is zero.
    flags = np.array(rows, dtype=int).reshape(len(bands), len(genres))
    return pd.DataFrame(flags, index=names, columns=['genre_' + g for g in genres])

In [14]:
# Collect the lyrics pickles. endswith() avoids picking up names that merely
# contain '.pkl'; sorted() because os.listdir() order is arbitrary.
filenames = sorted(f for f in os.listdir(LYRICS_DIR) if f.endswith('.pkl'))
words_dict = getworddict(filenames)
# Names and documents are read from the same dict so they stay aligned.
bands_with_lyrics = list(words_dict)
documents = list(words_dict.values())
corpus = [' '.join(d) for d in documents]
vocabulary = sorted({w for words in documents for w in words})
# scikit-learn accepts stop words as a list, 'english' or None -- not a set.
vectorizer = CountVectorizer(stop_words=sorted(stop_words), vocabulary=vocabulary)
vectorizer.fit(corpus)
# Context manager so the pickle is flushed and the handle closed immediately.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pkl.dump(vectorizer, vectorizer_file)
df = pd.DataFrame(vectorizer.transform(corpus).toarray(), index=bands_with_lyrics, columns=vocabulary)

In [4]:
# filenames = [f for f in os.listdir(LYRICS_DIR) if '.pkl' in f]
# bands_with_lyrics = [f.replace('.pkl', '') for f in filenames]
# words_dict = getworddict(filenames)
# df = lyrics_to_df(words_dict, bands_with_lyrics)
# vocabulary = list(df.columns)

### Test: transform example lyrics with the saved vectorizer

In [70]:
# Reload the vectorizer from 'vectorizer.pkl'; the context manager closes the
# file handle. NOTE(security): only unpickle files from a trusted source.
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pkl.load(vectorizer_file)
example = [
    'blah blah blah these are fake metal lyrics... death death death burn'
]
# One row of term counts for the example text, one column per vocabulary term.
df_example = pd.DataFrame(vectorizer.transform(example).toarray(), columns=vectorizer.vocabulary)
df_example

In [40]:
# endswith() avoids picking up names that merely contain '.pkl'.
band_filenames = [f for f in os.listdir(BANDS_DIR) if f.endswith('.pkl')]
# Load each band pickle inside a context manager so no file handle is leaked.
# NOTE(security): only unpickle files from a trusted source.
bands = []
for band_filename in band_filenames:
    with open(os.path.join(BANDS_DIR, band_filename), 'rb') as band_file:
        bands.append(pkl.load(band_file))
# Keep only bands that have lyrics; a set makes each membership test O(1).
known_bands = set(bands_with_lyrics)
bands = [b for b in bands if b.name.lower() in known_bands]
df_genres = bands_to_df(bands)

In [43]:
# Put the word counts (rows picked and ordered by the genre table's index)
# side by side with the genre indicator columns.
df_combined = pd.concat([df.loc[df_genres.index], df_genres], axis='columns')
df_combined.head(10)

Unnamed: 0,aa,aal,aam,aardvark,aaron,aaru,ab,aba,abacinate,abacus,...,genre_groove,genre_heavy,genre_melodic,genre_power,genre_progressive,genre_rock,genre_speed,genre_symphonic,genre_technical,genre_thrash
riverside,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
root,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
darkestrah,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
deafheaven,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
conan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
astarte,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
nachtmystium,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
halford,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
lugubrum,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
rudra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [57]:
# Write the combined word-count / genre table to 'data4.csv' in the working directory.
df_combined.to_csv('data4.csv')