In [131]:
import os
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import words as nltk_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import wordcloud

In [12]:
# Build one combined stop-word set covering every language in the NLTK
# stopwords corpus. The corpus reader's fileids() lists the per-language word
# lists of whatever nltk_data installation is active, so no machine-specific
# directory has to be hardcoded (the old path only existed on one machine).
languages = [fileid for fileid in stopwords.fileids() if fileid != 'README']
stop_words = set()
for language in languages:
    stop_words.update(stopwords.words(language))

In [172]:
def get_common_surface_form(original_corpus, stemmer):
    """Map each stem to the most frequent unstemmed token that produces it.

    Parameters
    ----------
    original_corpus : iterable of iterable of str
        Tokenized (unstemmed) documents.
    stemmer : object
        Any object exposing ``stem(token) -> str``.

    Returns
    -------
    dict
        ``{stem: token}`` where ``token`` is the surface form with the highest
        corpus-wide count among all tokens reducing to ``stem``. When several
        forms tie, the one encountered first in the corpus wins. An empty
        corpus yields an empty dict.
    """
    # Tally distinct tokens first so the stemmer runs once per unique token
    # instead of once per occurrence. Counter keeps first-seen order, which
    # preserves the original tie-breaking.
    token_counts = Counter(
        token for document in original_corpus for token in document
    )

    # stem -> {surface form: number of occurrences}
    counts = defaultdict(dict)
    for token, count in token_counts.items():
        counts[stemmer.stem(token)][token] = count

    return {
        stemmed: max(originals, key=originals.get)
        for stemmed, originals in counts.items()
    }

In [228]:
# Load the combined lyrics/genre table and keep only rows with non-empty lyrics.
df = pd.read_hdf('combined-data/lyrics-genres.h5', key='df', mode='r')
df = df.loc[df['lyrics'].map(len) > 0]

# drop non-English songs
# Vocabulary for the English filter: the NLTK `words` corpus, as a set for
# fast membership tests.
english_words = {*nltk_words.words()}
def pct_english(x, vocabulary=None):
    """Return the fraction of distinct tokens in ``x`` found in a vocabulary.

    Parameters
    ----------
    x : iterable of str
        Tokens of one song; duplicates are collapsed before counting.
    vocabulary : set of str, optional
        Words counted as English. Defaults to the notebook-level
        ``english_words`` set.

    Returns
    -------
    float
        A ratio in [0, 1] (not a percentage, despite the name). Returns 0.0
        when ``x`` has no tokens, e.g. whitespace-only lyrics, instead of
        raising ZeroDivisionError.
    """
    if vocabulary is None:
        vocabulary = english_words
    unique_tokens = set(x)
    if not unique_tokens:
        return 0.0
    return len(unique_tokens.intersection(vocabulary)) / len(unique_tokens)
# Minimum fraction of a song's distinct tokens that must be English words.
MIN_ENGLISH_FRACTION = 0.5

# Despite its name, this holds a ratio per song (the value returned by
# pct_english for the whitespace-split lyrics), not a word count.
num_english_words = df.lyrics.str.split().apply(pct_english)
# .copy() makes the filtered frame independent of the original, so adding the
# 'other' column below does not trigger SettingWithCopyWarning.
df = df[num_english_words > MIN_ENGLISH_FRACTION].copy()

# split into genre documents
# Every column after the first is summed as a genre indicator (presumably 0/1
# flags with the lyrics in column 0 -- confirm against the HDF schema). Songs
# with no genre flagged are collected under a synthetic 'other' genre.
df['other'] = (df[df.columns[1:]].sum(axis=1) == 0).astype(int)
genres = df.columns[1:]
# One large document per genre: all lyrics of the songs flagged with it.
genre_texts = [' '.join(df[df[genre] == 1].lyrics.values) for genre in genres]
# Alternative: only use songs that belong to exactly one genre.
# genre_texts = [' '.join(df[(df[genre] == 1) & (df[genres].sum(axis=1) == 1)].lyrics.values) for genre in genres]

In [None]:
# A stem must occur more than this many times within a genre to be scored.
MIN_TERM_COUNT = 100

# Tokenize each genre document, keeping the stemmed tokens (for scoring) and
# the original tokens (for readable labels) side by side.
stemmer = PorterStemmer()
stemmed_corpus = []
original_corpus = []
for text in genre_texts:
    tokens = word_tokenize(text)
    stemmed_corpus.append([stemmer.stem(token) for token in tokens])
    original_corpus.append(tokens)

# Bag-of-words per genre, dropping rare stems before fitting tf-idf.
dictionary = Dictionary(stemmed_corpus)
vectors = [dictionary.doc2bow(text) for text in stemmed_corpus]
vectors = [
    [(term_id, count) for term_id, count in vector if count > MIN_TERM_COUNT]
    for vector in vectors
]
# Smoothed idf; gensim calls wglobal(docfreq, totaldocs) positionally.
tfidf = TfidfModel(
    vectors,
    wglobal=lambda docfreq, totaldocs: np.log2((1 + totaldocs) / (1 + docfreq)),
)
surface_forms = get_common_surface_form(original_corpus, stemmer)

# WordCloud only applies its `stopwords` argument when it tokenizes raw text;
# generate_from_frequencies ignores it. Filter the stop words here instead,
# case-insensitively, as WordCloud does for raw text.
stop_words_lower = {word.lower() for word in stop_words}

for i, genre in enumerate(genres):
    print(genre)
    # (term id, tf-idf score) pairs, highest score first.
    top_words = sorted(tfidf[vectors[i]], key=lambda pair: pair[1], reverse=True)
    frequencies = [
        (surface_forms[dictionary[word]], score) for word, score in top_words
    ]
    frequencies = [
        (form, score)
        for form, score in frequencies
        if form.lower() not in stop_words_lower
    ]
    if not frequencies:
        # generate_from_frequencies raises on an empty mapping; skip genres
        # whose terms were all filtered out.
        print('  (no terms left to plot)')
        continue
    wc = wordcloud.WordCloud().generate_from_frequencies(dict(frequencies))
    plt.figure(figsize=(8, 5))
    plt.imshow(wc)
    plt.axis('off')
    plt.show()