In [1]:
import pandas as pd
import numpy as np
import sklearn
import nltk
import re
import os
import codecs
import mpld3
import matplotlib.pyplot as plt
import seaborn
import pylab
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load data and pick a sample of 10%
data = pd.read_pickle("darklyrics/artist_data_stemmed.pkl")
data = data.sample(frac=0.1)

  help="""Generate default config file."""
  help="Specify a config file to load."
  help="""Full path of a config file.""",
  help="""Answer yes to any prompts."""
  user = Bool(False, config=True, help="Whether to do a user install")
  sys_prefix = Bool(False, config=True, help="Use the sys.prefix as the prefix")
  python = Bool(False, config=True, help="Install from a Python package")
  verbose = Any(None, config=True, help="DEPRECATED: Verbosity level")
  overwrite = Bool(False, config=True, help="Force overwrite of existing files")
  symlink = Bool(False, config=True, help="Create symlinks instead of copying files")
  prefix = Unicode('', config=True, help="Installation prefix")
  help="Full path to nbextensions dir (probably use prefix or user)")
  destination = Unicode('', config=True, help="Destination for the copy or symlink")
  prefix = Unicode('', config=True, help="Installation prefix")
  nbextensions_dir = Unicode('', config=True, help="Full path to nbextensions dir (proba

## TF-IDF Vectorization

In [2]:
begin = time()
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', min_df=2)
x = vectorizer.fit_transform(data.values)
end = time()
print 'TFIDF vectorization performed in %f seconds' % (end - begin)
print 'Num samples: %d, num features: %d' % x.shape
features = vectorizer.get_feature_names()

TFIDF vectorization performed in 1.823620 seconds
Num samples: 743, num features: 14731


## Clustering

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
distance = 1 - cosine_similarity(x)

In [4]:
km = KMeans(n_clusters=10)
begin = time()
km.fit(x)
end = time()
print 'K means computed in %f' % (end - begin)

K means computed in 3.868077


In [5]:
from sklearn.externals import joblib

#joblib.dump(km, 'darklyrics/doc_cluster.pkl')
#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [6]:
data2 = data.to_frame()
data2['cluster'] = clusters
data2 = data2.reset_index()
data2['cluster'].value_counts()

8    194
7    144
6    140
5     95
0     79
4     40
3     24
1     15
2      9
9      3
Name: cluster, dtype: int64

In [8]:
print "Top terms per cluster:"
print ""

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(10):
    print "Cluster %d words: " % i
    print ""
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
        print ' %s' % features[ind].encode('utf-8', 'ignore'),
    print ""
    print ""
    
    print "Cluster %d artists:" % i
    for artist_index, row in data2[data2['cluster'] == i].iterrows():
        print ('%s,' % row['artist']),
    print ""
    print ""

Top terms per cluster:

Cluster 0 words: 

 dont  like  know  come  time  feel  oh  metal  night  love 

Cluster 0 artists:
palisades, the taste of blood, side winder, heidevolk, acrimony, whitecross, tersivel, down to hell, wild dogs, tuomas holopainen, earth, fight back, le grand guignol, pyrithion, dolorian, wrecking tanganyika, umbra nihil, five psychosis, rhymes of destruction, dynabyte, abominable putridity, phoenix rising, pitbulls in the nursery, metal king, garden of eden, versus affect, vectom, chordewa, steelheart, crashed souls, agents of oblivion, bodies in the gears of the apparatus, letters from the colony, jack slater, riot games, butterfly temple, matanzick, sofa king killer, intoxxxicated, primate, the luminary, author & punisher, barroquejón, orgone, monarch, upsidedown cross, :fjoergyn:, mystic forest, cryfemal, terra mourn, wardruna, stahlhammer, saltus, avsky, divine intervention, fallenlight, muro, crawler, a tree of signs, raising the veil, mare, ajattara, kuole

In [9]:
# http://www.metalstorm.net/users/list.php?list_id=2413
top_bands = [
    "black sabbath", "judas priest", "tool", "korn", "slipknot", "iron maiden", "metallica", "venom", "motorhead",
    "slayer", "celtic frost", "bathory", "helloween", "possessed", "death", "pantera", "godflesh", "morbid angel",
    "paradise lost", "at the gates", "mercyful fate", "accept", "melvins", "budgie", "queensryche", "kreator",
    "napalm death", "gorguts", "sepultura", "dream theater", "mayhem", "yngwie malmsteen", "manowar", "candlemass",
    "watchtower", "burzum", "emperor", "atheist", "repulsion", "pentagram", "trouble", "saint vitus", "entombed",
    "obituary", "neurosis", "ministry", "suffocation", "diamond head", "anthrax", "darkthrone", "megadeth",
    "autopsy", "savatage", "fates warning", "suicidal tendencies", "carcass", "deicide", "exodus", "amorphis",
    "fear factory", "immortal", "sodom", "my dying bride", "the gathering", "theatre of tragedy", "dark angel",
    "the 3ds and the mortal", "saxon", "thergoton", "earth", "opeth", "isis", "sleep", "blind guardian", "therion",
    "in flames", "d.r.i", "meshuggah", "rhapsody of fire", "stratovarius", "manilla road", "destruction", "sarcofago",
    "tiamat", "nightwish", "children of bodom", "enslaved", "dissection", "dark tranquility", "masters hammer", 
    "master", "cannibal corpse", "nocturnus", "crimson glory", "anathema", "bethlehem", "cathedral", "ulver",
    "katatonia", "terrorizer", "w.a.s.p", "raven", "anvil", "dio", "gamma ray", "ozzy osbourne", "skyclad", "cynic",
    "type o negative", "skepticism", "disembowelment", "danzig", "graveland", "rotting christ", "sentenced",
    "white zombie", "immolation", "bolt thrower", "exhumed", "angel witch", "stormtroopers of death", "winter",
    "vulcano", "corrosion of conformity", "king diamond", "machine head", "eyehategod", "crowbar", "carnivore",
    "exhorder", "throns", "unholy", "prong", "testament", "overkill", "slaughter", "massacre", "voivod",
    "hammerfall", "dimmu borgir", "cradle of filth", "finntroll", "satyricon", "in the woods...", "mastodon",
    "witchfinder general", "running wild", "metal church", "marduk", "samael", "strapping young lad", 
    "symphony x", "nile", "summoning"
]