In [3]:
import pandas as pd
import numpy as np
import sklearn
import re
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

data = pd.read_pickle("darklyrics/clustered_lyrics.pkl")
original_data = pd.read_pickle("darklyrics/data_filtered_original.pkl")
km = joblib.load('darklyrics/doc_cluster_big.pkl')
original_data.head()

Unnamed: 0,album,artist,name,year,lyrics
3,Dissolve Into Nothingness,angara damana,Blind Sheep,2012,[Hopeless time Everything Falls Ideas Died One...
4,Dissolve Into Nothingness,angara damana,A New Age Of Darkness And Blood,2012,"[Somewhere Near An Empty Space, Where The Inau..."
5,Dissolve Into Nothingness,angara damana,Contradiction & Repetition,2012,"[In This Endless Hedge, Where Is The Way Of Sa..."
6,Dissolve Into Nothingness,angara damana,A Truth Hidden In The Womb,2012,[the troops of darkness fly high in the sky to...
7,Dissolve Into Nothingness,angara damana,The Ado Of Wisdom,2012,"[I Am Standing In A Bright Circle, I See An Of..."


In [4]:
# Genre and Topic Tags

topic_labels = {
    0: "Gangster",
    1: "Fatalism",
    2: "Epic",
    3: "Sorrow",
    4: "Violence",
    5: "Satanic",
    6: "Imperial",
    7: "Getting Over",
    8: "Party",
    9: "Mythological",
    10: "Religious",
    11: "Breakup",
    12: "Gore",
    13: "Ancient Depravation",
    14: "Viking",
    15: "Murder",
    16: "Hell",
    17: "Medieval",
    18: "Runes",
    19: "Cosmic Horror",
    20: "Ancient",
    21: "Occultism",
    22: "Cursing",
    23: "Kingdom",
    24: "Pornographic",
    25: "Hate",
    26: "Polish",
    27: "Nature & Sadness",
    28: "Kings & Battles",
    29: "Rotten"
}

genre_labels = {
    0: "Glam Metal",
    1: "Thrash Metal",
    2: "Folk Metal",
    3: "Hard Rock",
    4: "Metalcore",
    5: "Black Metal",
    6: "Epic Doom Metal",
    7: "Metalcore",
    8: "Glam Metal",
    9: "Power Metal",
    10: "Christian Metal",
    11: "Nu Metal",
    12: "Brutal Death Metal",
    13: "Depressive Black Metal",
    14: "Viking Metal",
    15: "Grindcore",
    16: "Doom Metal",
    17: "Black Metal",
    18: "Foreign Metal",
    19: "Death Metal",
    20: "Speed Metal",
    21: "Doom Metal",
    22: "Grindcore",
    23: "Power Metal",
    24: "Goregrind",
    25: "Metalcore",
    26: "Polish Metal",
    27: "Gothic Metal",
    28: "Power Metal",
    29: "Death Metal"
}

In [6]:
data['topic_label'] = data.apply(lambda x: topic_labels[x['cluster']], axis=1)
data['genre_label'] = data.apply(lambda x: genre_labels[x['cluster']], axis=1)
data = data.rename(columns={'cluster_labels': 'frequent_words'})
data.head()
data.to_pickle('darklyrics/clustered_lyrics.pkl')

In [5]:
# Build vocabulary of existing clusters
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', min_df=25, max_df=0.5)
x = vectorizer.fit_transform(data['lyrics_string'])
vocabulary = vectorizer.get_feature_names()

In [6]:
# New data
initial_lyrics = """
behold the vast and formless black below surges the human maelstrom swarming worthless 
breaths asphyxiate in themselves churches drown beneath our skyscrapers 
quelled by a vicious elite
a glorious caste of true industry
no gods, no time, no life
zenith of the universe
and yet, nadir of hell
mesmerize, entrancing metropolis
black psychedelia"""

new_data = {
    "artist": "new",
    "cluster": 0,
    "frequent_words": "",
    "x": 0,
    "y": 0
}

brackets_regex = re.compile('\[.*?\]')
apostrophe_regex = re.compile("[']")
punctuation_regex = re.compile('[^0-9a-zA-Z ]+')
double_space_regex = re.compile('\s+')

lyrics = initial_lyrics.lower()
lyrics = brackets_regex.sub("", lyrics)
lyrics = apostrophe_regex.sub("", lyrics)
lyrics = punctuation_regex.sub("", lyrics)
lyrics = double_space_regex.sub(" ", lyrics)

lyrics = lyrics.split()
lyrics = [word for word in lyrics if word in vocabulary]
lyrics = ' '.join(lyrics)

new_data['lyrics_string'] = lyrics
new_data = pd.DataFrame(new_data, index=[len(data)])
data = data.append(new_data)
data

Unnamed: 0,artist,cluster,cluster_labels,lyrics_string,x,y
0,'68,0,gimme maybe aint hey lying gonna shes class ba...,take heart next room make scene make sure aint...,-0.329639,0.565931
1,(sic)monic,1,shall existence self control reality fucking m...,revelation resurrections human blood leak woun...,-0.273161,-0.203549
2,+\-,27,shall wind stars shadows sorrow silent rain mo...,looked wide open sky window often nights augus...,-0.569364,-0.359816
3,...aaaarrghh...,18,ja ei ni da se kun ir si el na min en je ty er...,hi lmeyi diledin mi ruhunun derinliklerinden g...,-0.810511,-0.245691
4,...and oceans,27,shall wind stars shadows sorrow silent rain mo...,fanfare chants trumpets loud proclaim world ho...,-0.573531,0.149318
5,.calibre,19,shall fucking evil people eternal reality self...,breath hurricane breath hurricane breath hurri...,-0.406454,0.613611
6,.crrust,7,youve change forget save fucking wrong better ...,let try discomfort let others things well see ...,-0.621415,0.141845
7,100 demons,25,fucking fuck shit youve fucked better sick cha...,dont even tell already know fucking beast insi...,-0.041817,-0.541671
8,1000 funerals,17,thy shall thou thee lord thine ancient forth e...,didnt tell secrets seems fool unfaithful one c...,-0.596968,0.420211
9,11 as in adversaries,21,forest trees revolt ancient spirits spheres ar...,tell difference looking scene shooting stars f...,-0.731733,-0.382420


In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', min_df=25, max_df=0.5)
x = vectorizer.fit_transform(data['lyrics_string'])
features = vectorizer.get_feature_names()
predictions = km.predict(x)
predictions

array([ 0,  1, 27, ..., 27, 19, 19], dtype=int32)

In [8]:
cluster = predictions[-1]
cluster_labels = data[data.cluster == cluster].iloc[0]['frequent_words']
print cluster
print cluster_labels

19
shall fucking evil people eternal reality self existence gods escape shadows final control chaos stars minds destruction space endless future sea faith fuck void great grave sight suffering sorrow spirit


In [9]:
artist_sample = data[(data.cluster == cluster) & (data.artist != 'new')].sample(5)
artist_list = [row['artist'] for i, row in artist_sample.iterrows()]
print artist_list
song_sample = pd.DataFrame()
for i in range(0,5):
    song_sample = song_sample.append(original_data[original_data.artist == artist_list[i]].sample(1))
song_sample.head()

[u'audiomachy', u'maln\xe0tt', u'delusions of godhood', u'perdition', u'ignotus enthropya']


Unnamed: 0,album,artist,name,year,lyrics
44085,Whoopsy Stabby,audiomachy,Oscillicision,2011,[Various things were said but I don't think th...
6974,La Voce Dei Morti,malnàtt,I Felt A Funeral,2008,"[[Lyrics by Emily Dickinson], I felt a funeral..."
73936,Dreamscape,delusions of godhood,Through The Jaws Of Perdition,2015,"[[feat. Urban Granbacke], Beyond this mortal c..."
2099,This Wretched World,perdition,The Village,2012,"[Bring unto them their worst, They stand toget..."
3675,Solid Silver,ignotus enthropya,Extinction,2014,"[A world full of people, Crowded and diseased ..."


In [11]:
resulting_dict = song_sample.to_dict(orient='index')
resulting_dict

{2099: {u'album': u'This Wretched World',
  u'artist': u'perdition',
  'lyrics': [u'Bring unto them their worst',
   u'They stand together as one of their brotherhood',
   u'Fed from the food of their morals',
   u'Colonized and accepted ideals raised into society',
   u'A matter of time is only needed:',
   u'The village will collapse upon itself, a twisted colony of beautiful perfection ',
   u'[2x]',
   u'Corrupt by fake ideals and a faux path to safety ',
   u'[2x]',
   u'The village will bring destruction, branching outward towards every last crevice of the earth',
   u'How can you justify this? ',
   u'[2x]',
   u'The disgust of ego married with arrogance dressed in white',
   u'A grotesque picture praised by the village like a false idol',
   u'The village will bring violence',
   u'Violence will bring destruction',
   u'Destruction will bring your end',
   u'The village is you, composed of your ideals, composed of your lies',
   u'Composed of your ideals, composed of your lies.