# Word usage by multiple artists

It obtains all lyrics for a given set of artists (parsing HTML from www.metrolyrics.com) and returns a comparison of the words used by the artists.
Packages required: requests | re | time |

In [1]:
import sys
# Put the sibling ../src directory first on the module search path so that
# lyricsFunctions can be imported from there.
sys.path.insert(0, '../src')
# NOTE(review): the helpers called below without a local definition
# (getSongs4Artists, getLyrics4Artists, tokenLyrics4Artists,
# buildNaiveBayesModel, proba_Lyrics4Artists, analysis_gensimModels)
# presumably come from this wildcard import -- confirm, and consider
# importing them by name so their origin is explicit.
from lyricsFunctions import *

In [2]:
# Artists whose vocabulary will be compared.
listOfArtists = ['Eddie Vedder', 'Dire Straits', 'Ed Sheeran']
# Look up the songs of every artist in the list. The result is consumed by
# getLyrics4Artists below and, per the commented line, is indexable by a
# hyphenated lower-case artist key such as 'eddie-vedder'.
songs4Artists = getSongs4Artists(listOfArtists) #by default (location = 'online')
#songs4Artists['eddie-vedder'] # Check artist songs found

Fetching eddie-vedder songs online ...
http://www.metrolyrics.com/eddie-vedder-lyrics.html
Fetching dire-straits songs online ...
http://www.metrolyrics.com/dire-straits-lyrics.html
Fetching ed-sheeran songs online ...
http://www.metrolyrics.com/ed-sheeran-lyrics.html
Done!


For each artist, collect all words across all song lyrics:

In [3]:
# Fetch lyrics for the songs found above, asking for 20 songs per artist
# instead of the default 5. Later cells index the result by artist key
# (e.g. 'eddie-vedder') and join its entries as strings.
lyrics4Artists = getLyrics4Artists(songs4Artists, numSongs = 20) # by default (numSongs = 5 | location = 'online')

Fetching 20 lyrics for eddie-vedder ...
Fetching 20 lyrics for dire-straits ...
Fetching 20 lyrics for ed-sheeran ...
Done!


Word extraction from the artists' lyrics and model generation:

In [None]:
# Tokenize the lyrics with the 'countVectorize' method. Of the six returned
# values, this notebook later uses: X and labels (model fitting, next line),
# cv and tf (passed to proba_Lyrics4Artists), cv.vocabulary_ (word index for
# the LDA/PCA tables) and vec (input to the LDA fit).
# NOTE(review): the exact type of each returned object is defined in
# lyricsFunctions and is not visible here -- confirm there.
X, cv, vec, tf, labels, lyrics = tokenLyrics4Artists(lyrics4Artists, method='countVectorize')
# Fit the classifier that is used below to score unseen lyric lines.
m = buildNaiveBayesModel(X, labels)

What is the probability that each artist wrote the following songs?

In [None]:
# Short lyric fragments to score against the fitted model.
test_songs = [
      "I played the blues on twelve bars down on Lover's Lane",
      "I'll keep on healing all the scars That we've collected from the start",
      "So, baby, now Take me into your loving arms",
      "with a little help from my crocodile",
      "oh sweet love, what is this",
      "the beautiful people"]

# Score the fragments with the model (m) and the fitted cv / tf objects from
# the tokenization cell. logProb is reused by the principal-components cell.
# NOTE(review): the precise meaning of prediction / classProb / logProb is
# defined in lyricsFunctions -- confirm there.
prediction, classProb, logProb = proba_Lyrics4Artists(test_songs, m, cv, tf)


 Test songs might belong to:
['dire-straits' 'eddie-vedder' 'ed-sheeran' 'dire-straits' 'ed-sheeran'
 'eddie-vedder']

 Each song probability from being from each artist:
[[0.33333333 0.33333333 0.33333333]
 [0.19243111 0.26176122 0.54580766]
 [0.34739105 0.48218501 0.17042394]
 [0.33333333 0.33333333 0.33333333]
 [0.18398068 0.5970597  0.21895962]
 [0.2689354  0.32663697 0.40442763]]
[[-3.48106978 -3.79123309 -4.44502338 -4.07861287 -3.68494692 -3.43483466
  -4.33204056 -4.0201482  -4.42184004 -3.77976115 -3.41250767 -3.30932341
  -3.99825536 -4.22120126 -4.27774692 -3.64934928 -3.44799872 -3.74204074
  -4.11573846 -3.80013857 -3.49285035 -3.62030774 -4.13159718 -4.02748674
  -3.41040474 -3.60363609 -3.67119717 -4.13342596 -4.50152709 -3.71326611
  -4.01363759 -4.07872185 -4.26792197 -3.90221334 -3.4190417  -4.16032034
  -4.1878773  -4.04196922 -4.02552189 -3.6335321  -4.48072712 -4.03089658
  -4.19065933 -3.34341734 -3.70281448 -3.47108615]
 [-4.22008075 -3.46335631 -3.69767867 -3.5

# Which words are characteristic of each artist?

## Bag of Words

In [None]:
# Flatten every artist's lyrics into one space-separated string; the
# word-cloud cell consumes these three variables.
allWordsEddieVedder, allWordsDireStraits, allWordsEdSheeran = [
    " ".join(lyrics4Artists[artist_key])
    for artist_key in ('eddie-vedder', 'dire-straits', 'ed-sheeran')
]

In [None]:
import wordcloud
import matplotlib.pyplot as plt

# One shared title size for all panels (the original mixed 18 and 16).
TITLE_FONTSIZE = 16

# (panel title, text) pairs, drawn top to bottom in this order.
artist_texts = [
    ('eddie-vedder', allWordsEddieVedder),
    ('dire-straits', allWordsDireStraits),
    ('ed-sheeran', allWordsEdSheeran),
]

plt.figure(num=None, figsize=(20, 20))

# Keep the generated clouds around, keyed by artist, in case a later cell
# wants to inspect or save them.
wordclouds = {}
for position, (artist, text) in enumerate(artist_texts, start=1):
    cloud = wordcloud.WordCloud(
        background_color="white",
        max_words=2000,
        contour_color='steelblue',
    ).generate(text)
    wordclouds[artist] = cloud

    # One row per artist, single column.
    plt.subplot(len(artist_texts), 1, position)
    plt.title(artist, fontsize=TITLE_FONTSIZE, loc='right')
    plt.imshow(cloud, interpolation='bilinear')

plt.show()

<Figure size 2000x2000 with 3 Axes>

## Vocabulary that best characterizes the differences between artists

In [None]:
#### Latent Dirichlet Allocation
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

N_TOPICS = 10      # number of latent topics to fit
TOP_WORDS = 20     # how many top-weighted words to print per topic
LDA_SEED = 0       # fixed seed so the printed topics are reproducible

# Vocabulary terms in alphabetical order, used as row labels below.
# NOTE(review): this assumes the columns of `vec` follow the same
# alphabetical vocabulary order -- confirm in tokenLyrics4Artists.
wordVectors4Artists = sorted(cv.vocabulary_)

lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=LDA_SEED)
lda.fit(vec)
c = lda.components_

# Transpose so that rows are words and columns are topics.
ctrans = c.T

df = pd.DataFrame(ctrans, index=wordVectors4Artists)

# Print the highest-weighted words of every topic.
for i in range(N_TOPICS):
    print(df[i].sort_values(ascending=False).head(TOP_WORDS))

away       28.623457
just       17.386976
hand       16.769050
tonight    13.272100
know        9.263679
heart       8.266608
cause       6.947180
way         6.913296
home        5.985650
eyes        4.796726
old         4.605908
night       4.487066
look        4.099975
time        3.628698
ve          3.424421
love        2.641623
oh          0.756410
right       0.100731
got         0.100055
life        0.100046
Name: 0, dtype: float64
gonna     28.873492
don       23.916052
know      10.433055
right      8.956631
baby       8.713524
need       6.250706
ve         5.355179
day        5.150983
say        5.097462
oh         4.099984
yeah       4.042354
long       3.708463
let        3.594336
fall       3.100024
heart      2.333905
make       2.323333
got        2.104840
just       1.992104
people     1.679233
like       1.511514
Name: 1, dtype: float64
long       48.491444
feel       17.936929
love       16.719125
home       11.035059
say         6.862082
ve          5.414832
need  

In [None]:
### Principal components

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

N_COMPONENTS = 3   # principal components to extract
TOP_WORDS = 20     # words listed per component

# Row labels: vocabulary terms in alphabetical order.
wordVectors4Artists = sorted(cv.vocabulary_)

# Transpose logProb before projecting it onto the principal components, so
# the projected rows can be labelled with the vocabulary terms.
pca = PCA(n_components=N_COMPONENTS)
logProb_T = np.transpose(logProb)
c = pca.fit_transform(logProb_T)
df = pd.DataFrame(c, index=wordVectors4Artists)

# Share of variance explained by each component, one labelled row each.
ratio_labels = [f'variance_ratio PC-{k}' for k in range(1, N_COMPONENTS + 1)]
varianceDf = pd.DataFrame(pca.explained_variance_ratio_, index=ratio_labels)
print(varianceDf)

# For every component, report its variance share and its top-scoring words.
for i in range(N_COMPONENTS):
    print(f"\n Principal component {i + 1} holds {varianceDf.iloc[i, 0]} of the variation \n")
    top_words = df[i].sort_values(ascending=False).head(TOP_WORDS)
    print(top_words)


                            0
variance_ratio PC-1  0.392267
variance_ratio PC-2  0.311976
variance_ratio PC-3  0.295756

 Principal component 1 holds 0.3922672152301272 of the variation 

ll         0.937231
oh         0.871612
love       0.836068
know       0.787409
ve         0.633399
long       0.490924
need       0.428399
feel       0.406060
tonight    0.264659
like       0.235196
way        0.181211
eyes       0.161904
heart      0.161559
don        0.134586
let        0.094435
just       0.073936
free       0.050328
come       0.044471
time       0.018920
home      -0.019395
Name: 0, dtype: float64

 Principal component 2 holds 0.3119763314169067 of the variation 

love     0.981261
baby     0.721516
don      0.714739
home     0.564103
just     0.410880
come     0.326905
look     0.326494
night    0.305940
man      0.300805
oh       0.292593
cause    0.256917
right    0.241394
eyes     0.193917
time     0.145803
make     0.121843
old      0.091453
play     0.081566
like     0.069

# Obtain word similarity per artist
### Deep Learning using Word Embeddings

### 1. Gensim models:

In [None]:
# Run the gensim analysis on Eddie Vedder's lyrics, comparing 'love' against
# 'right' and against 'wrong'; the helper prints both values and their
# difference. NOTE(review): how the helper trains and scores is defined in
# lyricsFunctions -- confirm there.
analysis_gensimModels(lyrics4Artists['eddie-vedder'], 'love', 'right', 'wrong')


Distance between: 
love and right 0.03530192
love and wrong 0.2852222

Difference between right and wrong: -24.99%


In [None]:
# Run the gensim analysis on Dire Straits' lyrics, comparing 'love' against
# 'right' and against 'wrong'; the helper prints both values and their
# difference. NOTE(review): how the helper trains and scores is defined in
# lyricsFunctions -- confirm there.
analysis_gensimModels(lyrics4Artists['dire-straits'], 'love', 'right', 'wrong')


Distance between: 
love and right 0.69960564
love and wrong 0.22024722

Difference between right and wrong: 47.94%


In [None]:
# Run the gensim analysis on Ed Sheeran's lyrics, comparing 'love' against
# 'right' and against 'wrong'; the helper prints both values and their
# difference. NOTE(review): how the helper trains and scores is defined in
# lyricsFunctions -- confirm there.
analysis_gensimModels(lyrics4Artists['ed-sheeran'], 'love', 'right', 'wrong')


Distance between: 
love and right 0.99618876
love and wrong 0.9768212

Difference between right and wrong: 1.94%


# Pretrained GenSim Vectors
###### This is a 4 GB file that will be loaded to memory. It will require a lot of RAM.

In [None]:
import gzip
import os
import shutil
from keras.utils import get_file
import gensim
import subprocess

MODEL = 'GoogleNews-vectors-negative300.bin'

# Download the gzipped vectors (get_file returns the local path of the archive).
path = get_file(MODEL + '.gz', 'https://s3.amazonaws.com/dl4j-distribution/%s.gz' % MODEL)

# Directory that holds the decompressed model.
os.makedirs('generated', exist_ok=True)

unzipped = os.path.join('generated', MODEL)
if not os.path.isfile(unzipped):
    # Decompress with the standard library rather than an external `zcat`
    # process: no dependency on a system binary, both file handles are
    # closed deterministically, and errors surface as exceptions.
    # Write to a temporary name first so that an interrupted run never
    # leaves a truncated file that a later run would treat as complete.
    partial = unzipped + '.part'
    try:
        with gzip.open(path, 'rb') as fin, open(partial, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
        os.replace(partial, unzipped)
    finally:
        # Only still present if decompression failed part-way through.
        if os.path.exists(partial):
            os.remove(partial)

# NOTE: the decompressed file is several GB and is loaded fully into memory.
model = gensim.models.KeyedVectors.load_word2vec_format(unzipped, binary=True)

Using TensorFlow backend.


In [None]:
import sys

# Directory added to the module search path (relative to the notebook's cwd).
RAW_DATA_PATH = '../../../rawData'

# list.insert() returns None, so its result is not worth capturing or
# printing. Guard the insert so re-running the cell does not stack
# duplicate entries on sys.path.
if RAW_DATA_PATH not in sys.path:
    sys.path.insert(0, RAW_DATA_PATH)
print('sys.path includes:', RAW_DATA_PATH)

In [None]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classification of newsgroup messages into 20 different categories).

GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html

CODE FROM:
https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py
'''

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


# Root directory holding the GloVe vectors and the 20 Newsgroup corpus.
# Set the RAW_DATA_DIR environment variable to point at your own copy; the
# fallback is the original author's local path, kept so existing setups
# behave exactly as before.
BASE_DIR = os.environ.get(
    'RAW_DATA_DIR',
    '/Users/Magalangelo/Dropbox/_dataScienceRepos/portfolioDataScience/rawData',
)
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 1000  # sequences are padded/truncated to this length
MAX_NUM_WORDS = 20000       # vocabulary size cap passed to the Tokenizer
EMBEDDING_DIM = 50          # selects the glove.6B.<dim>d.txt file to load
VALIDATION_SPLIT = 0.2      # fraction of samples held out for validation

# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, f'glove.6B.{EMBEDDING_DIM}d.txt')
# Read as UTF-8 explicitly: relying on the platform default encoding can
# fail or mis-decode non-ASCII tokens on systems whose default is not UTF-8.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of raising IndexError.
            continue
        # Each line is "<word> <v1> <v2> ... <vN>".
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    # Every sub-directory is one class; ids are assigned in sorted order.
    class_dir = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(class_dir):
        continue
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(class_dir)):
        # Only purely numeric file names are treated as samples.
        if not fname.isdigit():
            continue
        fpath = os.path.join(class_dir, fname)
        # This cell already requires Python 3 (it uses f-strings), so the
        # encoding can be passed unconditionally; the old Python 2 branch
        # was unreachable.
        with open(fpath, encoding='latin-1') as f:
            t = f.read()
        i = t.find('\n\n')  # skip header
        if i > 0:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer label ids.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# (samples and labels are shuffled with the same permutation)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Split on a positive index. The previous negative slices broke when
# num_validation_samples was 0: data[:-0] is empty and data[-0:] is
# everything, which silently swapped the two sets.
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]

print('Preparing embedding matrix.')

# Row k of the matrix holds the pre-trained vector of the word whose
# tokenizer index is k; rows of words missing from the GloVe index stay zero.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    # Indices at or beyond the vocabulary cap get no row.
    if row < MAX_NUM_WORDS:
        pretrained = embeddings_index.get(token)
        if pretrained is not None:
            embedding_matrix[row] = pretrained

# Wrap the matrix in an Embedding layer. trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

print('Training model.')

# 1D convnet: two conv + max-pool stages, a third conv followed by global
# max pooling, then a dense layer and a softmax over the label set.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = embedded_sequences
for _ in range(2):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

# NOTE: this rebinds the notebook-level name `model`, replacing the gensim
# KeyedVectors object loaded in the pretrained-vectors cell.
model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)


# Label names ordered by their numeric id, so names[k] lines up with the
# k-th score produced by the model.
names = sorted(labels_index, key=labels_index.get)


def pred(s, threshold=0.1):
    """Print every label whose predicted score for text `s` exceeds `threshold`.

    The text is tokenized with the fitted `tokenizer`, padded to
    MAX_SEQUENCE_LENGTH and scored by `model`.

    Parameters
    ----------
    s : str
        Text to classify.
    threshold : float, optional
        Only labels scoring strictly above this value are printed.
        Defaults to 0.1, the value that was previously hard-coded.

    Returns
    -------
    None
        Results are printed, one "<label>\\t<score>" line per match.
    """
    encoded = tokenizer.texts_to_sequences([s])
    padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)
    # A single text was submitted, so only the first row of scores is used.
    scores = model.predict(padded)[0]
    for nam, score in zip(names, scores):
        if score > threshold:
            print(f"{nam:25}\t{score:6.2f}")


pred("god is a spaghetti monster floating in space")