## data collection and preprocessing



In [1]:
% matplotlib inline

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# We do this to ignore several specific Pandas warnings
import warnings
import requests
import json
warnings.filterwarnings("ignore")

### Extract data from Million Song Subset which is 1.8G in size

In [2]:
import os, sys

dir_tree = 'C:/Users/hp/Desktop/Music-Mood-Detection-using-Text-Mining-on-Lyrics--master/text mining on music lyrics/data/MSM'

for dir_path, dir_names, file_names in os.walk(dir_tree):
    for file_name in file_names:
        try:
            os.rename(os.path.join(dir_path, file_name), os.path.join(dir_tree, file_name))
        except OSError:
            print ("Could not move %s " % os.join(dir_path, file_name))

### Build an artist table with file,title, artist columns

In [3]:
def make_artist_table(base):

# Get file names

    files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
    data = {'file':[], 'artist':[], 'title':[]}

    # Add artist and title data to dictionary
    for f in files:
        store = pd.HDFStore(f)
        title = store.root.metadata.songs.cols.title[0]
        artist = store.root.metadata.songs.cols.artist_name[0]
        data['file'].append(os.path.basename(f))
        data['title'].append(title.decode("utf-8"))
        data['artist'].append(artist.decode("utf-8"))
        store.close()
    
    # Convert dictionary to pandas DataFrame
    df = pd.DataFrame.from_dict(data, orient='columns')
    df = df[['file', 'artist', 'title']]
    return df

In [4]:
base = 'C:/Users/hp/Desktop/Music-Mood-Detection-using-Text-Mining-on-Lyrics--master/text mining on music lyrics/data/MSM'
df = make_artist_table(base)

df.tail()

Unnamed: 0,file,artist,title
259,TRBIFNO128F42702FB.h5,Tha Liks,Intro
260,TRBIFOK128EF362D20.h5,The Pharcyde,I'm That Type Of Ni**a
261,TRBIFPI12903CEAFB6.h5,John D. Loudermilk,Somebody Sweet
262,TRBIFPX12903CCF859.h5,Cornell Campbell,Just My Imagination
263,TRBIFUD128F1495AE2.h5,Hot Boys,Young Riders


### Add the lyrics column

In [5]:
df['lyrics'] = pd.Series('', index=df.index)
df.tail()

Unnamed: 0,file,artist,title,lyrics
259,TRBIFNO128F42702FB.h5,Tha Liks,Intro,
260,TRBIFOK128EF362D20.h5,The Pharcyde,I'm That Type Of Ni**a,
261,TRBIFPI12903CEAFB6.h5,John D. Loudermilk,Somebody Sweet,
262,TRBIFPX12903CCF859.h5,Cornell Campbell,Just My Imagination,
263,TRBIFUD128F1495AE2.h5,Hot Boys,Young Riders,


### download the PyLyrics package to download lyrics from the website

In [6]:
#!pip install PyLyrics

In [7]:
from PyLyrics import *

In [8]:
 from PyLyrics import *
## test this function
print(PyLyrics.getLyrics('justin bieber','Sorry')) #Print the lyrics directly

You gotta go and get angry at all of my honesty
You know I try but I don't do too well with apologies
I hope I don't run out of time, could someone call the referee?
'Cause I just need one more shot at forgiveness

I know you know that I made those mistakes maybe once or twice
And by once or twice I mean maybe a couple a hundred times
So let me, oh let me redeem, oh redeem, oh myself tonight
'Cause I just need one more shot, second chances

Is it too late now to say sorry?
'Cause I'm missing more than just your body, oh
Is it too late now to say sorry?
Yeah, I know that I let you down
Is it too late to say I'm sorry now?

I'm sorry, yeah
Sorry, yeah
Sorry
Yeah, I know that I let you down
Is it too late to say I'm sorry now?

I'll take every single piece of the blame if you want me to
But you know that there is no innocent one in this game for two
I'll go, I'll go and then you go, you go out and spill the truth
Can we both say the words and forget this?

Yeah, is it too late now to say 

In [9]:
#!pip install pyprind

In [10]:
import pyprind

### download lyrics with the arguments of artist and track name

In [11]:
tpbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    try:
        lyr = PyLyrics.getLyrics(df.loc[row_id]['artist'],df.loc[row_id]['title'])    
        df.loc[row_id,'lyrics'] = lyr
        pbar.update()
    except: #ignore erro when API returns no lyrics 
        continue


In [12]:
print('downloaded Lyrics for %s songs' %sum(df.lyrics!=''))
df.head()

downloaded Lyrics for 88 songs


Unnamed: 0,file,artist,title,lyrics
0,TRAAAAW128F429D538.h5,Casual,I Didn't Mean To,Verse One:\n\nAll right I might\nHave had a li...
1,TRAAABD128F429CF47.h5,The Box Tops,Soul Deep,"Darling, I don't know much\nBut I know I love ..."
2,TRAAADZ128F9348C2E.h5,Sonora Santanera,Amor De Cabaret,
3,TRAAAEF128F4273421.h5,Adam Ant,Something Girls,Adam Ant/Marco Pirroni\nEvery girl is a someth...
4,TRAHXZJ128F930C784.h5,Seamus Egan,McDermott's Fancy / Swans Among The Rushes / C...,


In [13]:
df.to_csv('df_lyr_backup.csv')

### drop rows that has no lyrics

In [14]:
df = df[df.lyrics!='']

### remove songs that is not English song

In [15]:
import nltk
#nltk.download('words')
def eng_ratio(text):
    ''' Returns the ratio of non-English to English words from a text '''

    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha()) 
    unusual = text_vocab.difference(english_vocab)
    diff = len(unusual)/len(text_vocab)
    return diff

In [16]:
before = df.shape[0]
for row_id in df.index:
    text = df.loc[row_id]['lyrics']
    diff = eng_ratio(text)
    if diff >= 0.5:
        df = df[df.index != row_id]
after = df.shape[0]
rem = before - after
print('%s have been removed.' %rem)
print('%s songs remain in the dataset.' %after)

9 have been removed.
79 songs remain in the dataset.


### Till now, we got the songs with lyrics, but we have to tag each song with mood. Here I download the tags from Last.fm and classified each some with happy mood or sad mood

In [17]:
def getSongTags(artist,track):
    url = "http://ws.audioscrobbler.com/2.0/?method=track.getTopTags&api_key=0f6916aff634cb3e768baa9d5ee89341&artist="+artist+"&track="+track+"&format=json"
#     print(url)
    results = requests.get(url).json()
#     print(results)
    tagList = []
    if 'toptags' in results:
        toptags = results['toptags']
        if 'tag' in toptags:
            taglistss = toptags['tag']           
            for tagItem in taglistss:
                tagList.append(tagItem['name']) 
    return tagList

In [18]:
df['tags'] = ''
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    tags = getSongTags(df.loc[row_id]['artist'],df.loc[row_id]['title'])    
    df.loc[row_id,'tags'] = tags
    pbar.update()
    


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:01


In [20]:
getSongTags("The Weeknd","Call Out My Name")

['rnb',
 'soul',
 'urban',
 'r&b',
 'trap',
 'alternative rnb',
 'sad',
 'Love',
 'Canadian',
 'love at first listen',
 'Selena Gomez',
 'heartbreaker',
 'The Weeknd',
 'Selena',
 '2018',
 '2018 single',
 'obrigado pelos mimos selena',
 'selena amorr']

In [21]:
df['tags']
# df.head()

0      [Bay Area, hieroglyiphics, Hip-Hop, classic, h...
1      [60s, soul, pop, rock, oldies, 1969, classic r...
3      [new wave, 80s, girls, freedom, strangeromanti...
5                                      [guitar virtuoso]
6      [blues, delta blues, mississippi, high and low...
11                 [rhcp, Genialne, Ive Sung At Karaoke]
12                                                    []
14     [jazz, latin, cover, lounge, summer, covers, e...
21                    [underground hip hop, likwit crew]
29     [pop, rnb, jennifer lopez, dance, latin, femal...
33                             [christian rock, worship]
35     [loved, Southern Rock, post-hardcore, southern...
38                                 [Contemporary Gospel]
44                                                    []
47     [female vocalists, pop, acoustic, indie, singe...
48     [covers, cover, insane, heard on Pandora, good...
59     [post-hardcore, alternative rock, rock, powerf...
61     [blues, rock n roll, cou

In [22]:
for row_id in df.index:     
    if len(df.loc[row_id,'tags'])==0:
        df = df.drop(row_id)
    

In [24]:
df['year'] = pd.Series('', index=df.index)

base = 'C:/Users/hp/Desktop/Music-Mood-Detection-using-Text-Mining-on-Lyrics--master/text mining on music lyrics/data/MSM'
files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
for row_id in df.index:
    filename = df.loc[row_id]['file']
    filepath = os.path.join(base,filename)
    store = pd.HDFStore(filepath)
    year = store.root.musicbrainz.songs.cols.year[0]
    df.loc[row_id]['year'] = year

<img src="image.png">

In [26]:
df.shape[0]

65

## Happy or Sad
### Group id	Tags	num. of tags	num. of songs
#### sad tags:

G15	sad, sadness, unhappy, melancholic, melancholy, feeling sad, mood: sad - slightly, sad song	8	1,178

G16	depressed, blue, dark, depressive, dreary, gloom, darkness, depress, depression, depressing, gloomy	11	471

G28	anger, angry, choleric, fury, outraged, rage, angry music	7	254

G17	grief, heartbreak, mournful, sorrow, sorry, doleful, heartache, heartbreaking, heartsick, lachrymose, mourning, plaintive, regret, sorrowful	14	183

#### happy tags:
G6	cheerful, cheer up, festive, jolly, jovial, merry, cheer, cheering, cheery, get happy, rejoice, songs that are cheerful, sunny	13	142

G5	happy, happiness, happy songs, happy music, glad, mood: happy	6	749

G2	upbeat, gleeful, high spirits, zest, enthusiastic, buoyancy, elation, mood: upbeat	8	543

G1	excitement, exciting, exhilarating, thrill, ardor, stimulating, thrilling, titillating	8	30
TOTAL		135	6,490

### This tag summary comes from the last.fm website which were group into different categories. Here, I choose group 15,16,28,17 as sad tag and group 5,6,2,1 as happy songs

In [27]:
happyTags = "cheerful, cheer up, festive, jolly, jovial, merry, cheer, cheering,\
cheery, get happy, rejoice, songs that are cheerful, sunny,happy, happiness, happy songs, happy music, glad, mood: happy,\
upbeat, gleeful, high spirits, zest, enthusiastic, buoyancy, elation, mood: upbeat,excitement, exciting, exhilarating, thrill,\
ardor, stimulating, thrilling, titillating"
happyTags = happyTags.replace(" ","").split(",")

sagTags = "sad, sadness, unhappy, melancholic, melancholy, feeling sad, mood: sad - slightly, sad song,\
depressed, blue, dark, depressive, dreary, gloom, darkness, depress, depression, depressing, gloomy,\
anger, angry, choleric, fury, outraged, rage, angry music,grief, heartbreak, mournful, sorrow, sorry, doleful, heartache, heartbreaking, heartsick, lachrymose, mourning,\
plaintive, regret, sorrowful"
sagTags = sagTags.replace(" ","").split(",")

In [28]:
happyTags

['cheerful',
 'cheerup',
 'festive',
 'jolly',
 'jovial',
 'merry',
 'cheer',
 'cheering',
 'cheery',
 'gethappy',
 'rejoice',
 'songsthatarecheerful',
 'sunny',
 'happy',
 'happiness',
 'happysongs',
 'happymusic',
 'glad',
 'mood:happy',
 'upbeat',
 'gleeful',
 'highspirits',
 'zest',
 'enthusiastic',
 'buoyancy',
 'elation',
 'mood:upbeat',
 'excitement',
 'exciting',
 'exhilarating',
 'thrill',
 'ardor',
 'stimulating',
 'thrilling',
 'titillating']

In [29]:
sagTags

['sad',
 'sadness',
 'unhappy',
 'melancholic',
 'melancholy',
 'feelingsad',
 'mood:sad-slightly',
 'sadsong',
 'depressed',
 'blue',
 'dark',
 'depressive',
 'dreary',
 'gloom',
 'darkness',
 'depress',
 'depression',
 'depressing',
 'gloomy',
 'anger',
 'angry',
 'choleric',
 'fury',
 'outraged',
 'rage',
 'angrymusic',
 'grief',
 'heartbreak',
 'mournful',
 'sorrow',
 'sorry',
 'doleful',
 'heartache',
 'heartbreaking',
 'heartsick',
 'lachrymose',
 'mourning',
 'plaintive',
 'regret',
 'sorrowful']

### Based on the tag numbers from sad group or the happy group, we can assign a mood value 1(happy) or 0(sad) to the mood column 

In [30]:
df['mood']=""
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    tags = df.loc[row_id,'tags']    
    sad_tags = np.intersect1d(tags,sagTags) 
    happy_tags = np.intersect1d(tags,happyTags)
    if len(sad_tags)>0 or len(happy_tags)>0:# having mood tag
        if len(sad_tags)>len(happy_tags):
            df.loc[row_id,'mood'] = 0
        else:
            df.loc[row_id,'mood'] = 1
    else:
        df = df.drop(row_id)# remove songs that does not have tag
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [31]:
df.shape[0]

16

In [32]:
df['mood']

29     1
47     1
68     0
81     0
102    1
118    1
122    0
139    1
152    0
164    0
169    0
171    0
193    1
228    0
231    1
257    1
Name: mood, dtype: object

In [33]:
df

Unnamed: 0,file,artist,title,lyrics,tags,year,mood
29,TRBGASN128F427BF8F.h5,Jennifer Lopez,I'm Glad,"Baby, when I think about...\nThe day that we f...","[pop, rnb, jennifer lopez, dance, latin, femal...",2002,1
47,TRBGEUJ128F42B06B8.h5,Sandi Thom,I Wish I Was A Punk Rocker (With Flowers In My...,"Oh, I wish I was a punk rocker with flowers in...","[female vocalists, pop, acoustic, indie, singe...",2006,1
68,TRBGNCL128F428F56F.h5,Gwen Stefani,4 In The Morning,Waking up to find another day\nThe moon got lo...,"[pop, female vocalists, gwen stefani, Love, da...",2006,0
81,TRBGRIP128F147FCA6.h5,Phil Collins,Everyday,"I got lost, couldn't find my way\nAnd I guess ...","[pop, Phil Collins, 80s, soft rock, easy liste...",1993,0
102,TRBGXQP128F4290B22.h5,Roger Sanchez,Lost,"I saw a picture yesterday,\nYou know the one, ...","[House, dance, electronic, roger sanchez, deep...",2006,1
118,TRBGYVK128F426A0D6.h5,Joey Negro,Make A Move On Me,Come on make a move on me\n\nCome on make a mo...,"[House, dance, electronic, funky house, club, ...",2005,1
122,TRBGZKO128F92DB016.h5,Bloc Party,Zephyrus,"Backwards, forwards but making no ground at al...","[indie rock, alternative, british, Bloc Party,...",2008,0
139,TRBHEHF128F428385A.h5,George Michael,Amazing,I was a mixed up when you came to me\nToo brok...,"[pop, dance, George Michael, 80s, easy listeni...",2004,1
152,TRBHHUC128F4294C6F.h5,Scott Matthews,City Headache,City headache subsides your dreary tone\nSend ...,"[folk, singer-songwriter, acoustic, indie, Mel...",2006,0
164,TRBHKXX128F4252D02.h5,Radiohead,Everything In Its Right Place,"Kid A, Kid A\nKid A, Kid A\n\nEverything\nEver...","[alternative, electronic, radiohead, alternati...",2000,0


In [34]:
df['year'] = pd.Series('', index=df.index)

base = 'C:/Users/hp/Desktop/Music-Mood-Detection-using-Text-Mining-on-Lyrics--master/text mining on music lyrics/data/MSM'
files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
for row_id in df.index:
    filename = df.loc[row_id]['file']
    filepath = os.path.join(base,filename)
    store = pd.HDFStore(filepath)
    year = store.root.musicbrainz.songs.cols.year[0]
    df.loc[row_id]['year'] = year

In [35]:
df

Unnamed: 0,file,artist,title,lyrics,tags,year,mood
29,TRBGASN128F427BF8F.h5,Jennifer Lopez,I'm Glad,"Baby, when I think about...\nThe day that we f...","[pop, rnb, jennifer lopez, dance, latin, femal...",2002,1
47,TRBGEUJ128F42B06B8.h5,Sandi Thom,I Wish I Was A Punk Rocker (With Flowers In My...,"Oh, I wish I was a punk rocker with flowers in...","[female vocalists, pop, acoustic, indie, singe...",2006,1
68,TRBGNCL128F428F56F.h5,Gwen Stefani,4 In The Morning,Waking up to find another day\nThe moon got lo...,"[pop, female vocalists, gwen stefani, Love, da...",2006,0
81,TRBGRIP128F147FCA6.h5,Phil Collins,Everyday,"I got lost, couldn't find my way\nAnd I guess ...","[pop, Phil Collins, 80s, soft rock, easy liste...",1993,0
102,TRBGXQP128F4290B22.h5,Roger Sanchez,Lost,"I saw a picture yesterday,\nYou know the one, ...","[House, dance, electronic, roger sanchez, deep...",2006,1
118,TRBGYVK128F426A0D6.h5,Joey Negro,Make A Move On Me,Come on make a move on me\n\nCome on make a mo...,"[House, dance, electronic, funky house, club, ...",2005,1
122,TRBGZKO128F92DB016.h5,Bloc Party,Zephyrus,"Backwards, forwards but making no ground at al...","[indie rock, alternative, british, Bloc Party,...",2008,0
139,TRBHEHF128F428385A.h5,George Michael,Amazing,I was a mixed up when you came to me\nToo brok...,"[pop, dance, George Michael, 80s, easy listeni...",2004,1
152,TRBHHUC128F4294C6F.h5,Scott Matthews,City Headache,City headache subsides your dreary tone\nSend ...,"[folk, singer-songwriter, acoustic, indie, Mel...",2006,0
164,TRBHKXX128F4252D02.h5,Radiohead,Everything In Its Right Place,"Kid A, Kid A\nKid A, Kid A\n\nEverything\nEver...","[alternative, electronic, radiohead, alternati...",2000,0


In [36]:
df.to_csv('lyrics_166.csv', index=False,encoding='utf-8')

In [37]:
df = df.drop("tags",axis=1)

In [38]:
len(df[df.mood==1])/166


0.04819277108433735

In [39]:
# save songs with mood tag into csv file for training and keywords extraction
df.to_csv('lyrics_166.csv', index=False,encoding='utf-8')