In [1]:
import json
import re

import nltk
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
# Lookup table from lowercase English contractions (plus a few informal
# spellings such as "gonna" or "gimme") to their expanded form.
# clean_lyrics() looks up each lowercased token here before it strips
# apostrophes. Some expansions contain a capital "I"; clean_lyrics()
# lowercases the whole text afterwards, so the casing here does not matter.
# Ambiguous forms get one fixed expansion (e.g. "he'd" -> "he had",
# "it's" -> "it is"), so possessives and "would" readings are not distinguished.
contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "amn't": "am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "daren't": "dare not",
    "daresn't": "dare not",
    "dasn't": "dare not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "e'er": "ever",
    "everyone's": "everyone is",
    "finna": "going to",
    "gimme": "give me",
    "giv'n": "given",
    "gonna": "going to",
    "gon't": "go not",
    "gotta": "got to",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",    
    "he've": "he have",
    "howdy": "how do you do",
    "how're": "how are",
    "i'd": "I had",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "i'm'a": "I am about to",
    "i'm'o": "I am going to",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "ne'er": "never",
    "o'clock": "of the clock",
    "o'er": "over",
    "ol'": "old",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "somebody's": "somebody is",
    "someone's": "someone is",
    "something's": "something is",
    "so've": "so have",
    "so's": "so is",
    "so're": "so are", 
    "that'd": "that had",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "'tis": "it is",
    "'twas": "it was",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [3]:
def _strip_structure_markers(lyrics):
    """Remove section tags, stage directions and credits from raw lyrics text.

    The substitutions run in a fixed order because several patterns overlap.
    Matches are replaced by a space (or by nothing) so that neighbouring
    words never get glued together.
    """
    # Song structure tags or instructions wrapped in brackets / asterisks.
    # NOTE(review): both `.*` are greedy, so a single match spans from the
    # first opener to the last closer it can reach (up to the following
    # line) -- confirm this does not swallow real lyrics.
    lyrics = re.sub(r'[\*\[|\(|\{].*\n*.*[\]\)\}\*]', ' ', lyrics)

    # Variations of Verse 1, VERSE 2, etc. The digit part is a raw string:
    # '\d' inside a plain literal is an invalid escape sequence.
    for verse in ['verse', 'VERSE', 'Verse']:
        lyrics = re.sub(verse + r' \d*', '', lyrics)

    # Structure markers formatted as all caps without brackets.
    # NOTE(review): plain substring removal, so e.g. 'PRE' is also cut out
    # of longer all-caps words -- kept as-is to preserve the dataset.
    for word in ['OUTRO', 'INSTRUMENTAL', 'PRE', 'HOOK',
                 'PRODUCED', 'REFRAIN', 'POST', 'REPEAT', '2x', '3x', '4x',
                 'CHORUS', 'INTRO', 'INTERLUDE']:
        lyrics = lyrics.replace(word, '')

    # Variations of Chorus.
    lyrics = re.sub(r'\n*Chorus:*.*', ' ', lyrics)
    lyrics = re.sub(r'^Chorus:*.*', ' ', lyrics)
    lyrics = re.sub(r'\nRepeat [C|c]horus:*.*', ' ', lyrics)

    # Variations of Intro.
    lyrics = re.sub(r'Intro[\s|\n|:].*', ' ', lyrics)

    # Variations of Instrumental. The longer variants must run before the
    # bare '\nInstrumental' pattern; otherwise that pattern consumes their
    # prefix first and leaves residue such as the word "break" in the text.
    lyrics = re.sub(r'-+.*[i|I]nstrumental.*-+', ' ', lyrics)
    lyrics = re.sub(r'\nBrief instrumental.*\n', ' ', lyrics)
    lyrics = re.sub(r'\nInstrumental break', ' ', lyrics)
    lyrics = re.sub(r'\nInstrumental--', ' ', lyrics)
    lyrics = re.sub(r'\n~Instrumental~', ' ', lyrics)
    lyrics = re.sub(r'\nInstrumental', ' ', lyrics)

    # Variations of Bridge.
    lyrics = re.sub(r'\n\[*Bridge:\[*', ' ', lyrics)

    # Variations of Hook.
    lyrics = re.sub(r'Hook:.*', ' ', lyrics)

    # Variations of Repeat.
    lyrics = re.sub(r'Repeat\s.*', ' ', lyrics)
    lyrics = re.sub(r'\nRepeat$', ' ', lyrics)

    # Credits.
    lyrics = re.sub(r'.*[P|p]roduced [B|b]y.*', ' ', lyrics)
    lyrics = re.sub(r'.*[W|w]ritten [B|b]y.*', ' ', lyrics)

    # Strays and typos seen in specific records.
    lyrics = re.sub(r'\[Outro\[', ' ', lyrics)
    lyrics = re.sub(r'Sax & background & instrumental\)', ' ', lyrics)
    lyrics = re.sub(r'\nSource: ', ' ', lyrics)
    lyrics = re.sub(r'Shotgun 2: 58 Trk 1 \n  \nJr. Walker & The All Stars '\
                    +'\nAnd/or The Funk Brothers - instrumental \nPop Chart '\
                    +'#4 Feb 13, 1965 \nSoul Label - 35008   \n ', ' ', lyrics)
    lyrics = re.sub(r'- musical interlude -', ' ', lyrics)
    lyrics = re.sub(r'\nRefrain:', ' ', lyrics)

    return lyrics


def _expand_contractions(lyrics):
    """Expand whitespace-delimited tokens that are keys of `contractions`.

    Tokens are matched case-insensitively and replaced one by one. A global
    `str.replace` would let a short key such as "can't" rewrite part of a
    longer one such as "can't've" before the longer key gets its turn.
    Whitespace between tokens is collapsed to single spaces; by this point
    all newline-dependent patterns have already been applied.
    """
    return ' '.join(contractions.get(token.lower(), token)
                    for token in lyrics.split())


def clean_lyrics(lyrics):
    """Normalise raw lyrics into a lowercase, punctuation-free English string.

    Parameters
    ----------
    lyrics : list of str, str or None
        Raw lyrics. A list is joined with single spaces; a plain string is
        used as-is; None is passed through.

    Returns
    -------
    str or None
        The cleaned lyrics with surrounding whitespace stripped, or None when
        the input is None, nothing is left after cleaning, the text is a known
        placeholder, or `detect` does not report English (including the case
        where it raises LangDetectException).

    Notes
    -----
    NOTE(review): langdetect documents its results as non-deterministic
    unless DetectorFactory.seed is set; consider seeding it in a setup cell.
    """
    if lyrics is None:
        return lyrics

    # Combine a list of tokens into one string. Joining a str would insert a
    # space between every character, so strings are left untouched.
    if not isinstance(lyrics, str):
        lyrics = ' '.join(lyrics)

    lyrics = _strip_structure_markers(lyrics)
    lyrics = _expand_contractions(lyrics)

    # Drop remaining apostrophes, then turn all other punctuation into spaces.
    lyrics = lyrics.replace('\'', '')
    lyrics = re.sub(r'[^\w\s]', ' ', lyrics)

    # Collapse whitespace, lowercase, and strip so the exact-match checks
    # below are not defeated by a leading or trailing space.
    lyrics = re.sub(r'\s+', ' ', lyrics)
    lyrics = lyrics.lower().strip()

    # Nothing left: language detection would raise on empty text.
    if not lyrics:
        return None

    # Cheap placeholder checks run before the comparatively slow detection.
    if lyrics.startswith('we do not have the lyrics for') or lyrics == 'instrumental':
        return None

    try:
        language = detect(lyrics)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. digits only).
        return None
    if language != 'en':
        return None

    return lyrics

In [4]:
# Load the top-hit lyrics data from its JSON file.
with open('../data/top_hits_lyrics.json') as json_file:
    top_hits_lyrics = json.loads(json_file.read())

In [5]:
# Build the top-hit lyrics frame, add the cleaned lyrics, and keep only rows
# that have both a source and usable cleaned lyrics.
top_hits_lyrics_df = pd.DataFrame(top_hits_lyrics)
top_hits_lyrics_df['clean_lyrics'] = top_hits_lyrics_df['lyrics'].apply(clean_lyrics)
top_hits_lyrics_df = top_hits_lyrics_df.dropna(subset=['source', 'clean_lyrics'])

In [6]:
# Sanity check: (rows, columns) left after dropping rows without source or clean lyrics.
top_hits_lyrics_df.shape

(2769, 4)

In [7]:
# Load the top-hit song data that is turned into the audio-feature frame below.
with open('../data/top_hits.json') as json_file:
    top_hits = json.loads(json_file.read())

In [8]:
# Tabulate the loaded top-hit song data; this frame is merged with the lyrics frame on 'id'.
top_hits_audio_features_df = pd.DataFrame(top_hits)

In [9]:
# Inner join on 'id': only songs present in both the lyrics and the audio-feature frames survive.
top_hits_merged_df = top_hits_lyrics_df.merge(
    top_hits_audio_features_df,
    how='inner',
    on='id',
)

In [10]:
# Year = first four characters of the album's 'release_date'; decade = year with the last digit zeroed.
top_hits_merged_df['year'] = top_hits_merged_df['album'].map(
    lambda album: album.get('release_date')[:4]
)
top_hits_merged_df['decade'] = top_hits_merged_df['year'].map(
    lambda year: year[:3] + '0'
)

In [11]:
# Number of top hits (non-null ids) per decade.
top_hits_merged_df.groupby(['decade'])['id'].count()

decade
1960     14
1970    656
1980    706
1990    473
2000    507
2010    413
Name: id, dtype: int64

In [12]:
# Persist the merged top-hit frame.
# NOTE(review): to_json() already returns a JSON string, so the file ends up
# holding one JSON-encoded string and readers must decode it twice. Kept as-is
# because downstream consumers may rely on this format.
with open('../data/top_hits_merged_clean_lyrics_audio_features.json', 'w') as f:
    f.write(json.dumps(top_hits_merged_df.to_json()))

In [13]:
# Load the first batch of non-hit lyrics.
with open('../data/songs_lyrics_5000.json') as json_file:
    not_hits_1 = json.loads(json_file.read())

In [14]:
# Load the second batch of non-hit lyrics.
with open('../data/songs_lyrics_10000.json') as json_file:
    not_hits_2 = json.loads(json_file.read())

In [15]:
# Concatenate both batches into one list (first batch first).
not_hits_lyrics = [*not_hits_1, *not_hits_2]

In [16]:
# Save the combined non-hit lyrics so later runs can read a single file.
with open('../data/not_hits_lyrics.json', 'w') as f:
    f.write(json.dumps(not_hits_lyrics))

In [17]:
# Build the non-hit lyrics frame, add the cleaned lyrics, and keep only rows
# that have both a source and usable cleaned lyrics.
not_hits_lyrics_df = pd.DataFrame(not_hits_lyrics)
not_hits_lyrics_df['clean_lyrics'] = not_hits_lyrics_df['lyrics'].apply(clean_lyrics)
not_hits_lyrics_df = not_hits_lyrics_df.dropna(subset=['source', 'clean_lyrics'])

In [18]:
# Sanity check: (rows, columns) left after dropping rows without source or clean lyrics.
not_hits_lyrics_df.shape

(7597, 4)

In [19]:
# Load the song data that is turned into the non-hit audio-feature frame below.
with open('../data/songs.json') as json_file:
    not_hits = json.loads(json_file.read())

In [20]:
# Tabulate the loaded non-hit song data; this frame is merged with the lyrics frame on 'id'.
not_hits_audio_features_df = pd.DataFrame(not_hits)

In [21]:
# Inner join on 'id': only songs present in both the lyrics and the audio-feature frames survive.
not_hits_merged_df = not_hits_lyrics_df.merge(
    not_hits_audio_features_df,
    how='inner',
    on='id',
)

In [22]:
# Year = first four characters of the album's 'release_date'; decade = year with the last digit zeroed.
not_hits_merged_df['year'] = not_hits_merged_df['album'].map(
    lambda album: album.get('release_date')[:4]
)
not_hits_merged_df['decade'] = not_hits_merged_df['year'].map(
    lambda year: year[:3] + '0'
)

In [23]:
# Top-hit count per decade; used as the per-decade sample size for the non-hits.
decades_top_hits = top_hits_merged_df.groupby(['decade'])['id'].count()

In [24]:
# For every decade, draw as many non-hit songs as there are top hits in that
# decade (or all available non-hits when there are fewer), then stack the
# per-decade samples into one frame.
SAMPLE_SEED = 42  # fixed seed so the sampled subset is identical on every run
appended_data = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for decade, decade_count in decades_top_hits.items():
    temp_df = not_hits_merged_df[not_hits_merged_df['decade'] == decade]
    # Cap the request at the rows available; sample() raises when asked for
    # more rows than exist without replacement.
    sample = temp_df.sample(min(decade_count, temp_df.shape[0]),
                            random_state=SAMPLE_SEED)
    appended_data.append(sample)

not_hits_merged_df_sample = pd.concat(appended_data)

In [25]:
# Sanity check: (rows, columns) of the decade-matched non-hit sample.
not_hits_merged_df_sample.shape

(2627, 38)

In [26]:
# Persist the sampled non-hit frame.
# NOTE(review): to_json() already returns a JSON string, so the file ends up
# holding one JSON-encoded string and readers must decode it twice. Kept as-is
# because downstream consumers may rely on this format.
with open('../data/not_hits_merged_clean_lyrics_audio_features.json', 'w') as f:
    f.write(json.dumps(not_hits_merged_df_sample.to_json()))