In [89]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams

# Use a serif typeface for all figure text, with Times New Roman as the serif font.
rcParams['font.family'] = 'serif'
rcParams['font.serif'] = 'times new roman'

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

We convert the data, which contains one row per song, so that it instead contains one row per unique word per song, with all other attributes of the song duplicated across that song's unique words. Each unique word also carries a count of how many times it occurs in that song.

In [90]:
def row_to_df(x):
    """Expand one song row into a frame with one row per unique word.

    Parameters
    ----------
    x : pd.Series
        A single song record; must contain a 'lyrics' entry.

    Returns
    -------
    pd.DataFrame or None
        None when the 'lyrics' value is not a string. Otherwise a frame in
        which every non-lyrics attribute of the song is repeated on each row,
        followed by the helper join key 'dummy', the 'word' and its 'count'
        (number of occurrences of that word in the song's lyrics).
    """
    # Guard clause: nothing to tokenize unless the lyrics are a string.
    if not isinstance(x['lyrics'], str):
        return None

    # Whitespace-tokenize the lyrics and tally each distinct token.
    word_counts = pd.Series(x['lyrics'].split()).value_counts().reset_index()
    word_counts.columns = ['word', 'count']
    word_counts['dummy'] = 1

    # Song-level attributes as a one-row frame sharing the same constant key.
    song_attrs = x.drop('lyrics')
    song_attrs['dummy'] = 1
    song_frame = song_attrs.to_frame().T

    # Joining on the constant key repeats the song attributes for every word.
    return song_frame.merge(word_counts, how='right', on='dummy')

def tidy_df(df):
    """Convert a one-row-per-song frame into a one-row-per-(song, word) frame.

    Each row of ``df`` is expanded with ``row_to_df``; rows for which it
    returns None (lyrics that are not a string) are skipped explicitly rather
    than relying on ``pd.concat`` silently dropping None entries.

    Parameters
    ----------
    df : pd.DataFrame
        Song-level data containing a 'lyrics' column.

    Returns
    -------
    pd.DataFrame
        The per-song frames stacked row-wise. Index labels restart for every
        song, so they are not unique. When ``df`` is empty or no row has
        string lyrics, an empty frame with the expected columns is returned
        instead of ``pd.concat`` raising ValueError.
    """
    song_frames = []
    for _, row in df.iterrows():
        song_df = row_to_df(row)
        if song_df is not None:
            song_frames.append(song_df)

    if not song_frames:
        # Mirror the column layout row_to_df produces: the song attributes
        # (everything except 'lyrics'), then the join key and word columns.
        columns = [col for col in df.columns if col != 'lyrics']
        columns += ['dummy', 'word', 'count']
        return pd.DataFrame(columns=columns)

    return pd.concat(song_frames, axis=0)

Add a few more song-level attributes before transforming the dataset.

In [95]:
df = pd.read_csv('../data/billboard-spotify.csv', encoding='latin1')


def count_words(x):
    """Return the number of whitespace-separated tokens in x, or 0 if x is not a string."""
    return len(x.split()) if isinstance(x, str) else 0


# word count of song lyrics (non-string lyrics values count as 0 words)
df['num_words'] = df['lyrics'].apply(count_words)

# number of words per second
df['words_per_sec'] = df['num_words'] / (df['duration_ms'] / 1000)

# song duration in minutes
df['duration_min'] = df['duration_ms'] / 1000 / 60

# the primary artist (removes featured artists)
# The pattern is a raw string: "\s" and "\(" are invalid escape sequences in a
# normal string literal and trigger a warning on recent Python versions.
# It strips everything from the whitespace that precedes an optional "(" and
# the text "feat" through the end of the artist string.
df['artist_base'] = df['artist'].apply(lambda x: re.sub(r"\s\(*feat.*", '', x))

In [96]:
# Expand the song-level frame into one row per unique word per song.
df_tidy_words = tidy_df(df)

Keep only words that appear in at least 10 songs.

In [113]:
# Count, for every word, the number of distinct 'song' values it occurs with.
# NOTE(review): distinctness is judged by the 'song' value alone, so two rows
# sharing the same 'song' value are counted once -- confirm this is intended.
songs_per_word = df_tidy_words.groupby('word')['song'].nunique()

# Retain only the words found in at least 10 songs.
songs_per_word = songs_per_word.loc[songs_per_word.ge(10)]
words_to_keep = songs_per_word.index.tolist()

In [114]:
# Vectorized membership test. The previous per-row `x in words_to_keep`
# scanned the whole list for every row of the tidy frame.
mask = df_tidy_words['word'].isin(words_to_keep)
df_tidy_words_filtered = df_tidy_words.loc[mask, :]

In [115]:
# Write the filtered word-level table to disk; index=False leaves the
# DataFrame index out of the CSV.
df_tidy_words_filtered.to_csv('../data/tidy-words.csv', index=False)