In [1]:
# https://janakiev.com/blog/jupyter-virtual-envs/
import pandas as pd
import numpy as np
from datetime import date, timedelta
import copy
import spotipy
import pickle
import json
import string
from tqdm import tqdm
import datetime
import math
from datetime import timedelta, datetime
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
import os

# Credentials must never be committed with the notebook: read them from the
# environment instead. A KeyError here means the variables are not exported.
# NOTE(review): the id/secret that used to be hard-coded here should be
# revoked in the Spotify developer dashboard, since they were stored in
# plain text.
SPOTIPY_CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
SPOTIPY_CLIENT_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]

# Shared Spotify client used by the lookup helpers below.
sp = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET,
    )
)

In [3]:
def clean_string(_string):
    """ Strip single and double quotes so a string can be stored and retrieved.

    Parameters:
        _string: value to clean. Anything that is not a ``str`` (for
            example ``None`` or ``NaN``) is returned untouched.

    Returns:
        The input without ``'`` and ``"`` characters, or the input itself
        when it is not a string.
    """
    # Explicit type check instead of a bare ``except`` that hid every error.
    if not isinstance(_string, str):
        return _string
    return _string.replace("'", "").replace('"', "")

def clean_song_name(_string):
    """ Drop a trailing "feat."/"Feat."/"featuring" credit from a track name.

    Only a stand-alone word is treated as a credit, so titles that merely
    contain the letters (``"Defeat"``, ``"Features"``) are left untouched.
    Whitespace and an opening bracket directly before the credit are
    removed with it.

    Parameters:
        _string (str): track name.

    Returns:
        str: the track name without the featuring credit.
    """
    import re

    credit = re.search(r"\s*[\(\[]?\s*\b[Ff]eat(?:uring)?\b", _string)
    if credit is None:
        return _string
    return _string[:credit.start()]
        
def chance_the_rapper(x):
    """ Map the all-caps artist spelling onto "Chance the Rapper".

    Every other value is handed back unchanged.
    """
    return "Chance the Rapper" if x == "CHANCE THE RAPPER" else x

def load_data(paths=("MyData/endsong_0.json",
                     "MyData/endsong_1.json",
                     "MyData/endsong_2.json"),
              min_listens=20):
    """ Load, clean and enrich the streaming history.

    Parameters:
        paths: JSON files to read; they are stacked in the given order.
        min_listens (int): an artist is "relevant" when it has more than
            this many plays.

    Returns:
        tuple: ``(data, relevant_data)``. ``data`` holds every cleaned play
        ordered by ``endTime``; ``relevant_data`` holds only the plays of
        relevant artists, ordered by ``total_listens``. Both have had
        ``reset_index()`` applied, so the previous index is kept as an
        ``index`` column.
    """

    # Load dataframes. DataFrame.append was removed in pandas 2.0, so the
    # files are stacked with pd.concat (same row order, same index values).
    data = pd.concat([pd.read_json(path, convert_dates=['ts']) for path in paths])

    # Get and rename relevant col names
    data = data[['ts', 'master_metadata_track_name',
                 'master_metadata_album_artist_name',
                 'master_metadata_album_album_name']]
    data = data.rename(columns={"ts": "endTime",
                                "master_metadata_track_name": "trackName",
                                "master_metadata_album_artist_name": "artistName",
                                "master_metadata_album_album_name": "albumName"})

    # Clean and enrich data
    data = data.dropna()
    data['artistName'] = data['artistName'].apply(clean_string)
    data['trackName'] = data['trackName'].apply(clean_string)
    data['trackName'] = data['trackName'].apply(clean_song_name)
    data['artistName'] = data.artistName.apply(chance_the_rapper)

    # Two titles are excluded from the analysis by name.
    data = data[data.trackName != "Love Is Everywhere (Beware)"]
    data = data[data.trackName != "This American Life"]
    data = data.sort_values(by="endTime")

    # Running play count per artist, in chronological order.
    data['count'] = 1
    data['total_listens'] = data.groupby('artistName')['count'].cumsum()

    # Section off relevant data
    artists = data.groupby(['artistName']).count()
    relevant = artists[artists.endTime > min_listens].index
    relevant_data = data.loc[data.artistName.isin(relevant)].sort_values(by="total_listens")

    data = data.reset_index()
    relevant_data = relevant_data.reset_index()
    return data, relevant_data

In [5]:
# Full cleaned listening history plus the subset restricted to artists that
# pass load_data's play-count threshold.
data,relevant_data = load_data()

In [6]:
def get_song_artist(song_name, artist_name):
    """ Look up a Spotify artist id from a track/artist name pair.

    An artist-only search is ambiguous (Nas can come back as Lil Nas X,
    Whitney as Whitney Houston), so a track title is used as a second
    search key.

    Returns the id of the first artist credited on the first track hit,
    or None when the search returns no tracks.
    """
    query = 'track:%s artist:%s' % (song_name, artist_name)
    hits = sp.search(q=query, type='track')['tracks']['items']
    if not hits:
        return None
    return hits[0]['artists'][0]['id']

    
def get_artist_genre(artist_id):
    """ Return the list of genres Spotify reports for an artist.

    Parameters:
        artist_id: Spotify artist id.

    Returns:
        The ``genres`` value of the artist record, or None when the lookup
        fails for any reason (best-effort behaviour is kept on purpose).
    """
    try:
        return sp.artist(artist_id)['genres']
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long batch of API calls.
        return None

In [7]:
def save_artist_id(data):
    """ Accumulate and save a mapping of artist name to Spotify id.

    Spotify has an ID value for each artist that is needed for further
    searching, i.e. looking up an artist's genre. The mapping is pickled to
    ``artist_id.pickle`` because building it takes many API calls.

    Parameters:
        data: DataFrame with ``artistName`` and ``trackName`` columns.

    Returns:
        dict: artist name -> artist id for every artist that could be
        resolved. It is returned as well as saved, so the caller does not
        have to re-read the pickle.
    """
    artist_id_dic = {}
    for _, row in data.iterrows():
        artist_name = row['artistName']
        if artist_name in artist_id_dic:
            continue
        # An artist whose lookup fails is retried with their next track.
        artist_id = get_song_artist(row['trackName'], artist_name)
        if artist_id is not None:
            artist_id_dic[artist_name] = artist_id
    with open('artist_id.pickle', 'wb') as handle:
        pickle.dump(artist_id_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return artist_id_dic
        
def save_genre_data(data, artist_id_dic):
    """ Accumulate and save data about genres listened to.

    Two dictionaries are created and pickled:
        1) ``artist_genre.pickle``: artist -> list of genres (None when the
           genre lookup failed)
        2) ``genre_dic.pickle``: genre -> number of artists with that genre

    Parameters:
        data: DataFrame with an ``artistName`` column.
        artist_id_dic (dict): artist name -> Spotify artist id.

    Returns:
        tuple: ``(artist_genre_dic, genre_dic)``, the same objects that are
        written to disk.
    """
    genre_dic = {}
    artist_genre_dic = {}
    for artist_name in data.artistName.unique():

        # Some artists had no id found, so are not in
        # the artist id lookup dictionary
        if artist_name not in artist_id_dic:
            continue

        genres = get_artist_genre(artist_id_dic[artist_name])

        # A failed lookup is still recorded (as None) so that later code
        # can loop over every artist key.
        artist_genre_dic[artist_name] = genres
        if genres is None:
            continue

        for g in genres:
            genre_dic[g] = genre_dic.get(g, 0) + 1

    with open('artist_genre.pickle', 'wb') as handle:
        pickle.dump(artist_genre_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('genre_dic.pickle', 'wb') as handle:
        pickle.dump(genre_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return artist_genre_dic, genre_dic

        
def create_genre_to_artist_dic(artist_genres):
    """ Create mapping of genre to all artists classified as that genre.

    With a mapping from genre to artists it is easy to look at all the
    songs in a certain genre.

    Parameters:
        artist_genres (dict): artist -> list of genres. The value may be
            None or empty for artists without genre information; those
            artists are skipped.

    Returns:
        dict: genre -> list of artists, in the iteration order of
        ``artist_genres``.
    """
    genres_to_artists = {}
    for artist, genres in artist_genres.items():
        # save_genre_data stores None for failed lookups; iterating over
        # it would raise a TypeError.
        if not genres:
            continue
        for g in genres:
            genres_to_artists.setdefault(g, []).append(artist)
    return genres_to_artists

In [8]:
def _load_pickle(path):
    """ Read one pickle cache written by this notebook and close the file. """
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files that this notebook produced itself.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Errors that mean "the cache is missing or unusable, rebuild it".
_CACHE_MISS = (FileNotFoundError, EOFError, pickle.UnpicklingError)

try:
    artist_id_dic = _load_pickle("artist_id.pickle")
except _CACHE_MISS:
    # Rebuild the cache, then read it back so artist_id_dic is always bound.
    save_artist_id(data)
    artist_id_dic = _load_pickle("artist_id.pickle")

try:
    artist_genres = _load_pickle("artist_genre.pickle")
    genre_dic = _load_pickle("genre_dic.pickle")
except _CACHE_MISS:
    save_genre_data(data, artist_id_dic)
    artist_genres = _load_pickle("artist_genre.pickle")
    genre_dic = _load_pickle("genre_dic.pickle")

genre_to_artist = create_genre_to_artist_dic(artist_genres)

genres_df = pd.DataFrame.from_dict(genre_dic, orient = 'index', columns = ['count'])

In [9]:
def get_part_of_day_helper(hour):
    """ Bucket an hour of the day into a named part of the day.

    3-11 is "morning", 12-17 "afternoon", 18-21 "evening"; every other
    value is "night".
    """
    if 3 <= hour <= 11:
        return "morning"
    if 12 <= hour <= 17:
        return "afternoon"
    if 18 <= hour <= 21:
        return "evening"
    return "night"

def get_part_of_day(data):
    """ Add a 'part_of_day' column derived from the hour of 'endTime' (in place). """
    def label(stamp):
        return get_part_of_day_helper(stamp.hour)

    data['part_of_day'] = data['endTime'].apply(label)

def get_season_helper(month):
    """ Bucket a month number into a season name.

    4-5 is "spring", 6-8 "summer", 9-10 "fall"; every other value
    (including 11, 12, 1, 2 and 3) is "winter".
    """
    if 4 <= month <= 5:
        return "spring"
    if 6 <= month <= 8:
        return "summer"
    if 9 <= month <= 10:
        return "fall"
    return "winter"

def get_season(data):
    """ Add a 'season' column derived from the month of 'endTime' (in place). """
    def label(stamp):
        return get_season_helper(stamp.month)

    data['season'] = data['endTime'].apply(label)

def get_day_of_week(data):
    """ Add a 'weekday' column labelling each play "weekday" or "weekend" (in place).

    Days 0-3 (Monday-Thursday) are "weekday"; days 4-6 (Friday-Sunday) are
    "weekend".
    """
    # NOTE(review): Friday (4) is counted as weekend -- confirm this is intended.
    labels = dict.fromkeys(range(0, 4), "weekday")
    labels.update(dict.fromkeys(range(4, 7), "weekend"))

    def label(stamp):
        return labels[stamp.dayofweek]

    data['weekday'] = data['endTime'].apply(label)

def get_year(data):
    """ Add a 'year' column holding the year of 'endTime' as a string (in place). """
    def label(stamp):
        return str(stamp.year)

    data['year'] = data['endTime'].apply(label)


In [10]:
def nested_dic_helper(data_dic, row, cat_index, categories):
    """ Count one row inside a nested dictionary of category values.

    ``row`` holds one value per entry of ``categories``. Levels before
    ``cat_index`` must already exist in ``data_dic``; levels from
    ``cat_index`` up to the second-to-last category are created on demand
    as dictionaries, and the entry for the last category is a counter that
    is incremented. ``data_dic`` is modified in place; nothing is returned.
    """
    leaf_level = len(categories) - 1

    # Walk down the levels that are expected to exist already.
    node = data_dic
    for depth in range(cat_index):
        node = node[row[depth]]

    # Create intermediate levels as needed.
    depth = cat_index
    while depth != leaf_level:
        node = node.setdefault(row[depth], {})
        depth += 1

    # Leaf level: bump the counter.
    node[row[depth]] = node.get(row[depth], 0) + 1


def create_nested_dictionary(data, categories):
    """ Build a nested count dictionary from the rows of ``data``.

    Each level of nesting corresponds to one entry of ``categories``; the
    innermost values are row counts.
    """
    tree = {}
    for _, record in data.iterrows():
        values = [record[category] for category in categories]
        nested_dic_helper(tree, values, 0, categories)
    return tree


def final_nest_helper(data_dic, level, num_cats):
    """ Convert a nested count dictionary into name/children/value nodes.

    At the last level (``level == num_cats - 1``) each key becomes
    ``{"name": key, "value": count}``; above it each key becomes
    ``{"name": key, "children": [...]}`` built recursively.
    """
    is_leaf_level = level == num_cats - 1
    nodes = []
    for name, payload in data_dic.items():
        if is_leaf_level:
            nodes.append({"name" : name, "value" : payload})
        else:
            nodes.append({"name" : name,
                          "children" : final_nest_helper(payload, level + 1, num_cats)})
    return nodes


def create_final_nest(data_dic, categories):
    """ Wrap the converted hierarchy in a root node named "songData". """
    children = final_nest_helper(data_dic, 0, len(categories))
    return {"name" : "songData", "children" : children}


def create_category_heirarchy(data, categories):
    """ Count the rows of ``data`` per category combination and return the
    result as a "songData" name/children/value tree. """
    counts = create_nested_dictionary(data, categories)
    return create_final_nest(counts, categories)

In [12]:
def get_songs_with_genre(data, artist_genres):
    """ Expand the listening history to one row per (play, genre).

    Parameters:
        data: DataFrame with at least ``artistName`` and ``count`` columns.
            It is not modified.
        artist_genres (dict): artist -> list of genres (None/empty when
            unknown).

    Returns:
        DataFrame with a fresh 0..n-1 index containing the columns of
        ``data`` plus ``genres`` (a single genre per row) and
        ``total_listens_genre`` (running sum of ``count`` within each
        genre, in row order). Plays by artists without genres are dropped;
        the result is empty when no play has a genre.
    """
    # Look the genres up on the side so the caller's frame is left alone.
    genre_lists = data['artistName'].map(lambda artist: artist_genres.get(artist) or None)
    has_genre = genre_lists.notna().to_numpy()
    df = data.loc[has_genre].assign(genres=genre_lists[has_genre].to_numpy())

    # np.concatenate refuses an empty sequence, so handle "nothing to expand".
    if df.empty:
        return df.reset_index(drop=True).assign(
            total_listens_genre=pd.Series(dtype='int64'))

    # Repeat every play once per genre, then lay the genres out flat.
    lst_col = 'genres'
    repeats = df[lst_col].map(len).to_numpy()
    r = pd.DataFrame({
        col: np.repeat(df[col].values, repeats)
        for col in df.columns.drop(lst_col)
    })
    r[lst_col] = np.concatenate(df[lst_col].values)
    r = r[df.columns]
    r['total_listens_genre'] = r.groupby(["genres"])['count'].cumsum()
    return r

In [13]:
def enrich_data(data):
    """ Add the time-based feature columns to ``data`` and expand it by genre.

    ``data`` itself receives the part_of_day, season, weekday and year
    columns; the return value is the per-genre expansion built from the
    module-level ``artist_genres`` mapping.
    """
    for add_feature in (get_part_of_day, get_season, get_day_of_week, get_year):
        add_feature(data)
    return get_songs_with_genre(data, artist_genres)

In [14]:
def create_time_hierarchy_data(data):
    """ For each genre to be published, write a year/season/weekday hierarchy.

    The parsing is done here rather than in D3 (I am not skilled enough at
    D3 to do it there), so one JSON file per genre is written to
    ``time_hierarchy_data/`` and can be loaded dynamically when a genre is
    chosen.

    This is the same procedure as create_time_hierarchy_with_artist_data
    with a fixed category list and output folder, so it delegates to that
    function instead of keeping a second copy of the code.
    """
    create_time_hierarchy_with_artist_data(
        data,
        categories=['year', 'season', 'weekday'],
        filename='time_hierarchy_data',
    )

In [15]:
def create_time_hierarchy_with_artist_data(data, categories = ['year', 'season','artistName','trackName'],
                                          filename = 'time_hierarchy_with_artist_data'):
    """ For each genre to be published, write a category hierarchy as JSON.

    The parsing is done here rather than in D3 (I am not skilled enough at
    D3 to do it there), so one file per genre is written and can be loaded
    dynamically when a genre is chosen.

    Parameters:
        data: listening history; feature columns are added to it by
            enrich_data.
        categories (list): column names defining the nesting levels,
            outermost first. The default list is only read, never mutated.
        filename (str): output directory; it is created when missing.

    Side effects:
        Writes ``enriched_song_data.csv`` and ``<filename>/<genre>.json``
        for every published genre, where spaces are removed from the genre
        to form the file name. The root id "Genre" gets a file covering all
        rows.
    """
    import os

    enriched_data = enrich_data(data)
    enriched_data.to_csv('enriched_song_data.csv')

    # Get genres necessary for final output. The "sub_" entries carry a
    # trailing space in their id, so stripping folds them into their parent.
    genre_hi = create_genre_hierarchy(genres_df)
    final = format_genre_hierarchy(genre_hi)
    used_genres = set()
    for d in final:
        used_genres.add(d['id'].strip())

    # Without this the first open() fails when the folder does not exist.
    os.makedirs(filename, exist_ok=True)

    # Create time hierarchy for each genre
    for genre in used_genres:
        if genre != "Genre":
            df = enriched_data[enriched_data.genres == genre]
        else:
            df = enriched_data
        nest = create_category_heirarchy(df, categories)
        stripped_genre = genre.replace(" ", "").strip()
        with open(filename + ('/%s.json' % stripped_genre), 'w') as outfile:
            json.dump(nest, outfile)

In [291]:
# NOTE: this call needs create_genre_hierarchy and format_genre_hierarchy,
# which are defined in later cells of this notebook; run those cells first.
create_time_hierarchy_data(data)

In [351]:
# Default categories and output folder. Needs create_genre_hierarchy and
# format_genre_hierarchy, which are defined in later cells of this notebook.
create_time_hierarchy_with_artist_data(data)

In [20]:
# Artist -> track hierarchy per genre, written to the 'artist_hierarchy' folder.
create_time_hierarchy_with_artist_data(data, ['artistName', 'trackName'], 'artist_hierarchy')

In [17]:
def create_genre_hierarchy(genres_df):
    ''' Create a hierarchy of genres based on the names of the genres.

    The hierarchy created in this function is solely based on title. So,
    if the name of one genre, 'Rock', is contained in another genre,
    'Indie Rock' or 'Alternative Rock' or 'Classic Rock', then 'Rock'
    becomes the parent to those genres.

    Parameters:
        genres_df: DataFrame indexed by genre name with a 'count' column.
            The helper columns 'genre', 'num_words_genre' and
            'num_chars_genre' are added to it in place.

    Returns:
        dict: key -> {"id", "value", "parent", "num_child"}. Top-level
        genres have the parent "Genre". A parent's value is its own count
        plus the values of the children attached to it. Every genre also
        gets a "sub_<genre>" entry (id = genre + " ") representing itself
        as a child; that entry is removed again when a shorter genre
        matches it while it is still its parent's only child.
    '''

    # Load the DF and add some more features
    genre_hierarchy_df = genres_df
    genre_hierarchy_df['genre'] = genre_hierarchy_df.index
    genre_hierarchy_df['num_words_genre'] = genre_hierarchy_df['genre'].apply(lambda x:
                                                    len(x.split()))
    genre_hierarchy_df['num_chars_genre'] = genre_hierarchy_df['genre'].apply(lambda x:
                                                    len(x))
    longest_genre = max(genre_hierarchy_df['num_words_genre'])

    # Initialize dictionary
    nested_genres = {}

    # Want to loop backwards to start with bottom layer
    # and then add genres on top of that. For example, start
    # with Modern Alternative Rock then make that a child of
    # Alternative Rock then make that a child of Rock
    for genre_length in range(longest_genre, 0, -1):

        df = genre_hierarchy_df[genre_hierarchy_df.num_words_genre == genre_length]
        df = df.sort_values(by = 'num_chars_genre')

        # Loop through each genre
        for genre, count in zip(df['genre'], df['count']):

            nested = False

            # Look at the already existing genres, so these will be
            # the longer genres that have already been added
            for k in list(nested_genres.keys()):

                # If the current genre is a subset of one of the
                # already existing. For example, "rock" in
                # "alternative rock". If this happens, we want
                # to create a child parent relationship.
                # NOTE(review): this is a plain substring test, so "rap"
                # also matches "trap" -- confirm that is acceptable.
                if genre in k:

                    # If k-value genre, aka the potential child,
                    # is a sub-genre and the only child of its
                    # parent, then delete. Otherwise, every genre
                    # would have a child of itself, which looks messy.
                    # Only the synthetic "sub_" keys qualify: testing
                    # '"sub" in k' also caught real genres such as
                    # "suburban rock", whose parent "Genre" is not a key
                    # and raised a KeyError.
                    if k.startswith("sub_"):
                        parent = nested_genres[k]['parent']
                        if nested_genres[parent]['num_child'] == 1:
                            del nested_genres[k]
                            nested_genres[parent]['num_child'] -= 1
                            continue


                    # If the potential child already has a parent,
                    # then move on. This is so "Modern Alternative
                    # Rock" stays with the parent "Alternative Rock",
                    # rather than getting overridden by "Rock".
                    if nested_genres[k]['parent'] != "Genre":
                        continue

                    # If the genre that is about to become a parent
                    # does not already exist, then initialize.
                    if not nested:
                        nested_genres[genre] = {"id" : genre, "value" : int(count), "parent" : "Genre",
                                                    "num_child" : 0}

                    # Set the parent appropriately and update
                    # the parents total value and children.
                    nested_genres[k]['parent'] = genre
                    nested_genres[genre]['num_child'] += 1
                    nested_genres[genre]['value'] += nested_genres[k]['value']
                    nested = True

            # If the genre did not become a parent to any genre,
            # then initialize the genre with the generic parent of
            # 'Genre'.
            if not nested:
                nested_genres[genre] = {"id" : genre, "value" : int(count), "parent" : "Genre", "num_child" : 0}

            # Add the genre itself as a child of itself. Alter the
            # key and id so no duplicates.
            nested_genres["sub_" + genre] = {"id" : genre + " ", "value" : int(count),
                                                     "parent" : genre, "num_child" : 0}
            nested_genres[genre]['num_child'] += 1

    return nested_genres


In [21]:
def format_genre_hierarchy(genre_hi, delete_prop = 0.01):
    """ Format genre hierarchy for D3 use.
    
    In order to use the hierarchy - created in
    "create_genre_hierarchy" - in D3, the data must 
    be molded into a specific format. Also, this function
    prunes out irrelevant genres, as defined by 
    making up less than `delete_prop` (1% by default) of
    its parent genre.

    Parameters:
        genre_hi (dict): key -> {"id", "value", "parent", "num_child"}.
            NOTE: this dictionary is consumed. Every key is deleted
            from it before the function returns, so callers that
            still need the hierarchy must build a fresh one.
        delete_prop (float): minimum share of the parent's value an
            entry needs in order to be kept.

    Returns:
        list: starts with the root {"id": "Genre"}, followed by the
        kept entries, each appended only after its parent. Entries
        with children have their "value" set to 0.
    """

    final = []
    
    # Make copy so original values are stored. The pruning test below
    # reads from this copy because values in genre_hi get zeroed.
    value_dic = copy.deepcopy(genre_hi)

    parents_added = set()
    parents_deleted = set()
    
    # Adds the root, needed for D3.
    final.append({"id" : "Genre"})
    parents_added.add("Genre")
    
    # Computes the total of everything, as the
    # genres with parent of Genre have the counts
    # of all its children.
    genre_count = 0
    for k in genre_hi.keys():
        if genre_hi[k]['parent'] == "Genre":
            genre_count += genre_hi[k]['value']
    
    # Keys will be deleted once added to the final
    # list. Several passes may be needed: an entry whose parent
    # has been neither added nor deleted yet is left for a later pass.
    while len(genre_hi.keys()) > 0:
        for k in list(genre_hi.keys()):
            
            parent = genre_hi[k]['parent']
            
            # Pruning: If the genre does not make up at least delete_prop
            # of its parent's total, then prune. Top-level genres are
            # measured against the overall total.
            total = genre_count if parent == "Genre" else value_dic[parent]['value']
            if float(value_dic[k]['value']) / total < delete_prop:
                parents_deleted.add(k)
                del genre_hi[k]
                continue
            
            # Only children should have values, otherwise D3 double
            # counts the children and the parent values.
            if genre_hi[k]['num_child'] > 0:
                genre_hi[k]['value'] = 0

            # If parent is deleted, then child deleted too.
            if parent in parents_deleted:
                parents_deleted.add(k)
                del genre_hi[k]
                continue
            
            # If parent is added, then child added too.
            if parent in parents_added:
                final.append(genre_hi[k])
                parents_added.add(k)
                del genre_hi[k]

    return final


In [116]:
# Build the name-based hierarchy, prune/format it for D3 and save it as CSV.
# format_genre_hierarchy deletes every key of genre_hi while it runs, so
# genre_hi is an empty dict after this cell.
genre_hi = create_genre_hierarchy(genres_df)
final_df = pd.DataFrame(format_genre_hierarchy(genre_hi))
final_df.to_csv("genre_hierarchy.csv")

In [23]:
def top_artists_of_genre(data, artist_genres):
    """ Save all-time listen totals per artist, with their genres.

    Totals are all-time; this may need to change if an adjustable timeline
    is added. The output is keyed by artistName and genre so it can be
    queried from D3.

    Writes "total_listens_artist.csv" (one row per artist/genre pair,
    highest totals first) and "total_listens_artist_all_genres.csv" (the
    first row of each artist only). Artists without genres are left out.
    """

    def genres_for(artist):
        known = artist in artist_genres.keys() and artist_genres[artist]
        return artist_genres[artist] if known else "No Genre"

    # The running total peaks at the artist's final play, so the maximum
    # is the all-time number of listens.
    totals = data.groupby(['artistName'])['total_listens'].max()
    per_artist = pd.DataFrame(totals).reset_index()

    # Attach the genre list and drop artists that have none.
    per_artist['genres'] = per_artist['artistName'].apply(genres_for)
    per_artist = per_artist[per_artist.genres != "No Genre"]

    # One row per (artist, genre): repeat the scalar columns once per
    # genre and flatten the genre lists next to them.
    genre_col = 'genres'
    scalar_cols = per_artist.columns.drop(genre_col)
    repeats = per_artist[genre_col].str.len()
    expanded = pd.DataFrame({col: np.repeat(per_artist[col].values, repeats)
                             for col in scalar_cols})
    expanded = expanded.assign(**{genre_col: np.concatenate(per_artist[genre_col].values)})
    expanded = expanded[per_artist.columns]

    # Sort and save
    ranked = expanded.sort_values(by = 'total_listens', ascending = False)
    ranked.to_csv("total_listens_artist.csv")
    ranked.drop_duplicates(['artistName']).to_csv("total_listens_artist_all_genres.csv")

def top_songs_of_genre(data, artist_genres):
    """ Save all-time listen totals per track, with the artist's genres.

    Adds a running 'total_listens_track' column to ``data`` (in place),
    then writes "total_listens_track.csv" (one row per track/genre pair,
    highest totals first) and "total_listens_track_all_genres.csv" (the
    first row of each track/artist pair only). Tracks by artists without
    genres are left out.
    """

    def genres_for(artist):
        known = artist in artist_genres.keys() and artist_genres[artist]
        return artist_genres[artist] if known else "No Genre"

    track_key = ['trackName','artistName']
    data['total_listens_track'] = data.groupby(track_key)['count'].transform(pd.Series.cumsum)

    # The running total peaks at the final play of each track.
    totals = data.groupby(track_key)['total_listens_track'].max()
    per_track = pd.DataFrame(totals).reset_index()

    per_track['genres'] = per_track['artistName'].apply(genres_for)
    per_track = per_track[per_track.genres != "No Genre"]

    # One row per (track, genre).
    genre_col = 'genres'
    scalar_cols = per_track.columns.drop(genre_col)
    repeats = per_track[genre_col].str.len()
    expanded = pd.DataFrame({col: np.repeat(per_track[col].values, repeats)
                             for col in scalar_cols})
    expanded = expanded.assign(**{genre_col: np.concatenate(per_track[genre_col].values)})
    expanded = expanded[per_track.columns]

    ranked = expanded.sort_values(by = 'total_listens_track', ascending = False)
    ranked.to_csv("total_listens_track.csv")
    ranked.drop_duplicates(track_key).to_csv("total_listens_track_all_genres.csv")
    

In [24]:
# Write the per-artist and per-track listen-total CSV files.
top_artists_of_genre(data, artist_genres)
top_songs_of_genre(data, artist_genres)

In [284]:
def fill_time_period(df, dic, start, end):
    """ Record each artist's highest running total inside a time window.

    Parameters:
        df: DataFrame with 'endTime', 'artistName' and 'total_listens'.
        dic (dict): output mapping, updated in place.
        start, end: window bounds, both inclusive.

    Side effects:
        ``dic[end]`` is set to a DataFrame with the columns 'artistName'
        and 'total_listens', holding, for every artist played inside the
        window, the row(s) where 'total_listens' reaches its maximum within
        the window. Because 'total_listens' is a running count, this is the
        artist's total from the beginning of the history up to ``end``.
    """
    in_window = (df['endTime'] >= start) & (df['endTime'] <= end)
    window = df.loc[in_window]

    # "max" by name: passing the builtin max to transform is a deprecated
    # alias for the same group-wise maximum.
    window_peak = window.groupby(['artistName'])['total_listens'].transform("max")
    at_peak = window_peak == window['total_listens']
    dic[end] = window.loc[at_peak, ['artistName', 'total_listens']]
    

def get_time_periods_dfs(data, _weeks = 2):
    """ Fill a dictionary mapping period end dates to time period dataframes.

    A time period dataframe is created by fill_time_period and holds the
    total listens for each artist up to the end of that time period.

    Parameters:
        data: DataFrame with 'endTime', 'artistName' and 'total_listens'.
            Rows may be in any order.
        _weeks (int): length of one period in weeks.

    Returns:
        dict: period end -> DataFrame. Empty when ``data`` has no rows.
        When the history is shorter than one period, a single entry
        covering all of it is returned. Otherwise periods are laid out
        back to back from the first play; a trailing partial period is not
        included.
    """
    time_period_data = {}
    if len(data) == 0:
        return time_period_data

    # Callers pass frames sorted by total_listens, so the first and last
    # rows are not the earliest and latest plays; use the real extremes.
    first_date = data['endTime'].min()
    final_end_date = data['endTime'].max()

    # If there is not enough data, do not loop
    total_weeks = (final_end_date - first_date).days / 7
    if total_weeks < _weeks:
        fill_time_period(data, time_period_data, first_date, final_end_date)
        return time_period_data

    # Start input weeks into the future and look back
    # when gathering data
    step = timedelta(weeks = _weeks)
    period_end = first_date + step
    while period_end <= final_end_date:
        fill_time_period(data, time_period_data, period_end - step, period_end)
        period_end += step

    return time_period_data

def artist_level_data(data, time_period_data):
    """ Create timeseries data for each artist.

    For every artist in ``data``, record the total listens up to each
    period end in ``time_period_data`` (periods are every few weeks,
    depending on the _weeks parameter of get_time_periods_dfs).

    Parameters:
        data: DataFrame with an 'artistName' column.
        time_period_data (dict): period end -> DataFrame with
            'artistName' and 'total_listens' columns.

    Returns:
        DataFrame indexed by (artist, period end) with the columns 'sum'
        (total listens so far; carried over from the previous period when
        the artist has no row in a period, 0 before their first row) and
        'week' (1-based position of the period).
    """
    # Build one artist -> total lookup per period up front instead of
    # filtering every period frame again for every artist. When an artist
    # has several rows in a period the first one is used.
    period_lookups = []
    for period_end, totals in time_period_data.items():
        first_rows = totals.drop_duplicates('artistName')
        lookup = dict(zip(first_rows['artistName'], first_rows['total_listens']))
        period_lookups.append((period_end, lookup))

    records = {}
    for artist in data.artistName.unique():
        running_total = 0
        for week, (period_end, lookup) in enumerate(period_lookups, start=1):
            # Keep the previous total when the artist was not played.
            running_total = lookup.get(artist, running_total)
            records[(artist, period_end)] = {"sum": running_total, "week": week}

    # Reformat data
    return pd.DataFrame.from_dict(records, orient = 'index')

In [26]:
def get_songs_with_genre_simple(data, artist_genres):
    """ Add genre to every song in listening history.

    Each play is repeated once per genre of its artist and a running
    'total_listens_genre' count is added. Plays by artists without genres
    are dropped.

    This used to be a line-for-line copy of get_songs_with_genre; it now
    calls that function so the two cannot drift apart.
    """
    return get_songs_with_genre(data, artist_genres)



In [30]:
# Listening history expanded to one row per (play, genre).
songs_with_genre = get_songs_with_genre_simple(data, artist_genres)

In [358]:
def create_genre_artist_timeline(data, artist_genres):
    """ For each published genre, write a timeline of its top 10 artists.

    For every genre kept by format_genre_hierarchy (plus the root "Genre",
    which covers all genres) the ten artists with the most rows are
    selected and their cumulative listens per time period are written to
    ``artist_over_time/<genre>.csv``, one row per artist and one column
    per period end date.

    Parameters:
        data: listening history.
        artist_genres (dict): artist -> list of genres.
    """
    import os

    songs_with_genre = get_songs_with_genre_simple(data, artist_genres)

    # Get genres necessary for final output
    genre_hi = create_genre_hierarchy(genres_df)
    final = format_genre_hierarchy(genre_hi)
    used_genres = set()
    for d in final:
        used_genres.add(d['id'].strip())

    # The first to_csv would fail if the folder were missing.
    os.makedirs("artist_over_time", exist_ok=True)

    # Create the timeline for each genre
    for genre in tqdm(used_genres):

        if genre == "Genre":
            df = songs_with_genre
        else:
            df = songs_with_genre[songs_with_genre.genres == genre]

        # Only look at the top 10 artists
        artists = df.groupby(['artistName']).count()
        relevant = artists.sort_values(by = 'count', ascending = False).iloc[0:10].index
        # Chronological order: get_time_periods_dfs reads the first and
        # last rows as the start and end of the history. The pivot below
        # sorts artists and dates, so row order does not affect the CSV.
        relevant_data = df.loc[df.artistName.isin(relevant)].sort_values(by = "endTime")

        # Fill a timeline dataset
        artist_over_time = artist_level_data(relevant_data, get_time_periods_dfs(relevant_data))

        # Reorganize and take select columns
        df = artist_over_time
        df = df.reset_index()
        df = df.rename(columns = {"level_0" : "artist", "level_1" : "endTime"})
        df['date'] = df['endTime'].apply(lambda x : str(x.date()))
        df = df[['artist','sum','date']]

        # Mutate for D3 acceptable format
        newf = df.pivot(index='artist', columns='date')
        newf.columns = newf.columns.droplevel(0)
        newf.to_csv("artist_over_time/%s.csv" % genre)
        

In [360]:
# Write one top-artist timeline CSV per published genre into artist_over_time/.
create_genre_artist_timeline(data, artist_genres)

In [286]:
def create_bar_race_data(data, artist_genres):
    """ For each published genre, write cumulative listens per artist and period.

    For every genre kept by format_genre_hierarchy (plus the root "Genre",
    which covers all genres) a long-format table with the columns artist,
    endTime, sum, week and date is written to
    ``bar_race_data/<genre>.csv``.

    Parameters:
        data: listening history.
        artist_genres (dict): artist -> list of genres.
    """
    import os

    songs_with_genre = get_songs_with_genre(data, artist_genres)

    # Get genres necessary for final output
    genre_hi = create_genre_hierarchy(genres_df)
    final = format_genre_hierarchy(genre_hi)
    used_genres = set()
    for d in final:
        used_genres.add(d['id'].strip())

    # The first to_csv would fail if the folder were missing.
    os.makedirs("bar_race_data", exist_ok=True)

    # Create the timeline for each genre
    for genre in tqdm(used_genres):

        if genre == "Genre":
            df = songs_with_genre
        else:
            df = songs_with_genre[songs_with_genre.genres == genre]

        # Fill a timeline dataset
        artist_over_time = artist_level_data(df, get_time_periods_dfs(df))

        # Reorganize and take select columns
        df = artist_over_time
        df = df.reset_index()
        df = df.rename(columns = {"level_0" : "artist", "level_1" : "endTime"})
        df['date'] = df['endTime'].apply(lambda x : str(x.date()))
        df.to_csv("bar_race_data/%s.csv" % genre.strip())


In [287]:
# Write one bar-race CSV per published genre into bar_race_data/.
# The progress bar recorded below this cell shows a run of 13:44 for 197 genres.
create_bar_race_data(data, artist_genres)

100%|██████████| 197/197 [13:44<00:00,  4.19s/it]  


In [414]:
def create_dependency_graph(df):
    """ Build a genre co-occurrence graph for the published top-level genres.

    A 'genres' column is added to ``df`` (in place) from the module-level
    ``artist_genres`` mapping. Then, looking at each artist once, every
    published genre of the artist whose parent is "Genre" gets a node
    named "genre.Genre.<genre>" whose "imports" list is extended with the
    node names of all such genres of that artist (its own name included).

    Returns a list of {"name": ..., "imports": [...]} dictionaries in
    first-seen order.
    """

    def genres_for(artist):
        known = artist in artist_genres.keys() and artist_genres[artist]
        return artist_genres[artist] if known else "No Genre"

    df['genres'] = df['artistName'].apply(genres_for)
    df = df[df.genres != "No Genre"]
    df = df.drop_duplicates('artistName')

    # format_genre_hierarchy empties the hierarchy it is given, so one copy
    # is built for pruning and a second one for the parent look-ups below.
    published = format_genre_hierarchy(create_genre_hierarchy(genres_df), 0.01)
    used_genres = {entry['id'].strip() for entry in published}
    genre_hi = create_genre_hierarchy(genres_df)

    nodes = {}
    for _, row in df.iterrows():

        # Published genres of this artist that sit directly under the root.
        top_level = [g for g in row['genres']
                     if g in used_genres and genre_hi[g]['parent'] == 'Genre']
        node_names = ["genre." + genre_hi[g]['parent'] + "." + g for g in top_level]

        for g, node_name in zip(top_level, node_names):
            node = nodes.setdefault(g, {"name" : node_name, "imports" : []})
            node["imports"].extend(node_names)

    return list(nodes.values())
    
        

In [415]:
# Build the genre co-occurrence graph and save it as JSON.
dependency_graph = create_dependency_graph(data)
with open('dependency_graph.json', 'w') as outfile:
    json.dump(dependency_graph, outfile)