# Explore Music Similarity & Popularity

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
from networkx.algorithms import community

## Data Imports

In [2]:
# Load every CSV table used by this notebook. The paths are relative, so the
# notebook has to be run from a directory where '../data' resolves.

# Read below through the columns track_id, name, artist_id,
# lastfm_listeners and lastfm_playcount.
df_track = pd.read_csv('../data/track.csv')
# Read below through the columns album_id and name.
df_album = pd.read_csv('../data/album.csv')
# Read below through the columns artist_id, name, all_albums and all_tracks.
df_artist = pd.read_csv('../data/artist.csv')
df_rating = pd.read_csv('../data/rating.csv')
# Read below through track_id_1/2, track_name_1/2, artist_name_1/2 and sim_degree.
df_similar = pd.read_csv('../data/track_similarity.csv')
df_weekly_rating = pd.read_csv('../data/weekly_rating.csv')

## Useful Functions

In [3]:
# ======================= Convert Names to IDs =============================

def convert_artist_name_to_id(df_artist, artist_name):
    """Return the ``artist_id`` of the first row whose ``name`` equals ``artist_name``.

    The lookup runs on the ``df_artist`` frame passed in by the caller.
    Raises ``IndexError`` when no row matches.
    """
    name_matches = df_artist['name'] == artist_name
    matching_ids = df_artist.loc[name_matches, 'artist_id']
    return matching_ids.values[0]

def convert_album_name_to_id(album_name):
    """Return the ``album_id`` of the first ``df_album`` row named ``album_name``.

    Reads the module-level ``df_album``. Raises ``IndexError`` when no row matches.
    """
    name_matches = df_album['name'] == album_name
    matching_ids = df_album.loc[name_matches, 'album_id']
    return matching_ids.values[0]

def convert_track_name_to_id(track_name):
    """Return the ``track_id`` of the first ``df_track`` row named ``track_name``.

    Reads the module-level ``df_track``. Raises ``IndexError`` when no row matches.
    """
    name_matches = df_track['name'] == track_name
    matching_ids = df_track.loc[name_matches, 'track_id']
    return matching_ids.values[0]

# ======================= Convert Ids to Names =============================

def convert_track_id_to_name(track_id):
    """Return the ``name`` of the first ``df_track`` row whose ``track_id`` matches.

    Reads the module-level ``df_track``. Raises ``IndexError`` when no row matches.
    """
    id_matches = df_track['track_id'] == track_id
    matching_names = df_track.loc[id_matches, 'name']
    return matching_names.values[0]

def convert_album_id_to_name(album_id):
    """Return the ``name`` of the first ``df_album`` row whose ``album_id`` matches.

    Reads the module-level ``df_album``. Raises ``IndexError`` when no row matches.
    """
    id_matches = df_album['album_id'] == album_id
    matching_names = df_album.loc[id_matches, 'name']
    return matching_names.values[0]

def convert_artist_id_to_name(artist_id):
    """Return the ``name`` of the first ``df_artist`` row whose ``artist_id`` matches.

    Reads the module-level ``df_artist``. Raises ``IndexError`` when no row matches.
    """
    id_matches = df_artist['artist_id'] == artist_id
    matching_names = df_artist.loc[id_matches, 'name']
    return matching_names.values[0]

# ======================= Others =============================

def get_all_artist_albums(artist_name, byName=False):
    """Return the albums listed for ``artist_name`` in ``df_artist``.

    The ``all_albums`` cell of the first row named ``artist_name`` is parsed
    by dropping its first and last character and splitting the rest on
    commas, so it is expected to look like ``"[1, 2, 3]"``.

    Args:
        artist_name: Value to match against the ``name`` column.
        byName: When true, each album id is translated with
            ``convert_album_id_to_name``; otherwise integer ids are returned.

    Returns:
        A list of album ids (``int``) or album names. An empty listing such
        as ``"[]"`` gives an empty list.

    Raises:
        IndexError: If no artist row has that name.
    """
    raw_albums = df_artist.loc[df_artist['name'] == artist_name, 'all_albums'].values[0]

    # Skip blank tokens: "[]" splits into [''], and int('') would raise.
    tokens = (token.strip() for token in raw_albums[1:-1].split(','))
    all_albums = [int(token) for token in tokens if token]

    if byName:
        all_albums = [convert_album_id_to_name(album) for album in all_albums]

    return all_albums

def get_all_artist_tracks(artist_name, byName=False):
    """Return the tracks listed for ``artist_name`` in ``df_artist``.

    The ``all_tracks`` cell of the first row named ``artist_name`` is parsed
    by dropping its first and last character and splitting the rest on
    commas, so it is expected to look like ``"[1, 2, 3]"``.

    Args:
        artist_name: Value to match against the ``name`` column.
        byName: When true, each track id is translated with
            ``convert_track_id_to_name``; otherwise integer ids are returned.
            NOTE: this flag used to be ignored and names were always
            returned; callers that want names must now pass ``byName=True``,
            matching ``get_all_artist_albums``.

    Returns:
        A list of track ids (``int``) or track names. An empty listing such
        as ``"[]"`` gives an empty list.

    Raises:
        IndexError: If no artist row has that name.
    """
    raw_tracks = df_artist.loc[df_artist['name'] == artist_name, 'all_tracks'].values[0]

    # Skip blank tokens: "[]" splits into [''], and int('') would raise.
    tokens = (token.strip() for token in raw_tracks[1:-1].split(','))
    all_tracks = [int(token) for token in tokens if token]

    if byName:
        all_tracks = [convert_track_id_to_name(track) for track in all_tracks]

    return all_tracks

## Track Similarity

In [None]:
# print all track ids that do not have a similarity value
track_count = len(df_track)
track_ids = df_track['track_id'].to_list()

# Gather every id that appears on either side of a similarity pair once, so
# each track needs a single set lookup instead of a full scan of df_similar.
ids_with_similarity = set(df_similar['track_id_1'].to_list()) | set(df_similar['track_id_2'].to_list())
temp_list = [track_id for track_id in track_ids if track_id not in ids_with_similarity]

# An empty track table would otherwise divide by zero.
missing_percentage = round(len(temp_list) / track_count * 100, 2) if track_count else 0.0

print('Track ids without similarity value:', f"{temp_list[:10]} ..." if len(temp_list) > 10 else temp_list)
print('Number of track ids without similarity value:', len(temp_list))
print('Percentage of track ids without similarity value:', missing_percentage, '%')

In [None]:
# Preview the first ten rows of the track similarity table.
df_similar.head(10)

In [None]:
# Undirected graph with one node per track name and one weighted edge per
# row of the similarity table.
G = nx.Graph()
plt.figure(figsize=(30, 30), dpi=150)

# '$' is replaced in the names used as node labels, presumably so that
# matplotlib does not read it as a math-text delimiter (TODO confirm).
for _, sim_row in df_similar.iterrows():
    first_track, second_track = (
        sim_row[column].replace('$', '_')
        for column in ('track_name_1', 'track_name_2')
    )
    G.add_edge(first_track, second_track, weight=sim_row['sim_degree'])

# Map every node to the index of the community it was assigned to; that
# index is what colours the node.
communities = community.greedy_modularity_communities(G)
node_colors = {
    member: community_index
    for community_index, members in enumerate(communities)
    for member in members
}

pos = nx.spring_layout(G)

community_per_node = [node_colors[name] for name in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_size=300, node_color=community_per_node)

# Edge thickness is the similarity degree scaled by 5.
weighted_edges = G.edges(data=True)
edge_widths = [attributes['weight'] * 5 for _, _, attributes in weighted_edges]
nx.draw_networkx_edges(G, pos, edgelist=weighted_edges, width=edge_widths)

nx.draw_networkx_labels(G, pos, font_size=5, font_color="black")
plt.title("Song Similarity Graph")
plt.show()

In [None]:
# Calculate the combined listener and play counts of a track
def get_all_listeners(track_id):
    """Return ``lastfm_listeners + lastfm_playcount`` for ``track_id``.

    The track is looked up in the module-level ``df_track`` and the first
    matching row is used. A missing count contributes 0 to the sum.

    Args:
        track_id: Value to match against the ``track_id`` column.

    Returns:
        The sum of the two counts, with missing values treated as 0.

    Raises:
        IndexError: If no row has that ``track_id``.
    """
    track_data = df_track[df_track['track_id'] == track_id].reset_index(drop=True)

    listeners = track_data['lastfm_listeners'].values[0]
    playcount = track_data['lastfm_playcount'].values[0]

    # Missing cells are float NaN, which never equals the string 'nan', so a
    # string comparison lets NaN through and turns the whole sum into NaN.
    # Each count is also checked on its own column.
    if pd.isna(listeners):
        listeners = 0
    if pd.isna(playcount):
        playcount = 0

    return listeners + playcount

# Compute the combined listener figure of every track with
# get_all_listeners and store it as the 'total_listeners' column.
listener_totals = df_track['track_id'].apply(get_all_listeners)
df_track['total_listeners'] = listener_totals
df_track.head()

In [None]:
# Sum the per-track totals for every artist and attach the artist's name.
artist_total_listeners = df_track.groupby('artist_id').agg({'total_listeners': 'sum'}).reset_index()
artist_total_listeners['artist_name'] = artist_total_listeners['artist_id'].apply(convert_artist_id_to_name)

# filter out every artist with 0 listeners
# (prints the shape of the zero-listener rows, then the shape of what is kept)
print(artist_total_listeners[artist_total_listeners['total_listeners'] == 0].shape)
artist_total_listeners = artist_total_listeners[artist_total_listeners['total_listeners'] > 0]
print(artist_total_listeners.shape)

# sort the artists by the total number of listeners; sorting once after the
# filter is enough, because filtering does not disturb the order
artist_total_listeners = artist_total_listeners.sort_values(by='total_listeners', ascending=False)

artist_total_listeners.head()

## Intra-Artist Similarity

In [9]:
# TODO FIND ARTISTS SIMILAR WITH THEMSELVES

## Inter-Artist Similarity

In [None]:
# Average sim_degree over all track pairs that share the same
# (artist_name_1, artist_name_2) combination.
artist_pair_columns = ['artist_name_1', 'artist_name_2']
df_similar_artists = (
    df_similar
    .groupby(artist_pair_columns)
    .agg({'sim_degree': 'mean'})
    .reset_index()
)
print(df_similar_artists.shape)
df_similar_artists.head()

In [None]:
# remove all artists that have " and " or " & " or " featuring " in their name
# The same case-insensitive pattern drives both the count and the filter, and
# the filter looks at both name columns, so the printed number matches the
# artists that actually disappear from the table.
collab_pattern = ' and | & | featuring '

artist_names = set(df_similar_artists['artist_name_1'].to_list() + df_similar_artists['artist_name_2'].to_list())

distinct_names = pd.Series(list(artist_names), dtype='object')
count = int(distinct_names.str.contains(collab_pattern, case=False).sum())

print(f'Dropped {count} artists with " and ", " & " or " featuring " in their name')

is_collab_1 = df_similar_artists['artist_name_1'].str.contains(collab_pattern, case=False)
is_collab_2 = df_similar_artists['artist_name_2'].str.contains(collab_pattern, case=False)
df_similar_artists = df_similar_artists[~(is_collab_1 | is_collab_2)]

In [None]:
# remove artists that are not in artist_total_listeners (i.e. they have no listeners)
artist_names = set(artist_total_listeners['artist_name'].to_list())

# A pair is kept only when BOTH artists are known; a row is dropped as soon
# as either name is missing. A boolean mask replaces the row-by-row
# drop(inplace=True), which rebuilt the frame for every dropped row and
# mutated a filtered frame in place.
both_known = (
    df_similar_artists['artist_name_1'].isin(artist_names)
    & df_similar_artists['artist_name_2'].isin(artist_names)
)
df_similar_artists = df_similar_artists[both_known]

print(df_similar_artists.shape)
df_similar_artists.head()

## Artist Popularity By Listener Count

In [None]:
# Look up one track by name and report its combined listener figure.
track_name = 'Blank Space'
track_id = convert_track_name_to_id(track_name)
listeners = get_all_listeners(track_id)
summary_line = f'Total listeners of "{track_name}": {listeners}'
print(summary_line)


In [None]:
# Undirected graph with one node per artist and one edge per artist pair,
# weighted by the mean similarity degree of the pair.
G = nx.Graph()

for _, row in df_similar_artists.iterrows():
    G.add_edge(row['artist_name_1'], row['artist_name_2'], weight=row['sim_degree'])

plt.figure(figsize=(40, 40), dpi=200)

# artist name -> total listeners, used for colouring and draw order.
listeners_data = dict(zip(artist_total_listeners['artist_name'], artist_total_listeners['total_listeners']))

max_listeners = max(listeners_data.values())
min_listeners = min(listeners_data.values())
norm = mcolors.Normalize(vmin=min_listeners, vmax=max_listeners)
# matplotlib.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is the supported lookup by name.
cmap = plt.get_cmap("coolwarm")

# Every node of G must be a key of listeners_data, otherwise this raises KeyError.
node_colors = {
    artist: cmap(norm(listeners_data[artist]))
    for artist in G.nodes()
}

# Set layout. It is computed once: an earlier call with k=0.7 was overwritten
# before being used, and the fixed seed keeps this result reproducible.
pos = nx.spring_layout(G, k=0.5, seed=42)

# Draw nodes in ascending listener order, one call per node, so the artists
# with the most listeners are drawn last.
sorted_nodes = sorted(G.nodes(), key=lambda node: listeners_data[node])

for node in sorted_nodes:
    nx.draw_networkx_nodes(G, pos, nodelist=[node], node_size=[5000],
        node_color=[node_colors[node]]
    )

nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=0.1)
nx.draw_networkx_labels(G, pos, font_size=4, font_color="black")
plt.title("Artist Similarity Graph with Listener-Based Coloring")
plt.show()

In [None]:
# Show the first 20 rows of the per-artist listener table.
artist_total_listeners.head(20)

## Find Unpopular Artists Similar to Very Popular Ones