# Data Explore

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import seaborn as sns

## Cleaned Data Import

In [None]:
# Load every cleaned CSV table; the target names line up positionally with
# the paths listed below.
(
    df_track,
    df_album,
    df_artist,
    df_rating,
    df_similar,
    df_weekly_rating,
) = map(
    pd.read_csv,
    [
        'data/track.csv',
        'data/album.csv',
        'data/artist.csv',
        'data/rating.csv',
        'data/track_similarity.csv',
        'data/weekly_rating.csv',
    ],
)

## Defining Useful functions

In [None]:
# ======================= Convert Names to IDs =============================

def convert_artist_name_to_id(df_artist, artist_name):
    """Return the ``artist_id`` of the first row of ``df_artist`` named ``artist_name``.

    Raises:
        IndexError: If no row has that name.
    """
    name_matches = df_artist['name'] == artist_name
    matching_ids = df_artist.loc[name_matches, 'artist_id']
    return matching_ids.values[0]

def convert_album_name_to_id(album_name):
    """Return the ``album_id`` of the first ``df_album`` row named ``album_name``.

    Raises:
        IndexError: If no album has that name.
    """
    is_named = df_album['name'] == album_name
    album_ids = df_album.loc[is_named, 'album_id']
    return album_ids.values[0]

def convert_track_name_to_id(track_name):
    """Return the ``track_id`` of the first ``df_track`` row named ``track_name``.

    Raises:
        IndexError: If no track has that name.
    """
    is_named = df_track['name'] == track_name
    track_ids_found = df_track.loc[is_named, 'track_id']
    return track_ids_found.values[0]

# ======================= Convert Ids to Names =============================

def convert_track_id_to_name(track_id):
    """Return the ``name`` of the first ``df_track`` row whose ``track_id`` matches.

    Raises:
        IndexError: If the id is not present in ``df_track``.
    """
    has_id = df_track['track_id'] == track_id
    names_found = df_track.loc[has_id, 'name']
    return names_found.values[0]

def convert_album_id_to_name(album_id):
    """Return the ``name`` of the first ``df_album`` row whose ``album_id`` matches.

    Raises:
        IndexError: If the id is not present in ``df_album``.
    """
    has_id = df_album['album_id'] == album_id
    names_found = df_album.loc[has_id, 'name']
    return names_found.values[0]

def convert_artist_id_to_name(artist_id):
    """Return the ``name`` of the first ``df_artist`` row whose ``artist_id`` matches.

    Raises:
        IndexError: If the id is not present in ``df_artist``.
    """
    has_id = df_artist['artist_id'] == artist_id
    names_found = df_artist.loc[has_id, 'name']
    return names_found.values[0]

# ======================= Others =============================

def get_all_artist_albums(artist_name, byName=False):
    """Return the albums of the first ``df_artist`` row named ``artist_name``.

    The ``all_albums`` cell is parsed as a bracketed, comma-separated list of
    integer ids (e.g. ``"[3, 17, 42]"``).

    Args:
        artist_name: Exact artist name to look up in ``df_artist``.
        byName: When true, return album names instead of album ids.

    Returns:
        A list of album ids, or of album names when ``byName`` is true. The
        list is empty when the stored list is empty (``"[]"``).

    Raises:
        IndexError: If no artist has that name.
        ValueError: If a non-blank entry is not an integer.
    """
    raw_albums = df_artist[df_artist['name'] == artist_name]['all_albums'].values[0]
    # Skip blank pieces so an empty list "[]" yields [] instead of failing
    # on int('').
    album_ids = [int(piece) for piece in raw_albums[1:-1].split(',') if piece.strip()]

    if byName:
        return [convert_album_id_to_name(album_id) for album_id in album_ids]

    return album_ids

def get_all_artist_tracks(artist_name, byName=False):
    """Return the tracks of the first ``df_artist`` row named ``artist_name``.

    The ``all_tracks`` cell is parsed as a bracketed, comma-separated list of
    integer ids (e.g. ``"[3, 17, 42]"``).

    Args:
        artist_name: Exact artist name to look up in ``df_artist``.
        byName: When true, return track names instead of track ids.

    Returns:
        A list of track ids, or of track names when ``byName`` is true. The
        list is empty when the stored list is empty (``"[]"``).

    Raises:
        IndexError: If no artist has that name.
        ValueError: If a non-blank entry is not an integer.
    """
    raw_tracks = df_artist[df_artist['name'] == artist_name]['all_tracks'].values[0]
    # Skip blank pieces so an empty list "[]" yields [] instead of failing
    # on int('').
    track_ids_found = [int(piece) for piece in raw_tracks[1:-1].split(',') if piece.strip()]

    # Honour byName: ids are the default, names only on request.
    if byName:
        return [convert_track_id_to_name(found_id) for found_id in track_ids_found]

    return track_ids_found

## Exploration

In [None]:
# Row counts of the three entity tables, reported as the number of distinct
# artists, albums and tracks.
artist_count, album_count, track_count = (
    len(table) for table in (df_artist, df_album, df_track)
)
print("Number of different artists:", artist_count)
print("Number of different albums:", album_count)
print("Number of different tracks:", track_count)

# Average fan-out between the levels, rounded to one decimal place.
mean_tracks_per_album = round(track_count / album_count, 1)
print("\nMean number of tracks per album:", mean_tracks_per_album)

mean_albums_per_artist = round(album_count / artist_count, 1)
print("Mean number of albums per artist:", mean_albums_per_artist)

mean_tracks_per_artist = round(track_count / artist_count, 1)
print("Mean number of tracks per artist:", mean_tracks_per_artist)

### Artists

In [None]:
# Case-insensitive substring match on the artist name; rows whose name is
# missing (NaN) count as non-matching.
has_bieber = df_artist['name'].str.contains('bieber', case=False, na=False)
bieber_artists = df_artist.loc[has_bieber]

# Show the matching rows and how many there are.
print('Artists containing "Bieber":\n', bieber_artists)
print('Number of artists containing "Bieber":', len(bieber_artists))

In [None]:
# How many artists have "&", "and" or "featuring" in their name?

# The previous pattern ('&|featuring') never matched "and", although the
# labels printed below promise it. "and" and "featuring" are matched as whole
# words so names such as "Brandy" do not count, and the match is
# case-insensitive so "And" / "Featuring" are included. Missing names (NaN)
# count as non-matching.
collab_pattern = r'&|\band\b|\bfeaturing\b'
and_artists = df_artist[df_artist['name'].str.contains(collab_pattern, case=False, na=False)]

# Display the results
print('Artists containing "&", "and" or "featuring":\n', and_artists)
print('Number of artists containing "&", "and" or "featuring":', len(and_artists))

# Number of distinct track ids in the track_similarity table: the same id can
# appear in many rows and in either column, so both columns are stacked into
# one Series before de-duplicating.
unique_track_ids = df_similar[['track_id_1', 'track_id_2']].stack().unique()
print('Number of unique track_ids in the track_similarity table:', len(unique_track_ids))

### Tracks & Albums

In [None]:
# Artist whose catalogue is listed below.
artist_name = 'Taylor Swift'

# Resolve the artist's albums and tracks to display names.
all_albums = get_all_artist_albums(artist_name, byName=True)
all_tracks = get_all_artist_tracks(artist_name, byName=True)

# Print each section as a heading followed by one entry per line.
sections = (
    (f"\nAlbums by {artist_name}:", all_albums),
    (f"\n====================\nTracks by {artist_name}:", all_tracks),
)
for heading, entries in sections:
    print(heading)
    for entry in entries:
        print(entry)

In [None]:
# Count tracks per artist (largest first), plot the distribution and print
# the 20 artists with the most tracks.
artist_track_count = (
    df_track.groupby('artist_id')
    .size()
    .reset_index(name='track_count')
    .sort_values(by='track_count', ascending=False)
)

# Distribution of the number of tracks per artist; the y axis is logarithmic.
plt.hist(artist_track_count['track_count'], bins=50)
plt.xlabel('Number of tracks')
plt.ylabel('Number of artists')
plt.yscale('log')
plt.title('Distribution of the number of tracks per artist')

# Copy the top 20 explicitly: head() returns a slice of the sorted frame, and
# adding a column to such a slice can raise pandas' SettingWithCopyWarning.
artist_track_count = artist_track_count.head(20).copy()
artist_track_count['artist_name'] = artist_track_count['artist_id'].apply(convert_artist_id_to_name)
print("Top 20 artists by number of tracks:\n" + str(artist_track_count))

In [None]:
# Count tracks per album (largest first), plot the distribution and print
# the 20 albums with the most tracks.
album_track_count = (
    df_track.groupby('album_id')
    .size()
    .reset_index(name='track_count')
    .sort_values(by='track_count', ascending=False)
)

# Distribution of the number of tracks per album; the y axis is logarithmic.
plt.hist(album_track_count['track_count'], bins=50)
plt.xlabel('Number of tracks')
plt.ylabel('Number of albums')
plt.yscale('log')
plt.title('Distribution of the number of tracks per album')

# Copy the top 20 explicitly: head() returns a slice of the sorted frame, and
# adding a column to such a slice can raise pandas' SettingWithCopyWarning.
album_track_count = album_track_count.head(20).copy()
album_track_count['album_name'] = album_track_count['album_id'].apply(convert_album_id_to_name)
print("Top 20 albums by number of tracks:\n" + str(album_track_count))


### Similarity

In [None]:
def compare_track_lists_sim(track_id_list_1, track_id_list_2):
    """List the recorded similarities between two collections of track ids.

    Every pair (one id from each list) is looked up in ``df_similar`` in both
    column orientations, because a pair may be stored as (id_1, id_2) or as
    (id_2, id_1). A pair stored both ways contributes two entries.

    Args:
        track_id_list_1: Track ids reported first in each result tuple.
        track_id_list_2: Track ids reported second in each result tuple.

    Returns:
        A list of ``(name_1, name_2, sim_degree)`` tuples sorted by descending
        ``sim_degree``; empty when no pair is found.
    """
    matches = []
    # Forward orientation first, then the swapped one.
    orientations = (('track_id_1', 'track_id_2'), ('track_id_2', 'track_id_1'))

    for main_track_id in track_id_list_1:
        for other_track_id in track_id_list_2:
            for main_col, other_col in orientations:
                hits = df_similar[
                    (df_similar[main_col] == main_track_id)
                    & (df_similar[other_col] == other_track_id)
                ]
                if hits.empty:
                    continue
                # Only the first stored row for this orientation is reported.
                matches.append((
                    convert_track_id_to_name(main_track_id),
                    convert_track_id_to_name(other_track_id),
                    float(hits.iloc[0]["sim_degree"]),
                ))

    matches.sort(key=lambda entry: entry[2], reverse=True)
    return matches

In [None]:
# Compare every track of one artist with every track of another using the
# track_similarity table.
artist_1, artist_2 = 'Justin Bieber', 'Calvin Harris'

# compare_track_lists_sim matches these values against the track id columns
# of df_similar, so both lists must hold track ids.
artist_1_tracks_ids = get_all_artist_tracks(artist_1)
artist_2_tracks_ids = get_all_artist_tracks(artist_2)

print(f'Tracks by {artist_1}: {len(artist_1_tracks_ids)}')
print(f'Tracks by {artist_2}: {len(artist_2_tracks_ids)}\n')

# One (name, name, sim_degree) tuple per recorded pair, strongest first.
result = compare_track_lists_sim(artist_1_tracks_ids, artist_2_tracks_ids)
if result:
    for r in result:
        print(r)
else:
    print('No similarities found between the tracks of the two artists')

In [None]:
# List the track ids that never appear in the track_similarity table.
track_ids = df_track['track_id'].to_list()

# Collect every id that occurs in either similarity column once, so each track
# is a set lookup rather than a scan of the whole similarity table.
ids_with_similarity = set(df_similar['track_id_1']).union(df_similar['track_id_2'])
temp_list = [track_id for track_id in track_ids if track_id not in ids_with_similarity]

print('Track ids without similarity value:', f"{temp_list[:10]} ..." if len(temp_list) > 10 else temp_list)
print('Number of track ids without similarity value:', len(temp_list))
print('Percentage of track ids without similarity value:', round(len(temp_list) / track_count * 100, 2), '%')

In [None]:
# Print the track-similarity table loaded from data/track_similarity.csv.
print(df_similar)

In [None]:
# Import necessary libraries
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import community

# Build an undirected graph from the first 500 similarity rows: one node per
# track id, one edge per row, weighted by sim_degree.
G = nx.Graph()

df_similar_small = df_similar.head(500)

# Iterate column-wise instead of with iterrows(): iterrows() returns each row
# as a single Series, which pandas may upcast to a common dtype, turning
# integer track ids into floats (node labels such as "428.0").
for track_a, track_b, sim_degree in zip(
    df_similar_small['track_id_1'],
    df_similar_small['track_id_2'],
    df_similar_small['sim_degree'],
):
    G.add_edge(track_a, track_b, weight=sim_degree)

# Set up figure size and resolution BEFORE drawing anything
plt.figure(figsize=(40, 40), dpi=500)

# Seeded layout so the picture is reproducible between runs, as in the
# artist similarity graph.
pos = nx.spring_layout(G, seed=42)

# Detect communities and give every node the index of its community as colour.
communities = community.greedy_modularity_communities(G)
node_colors = {}
for idx, comm in enumerate(communities):
    for node in comm:
        node_colors[node] = idx

# Draw nodes with community colors
nx.draw_networkx_nodes(
    G, pos,
    node_size=500,
    node_color=[node_colors[node] for node in G.nodes()]
)

# Draw edges with thickness proportional to the similarity weight. The edge
# list and the width list come from the same iteration, so they stay aligned.
weighted_edges = list(G.edges(data='weight'))
nx.draw_networkx_edges(
    G,
    pos,
    edgelist=[(u, v) for u, v, _ in weighted_edges],
    width=[weight * 5 for _, _, weight in weighted_edges]  # Scale the thickness
)

# Draw labels
nx.draw_networkx_labels(G, pos, font_size=5, font_color="black")

# Display the graph
plt.title("Song Similarity Graph")
plt.show()


In [None]:
# Extend a copy of df_similar with the artist id and artist name of both
# tracks in every pair.


def _first_match_lookup(keys, table, key_col, value_col):
    """Map each key to ``value_col`` of the first ``table`` row matching it.

    A single indexed lookup replaces a per-row scan of ``table``. As with the
    per-row ``.values[0]`` lookup it replaces, the first matching row wins and
    an unknown key raises ``IndexError``.
    """
    mapping = table.drop_duplicates(subset=key_col).set_index(key_col)[value_col]
    unknown = ~keys.isin(mapping.index)
    if unknown.any():
        examples = keys[unknown].unique()[:5].tolist()
        raise IndexError(f'{key_col} values without a match: {examples}')
    return keys.map(mapping)


df_similar_artists = df_similar.copy()

# Artist ids first, then artist names, so the new columns are appended in the
# order artist_id_1, artist_id_2, artist_name_1, artist_name_2.
for side in ('1', '2'):
    df_similar_artists[f'artist_id_{side}'] = _first_match_lookup(
        df_similar_artists[f'track_id_{side}'], df_track, 'track_id', 'artist_id'
    )
for side in ('1', '2'):
    df_similar_artists[f'artist_name_{side}'] = _first_match_lookup(
        df_similar_artists[f'artist_id_{side}'], df_artist, 'artist_id', 'name'
    )

# print the new dataframe
print(df_similar_artists)





In [None]:
# Keep only the artist pair and its similarity score, with the score as the
# last column.
df_similar_artists = df_similar_artists.loc[:, ['artist_name_1', 'artist_name_2', 'sim_degree']]

In [None]:
# Preview the first rows of the artist-level similarity table.
df_similar_artists.head()

In [None]:
# Drop self-pairs: rows where both tracks belong to the same artist name.
is_cross_artist = df_similar_artists['artist_name_1'].ne(df_similar_artists['artist_name_2'])
df_similar_artists = df_similar_artists.loc[is_cross_artist]
df_similar_artists.head()


In [None]:
# Import necessary libraries
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import community

# NOTE(review): the two subsets below are computed but never used; the graph
# is built from the full df_similar_artists frame. Confirm which input was
# intended.
df_similar_artists_small = df_similar_artists.head(100)  # Adjust for your actual dataframe
df_similar_artists_1 = df_similar_artists[df_similar_artists['sim_degree'] == 1]

# Undirected artist graph: one edge per row, weighted by sim_degree. A pair
# that occurs in several rows keeps the weight of its last row.
G = nx.Graph()
for name_a, name_b, sim_degree in zip(
    df_similar_artists['artist_name_1'],
    df_similar_artists['artist_name_2'],
    df_similar_artists['sim_degree'],
):
    G.add_edge(name_a, name_b, weight=sim_degree)

# Set up figure size and resolution BEFORE drawing anything
plt.figure(figsize=(20, 20), dpi=300)

# Seeded spring layout; k controls the spacing between nodes.
pos = nx.spring_layout(G, k=0.7, seed=42)

# Colour every node by the index of the community it belongs to.
communities = community.greedy_modularity_communities(G)
node_colors = {
    node: idx
    for idx, comm in enumerate(communities)
    for node in comm
}

# Node size grows with the number of neighbours.
node_size = [degree * 100 for _, degree in G.degree()]

nx.draw_networkx_nodes(
    G, pos,
    node_size=node_size,
    node_color=[node_colors[node] for node in G.nodes()]
)

# Edge thickness follows the similarity weight.
weighted_edges = G.edges(data=True)
nx.draw_networkx_edges(
    G,
    pos,
    edgelist=weighted_edges,
    width=[attrs['weight'] * 2.5 for _, _, attrs in weighted_edges]  # Scale the thickness
)

# Draw labels
nx.draw_networkx_labels(G, pos, font_size=8, font_color="gray")

# Display the graph
plt.title("Artist Similarity Graph")
plt.show()


In [None]:
# Number of df_similar_artists rows per artist, counted on the artist_name_1
# column only.
# NOTE(review): this rebinds artist_count, which the exploration cell defines
# as the integer number of artists; re-running that cell's dependants after
# this one would see a DataFrame instead.
artist_count = (
    df_similar_artists['artist_name_1']
    .value_counts()
    .rename_axis('artist_name')
    .reset_index(name='count')
)
artist_count.head()



## 1 Year Analysis
To be able to consider data from all three sources, we analyze the interval between 01/05/2013 and 01/05/2014. In terms of weekly epochs, this corresponds to 29/04/2013 through 04/05/2014, i.e. weeks 487 to 539.

In [None]:
# Restrict weekly_rating to the analysis window: epochs 487 to 539, both
# bounds included.
in_window = df_weekly_rating['time_epoch'].between(487, 539)
df_year = df_weekly_rating.loc[in_window]
df_year.shape

In [None]:
# Start the yearly analysis with a single track, id 428.
df_year_single = df_year.loc[df_year['track_id'] == 428]
df_year_single.shape

# One line per chart source over the year. The y axis is inverted so that
# better (numerically smaller) positions appear at the top.
plt.figure(figsize=(10, 5))
chart_sources = (
    ('position_billboard', 'Billboard'),
    ('position_spotify', 'Spotify'),
    ('position_lastfm', 'LastFM'),
)
for position_column, source_label in chart_sources:
    plt.plot(df_year_single['time_epoch'], df_year_single[position_column], label=source_label)
plt.xlabel('Time')
plt.ylabel('Position')
plt.title('Track 428')
plt.gca().invert_yaxis()
plt.legend()
plt.show()

In [None]:
# For every track in the yearly window, compute the Spearman correlation
# between each pair of chart positions (Billboard, Spotify, LastFM), then
# report the best track for each pair.


def _spearman_or_zero(track_rows, first_col, second_col):
    """Spearman correlation of two position columns, or 0 when it is NaN.

    Only rows where both columns are present are used.
    """
    paired = track_rows.dropna(subset=[first_col, second_col], how='any')
    rho, _ = spearmanr(paired[first_col], paired[second_col])
    return 0 if np.isnan(rho) else rho


# Get the track ids
track_ids = df_year['track_id'].unique()

# One (track_id, billboard/spotify, billboard/lastfm, spotify/lastfm) tuple
# per track.
correlations = []
for track_id in track_ids:
    df_year_single = df_year[df_year['track_id'] == track_id]
    correlations.append((
        track_id,
        _spearman_or_zero(df_year_single, 'position_billboard', 'position_spotify'),
        _spearman_or_zero(df_year_single, 'position_billboard', 'position_lastfm'),
        _spearman_or_zero(df_year_single, 'position_spotify', 'position_lastfm'),
    ))

# Each ranking below re-sorts the list left by the previous one; the sort is
# stable, so ties keep the order they already had.

# Best Billboard / Spotify agreement
correlations.sort(key=lambda entry: entry[1], reverse=True)
best_track_id, corr_billboard_spotify, corr_billboard_lastfm, corr_spotify_lastfm = correlations[0]
print(f'Track with highest correlation between Billboard and Spotify: {convert_track_id_to_name(best_track_id), corr_billboard_spotify}')

# Best Billboard / LastFM agreement
correlations.sort(key=lambda entry: entry[2], reverse=True)
best_track_id, corr_billboard_spotify, corr_billboard_lastfm, corr_spotify_lastfm = correlations[0]
print(f'Track with highest correlation between Billboard and LastFM: {convert_track_id_to_name(best_track_id), corr_billboard_lastfm}')

# Best Spotify / LastFM agreement
correlations.sort(key=lambda entry: entry[3], reverse=True)
best_track_id, corr_billboard_spotify, corr_billboard_lastfm, corr_spotify_lastfm = correlations[0]
print(f'Track with highest correlation between Spotify and LastFM: {convert_track_id_to_name(best_track_id), corr_spotify_lastfm}')

In [None]:
# Identify the tracks that have 53 rows in the yearly data. The window spans
# epochs 487 to 539 inclusive, i.e. 53 weeks, so these are presumably the
# tracks present in every week (assumes one row per track and week - TODO
# confirm).
entries_per_track = df_year['track_id'].value_counts()
track_id = entries_per_track[entries_per_track == 53].index.tolist()
print('Track ids with 53 entries:', track_id)

# Get the tracks names
track_names = [convert_track_id_to_name(track) for track in track_id]
print('Track names with 53 entries:', track_names)

# Keep only those tracks and the columns needed for plotting.
df_year_53 = df_year[df_year['track_id'].isin(track_id)]
df_year_53 = df_year_53[['track_id','time_epoch' , 'position_billboard', 'position_spotify', 'position_lastfm']]

# Plot the evolution of the three positions, one figure per track.
for track in track_id:
    # Create the figure inside the loop: plt.show() ends the current figure,
    # so a single figure created before the loop would give only the first
    # track the 10x5 size.
    plt.figure(figsize=(10, 5))
    df_year_single = df_year_53[df_year_53['track_id'] == track]
    plt.plot(df_year_single['time_epoch'], df_year_single['position_billboard'], label='Billboard')
    plt.plot(df_year_single['time_epoch'], df_year_single['position_spotify'], label='Spotify')
    plt.plot(df_year_single['time_epoch'], df_year_single['position_lastfm'], label='LastFM')
    plt.xlabel('Time')
    plt.ylabel('Position')
    plt.title(f'Track {convert_track_id_to_name(track)} - Id of {track}')
    # Inverted so that better (numerically smaller) positions are at the top.
    plt.gca().invert_yaxis()
    plt.legend()
    plt.show()






### Billboard Top 100 Consecutive Weeks Per Track

In [None]:
# Longest streak of consecutive weeks on the Billboard chart, per track.


def _longest_consecutive_run(epochs, positions):
    """Find the first longest run of consecutive weekly epochs.

    Args:
        epochs: Epoch numbers in ascending order.
        positions: Chart position for each epoch, aligned with ``epochs``.

    Returns:
        ``(length, start_epoch, end_epoch, start_position, end_position)``;
        all zeros when ``epochs`` is empty.
    """
    best = (0, 0, 0, 0, 0)
    run_length = 0
    run_start_epoch = run_start_position = previous_epoch = None
    for epoch, position in zip(epochs, positions):
        if previous_epoch is not None and epoch - previous_epoch == 1:
            run_length += 1
        else:
            # First row, or a gap since the previous charted week: a new run
            # starts here.
            run_length = 1
            run_start_epoch, run_start_position = epoch, position
        # Strict '>' keeps the earliest run when several share the maximum.
        if run_length > best[0]:
            best = (run_length, run_start_epoch, epoch, run_start_position, position)
        previous_epoch = epoch
    return best


# One tuple per track: (track_id, first_position, last_position,
# max_consecutive_weeks, starting_week_max, ending_week_max,
# starting_pos_max, ending_pos_max).
billboard_top100_weeks = []

for track_id in track_ids:
    # Weeks in which the track has a Billboard position, in chronological order.
    charted = (
        df_year[df_year['track_id'] == track_id]
        .sort_values('time_epoch')
        .dropna(subset=['position_billboard'])
    )
    if charted.empty:
        continue

    # Positions in the first and last charted week of the window.
    first_position = charted['position_billboard'].iloc[0]
    last_position = charted['position_billboard'].iloc[-1]

    # The columns are iterated directly rather than through iterrows(), which
    # returns each row as one Series and may upcast integer epochs to floats.
    billboard_top100_weeks.append((
        track_id,
        first_position,
        last_position,
        *_longest_consecutive_run(charted['time_epoch'], charted['position_billboard']),
    ))

# Sort the results by the number of consecutive weeks
billboard_top100_weeks = sorted(billboard_top100_weeks, key=lambda x: x[3], reverse=True)

# Print the results
print('Tracks with the highest number of consecutive weeks in the Billboard top 100:')
# NOTE(review): the per-track listing for the header above is disabled, so
# only the header itself is printed.

# 487 is the first epoch of the yearly window.
print('Tracks that did not start on the first week:')
for track_id, first_position, last_position, max_consecutive_weeks, starting_week_max, ending_week_max, starting_pos_max, ending_pos_max in billboard_top100_weeks:
    if starting_week_max != 487 and max_consecutive_weeks > 1:
        print(f'Track: {convert_track_id_to_name(track_id)} of id {track_id} - First Position: {first_position} - Last Position: {last_position} - Max Consecutive Weeks: {max_consecutive_weeks} - Starting Week: {starting_week_max} - Ending Week: {ending_week_max} - Starting Position: {starting_pos_max} - Ending Position: {ending_pos_max}')



### Last.fm Top 100 Consecutive Weeks Per Track

In [None]:
# Longest streak of consecutive weeks on the LastFM chart, per track.


def _longest_consecutive_run(epochs, positions):
    """Find the first longest run of consecutive weekly epochs.

    Args:
        epochs: Epoch numbers in ascending order.
        positions: Chart position for each epoch, aligned with ``epochs``.

    Returns:
        ``(length, start_epoch, end_epoch, start_position, end_position)``;
        all zeros when ``epochs`` is empty.
    """
    best = (0, 0, 0, 0, 0)
    run_length = 0
    run_start_epoch = run_start_position = previous_epoch = None
    for epoch, position in zip(epochs, positions):
        if previous_epoch is not None and epoch - previous_epoch == 1:
            run_length += 1
        else:
            # First row, or a gap since the previous charted week: a new run
            # starts here.
            run_length = 1
            run_start_epoch, run_start_position = epoch, position
        # Strict '>' keeps the earliest run when several share the maximum.
        if run_length > best[0]:
            best = (run_length, run_start_epoch, epoch, run_start_position, position)
        previous_epoch = epoch
    return best


# One tuple per track: (track_id, first_position, last_position,
# max_consecutive_weeks, starting_week_max, ending_week_max,
# starting_pos_max, ending_pos_max).
lastfm_top100_weeks = []

for track_id in track_ids:
    # Weeks in which the track has a LastFM position, in chronological order.
    charted = (
        df_year[df_year['track_id'] == track_id]
        .sort_values('time_epoch')
        .dropna(subset=['position_lastfm'])
    )
    if charted.empty:
        continue

    # Positions in the first and last charted week of the window.
    first_position = charted['position_lastfm'].iloc[0]
    last_position = charted['position_lastfm'].iloc[-1]

    # The columns are iterated directly rather than through iterrows(), which
    # returns each row as one Series and may upcast integer epochs to floats.
    lastfm_top100_weeks.append((
        track_id,
        first_position,
        last_position,
        *_longest_consecutive_run(charted['time_epoch'], charted['position_lastfm']),
    ))

# Sort the results by the number of consecutive weeks
lastfm_top100_weeks = sorted(lastfm_top100_weeks, key=lambda x: x[3], reverse=True)

# Print the results
print('Tracks with the highest number of consecutive weeks in the LastFM top 100:')
# NOTE(review): the per-track listing for the header above is disabled, so
# only the header itself is printed.

# 487 is the first epoch of the yearly window.
print('Tracks that did not start on the first week:')

for track_id, first_position, last_position, max_consecutive_weeks, starting_week_max, ending_week_max, starting_pos_max, ending_pos_max in lastfm_top100_weeks:
    if starting_week_max != 487 and max_consecutive_weeks > 1:
        print(f'Track: {convert_track_id_to_name(track_id)} of id {track_id} - First Position: {first_position} - Last Position: {last_position} - Max Consecutive Weeks: {max_consecutive_weeks} - Starting Week: {starting_week_max} - Ending Week: {ending_week_max} - Starting Position: {starting_pos_max} - Ending Position: {ending_pos_max}')

### Spotify Top 100 Consecutive Weeks Per Track

In [None]:
# Longest streak of consecutive weeks on the Spotify chart, per track.


def _longest_consecutive_run(epochs, positions):
    """Find the first longest run of consecutive weekly epochs.

    Args:
        epochs: Epoch numbers in ascending order.
        positions: Chart position for each epoch, aligned with ``epochs``.

    Returns:
        ``(length, start_epoch, end_epoch, start_position, end_position)``;
        all zeros when ``epochs`` is empty.
    """
    best = (0, 0, 0, 0, 0)
    run_length = 0
    run_start_epoch = run_start_position = previous_epoch = None
    for epoch, position in zip(epochs, positions):
        if previous_epoch is not None and epoch - previous_epoch == 1:
            run_length += 1
        else:
            # First row, or a gap since the previous charted week: a new run
            # starts here.
            run_length = 1
            run_start_epoch, run_start_position = epoch, position
        # Strict '>' keeps the earliest run when several share the maximum.
        if run_length > best[0]:
            best = (run_length, run_start_epoch, epoch, run_start_position, position)
        previous_epoch = epoch
    return best


# One tuple per track: (track_id, first_position, last_position,
# max_consecutive_weeks, starting_week_max, ending_week_max,
# starting_pos_max, ending_pos_max).
spotify_top100_weeks = []

for track_id in track_ids:
    # Weeks in which the track has a Spotify position, in chronological order.
    charted = (
        df_year[df_year['track_id'] == track_id]
        .sort_values('time_epoch')
        .dropna(subset=['position_spotify'])
    )
    if charted.empty:
        continue

    # Positions in the first and last charted week of the window.
    first_position = charted['position_spotify'].iloc[0]
    last_position = charted['position_spotify'].iloc[-1]

    # The columns are iterated directly rather than through iterrows(), which
    # returns each row as one Series and may upcast integer epochs to floats.
    spotify_top100_weeks.append((
        track_id,
        first_position,
        last_position,
        *_longest_consecutive_run(charted['time_epoch'], charted['position_spotify']),
    ))

# Sort the results by the number of consecutive weeks
spotify_top100_weeks = sorted(spotify_top100_weeks, key=lambda x: x[3], reverse=True)

# Print the results
print('Tracks with the highest number of consecutive weeks in the Spotify top 100:')
# NOTE(review): the per-track listing for the header above is disabled, so
# only the header itself is printed.

# 487 is the first epoch of the yearly window.
print('Tracks that did not start on the first week:')
for track_id, first_position, last_position, max_consecutive_weeks, starting_week_max, ending_week_max, starting_pos_max, ending_pos_max in spotify_top100_weeks:
    if starting_week_max != 487 and max_consecutive_weeks > 1:
        print(f'Track: {convert_track_id_to_name(track_id)} of id {track_id} - First Position: {first_position} - Last Position: {last_position} - Max Consecutive Weeks: {max_consecutive_weeks} - Starting Week: {starting_week_max} - Ending Week: {ending_week_max} - Starting Position: {starting_pos_max} - Ending Position: {ending_pos_max}')