# In [3]:
import csv
from collections import defaultdict
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool

# Location of the Spotify chart CSV that the rest of the script reads
file_path = 'C:\\Users\\sergi\\Downloads\\Top_Spotify_Songs_in_73_Countries_coord1.csv'

# Route Bokeh output to the notebook so figures render inline
output_notebook()

def format_duration(ms):
    """Render a millisecond duration as an ``"m:ss"`` string.

    Fractions of a second are dropped and the minutes part is not capped
    at 59, e.g. ``61000`` gives ``"1:01"`` and ``3600000`` gives ``"60:00"``.
    """
    # Split the duration (in seconds) into whole minutes and the remainder
    whole_minutes, leftover_seconds = divmod(ms / 1000, 60)
    return f"{int(whole_minutes)}:{int(leftover_seconds):02d}"

# Two-letter country codes mapped to the full country names used for display.
# Codes missing from this table are shown as-is by the lookup further down.
country_mapping = {
    'EG': 'Egypt',
    'US': 'United States',
    'GB': 'United Kingdom',  # was wrongly labelled 'United States'
    'FR': 'France',
    'DE': 'Germany',
    'IT': 'Italy',
    'BR': 'Brazil',
    'SE': 'Sweden',
    'HU': 'Hungary',
    'PH': 'Philippines',
    'NZ': 'New Zealand',
    'KZ': 'Kazakhstan',
    'UA': 'Ukraine',
    'PK': 'Pakistan',
    'RO': 'Romania',
    'LV': 'Latvia',
    'NL': 'Netherlands',
    'BY': 'Belarus',
}

# Best entry seen so far for each song name: the row with the highest
# danceability, kept together with its artist, country, duration and popularity.
# NOTE(review): songs are keyed by title alone, so different tracks that share
# a title are merged into a single entry.
unique_songs = {}

# newline='' hands line-ending handling to the csv module, as its documentation
# requires, so quoted fields containing embedded newlines are parsed correctly.
with open(file_path, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)

    for row in reader:
        song_name = row['name']
        artist_name = row['artists']
        country_code = row['country']
        # Fall back to the raw code when no full name is known for it
        country_name = country_mapping.get(country_code, country_code)
        danceability = float(row['danceability'])
        duration_ms = int(row['duration_ms'])
        popularity = int(row['popularity'])

        # Keep this row only if the song is new or beats the stored danceability
        best_so_far = unique_songs.get(song_name)
        if best_so_far is None or danceability > best_so_far['danceability']:
            unique_songs[song_name] = {
                'artist': artist_name,
                'country': country_name,
                'danceability': danceability,
                'duration_ms': duration_ms,
                'popularity': popularity,
            }

# Rank every song by danceability (highest first) and keep the first 20
top_songs = sorted(
    unique_songs.items(),
    key=lambda entry: entry[1]['danceability'],
    reverse=True,
)[:20]

# Split the ranked (title, details) pairs into parallel per-column lists
song_artist_country_names = [
    f"{title} by {details['artist']}" for title, details in top_songs
]
danceability_scores = [details['danceability'] for _, details in top_songs]
durations = [format_duration(details['duration_ms']) for _, details in top_songs]
popularity_scores = [details['popularity'] for _, details in top_songs]
country_names = [details['country'] for _, details in top_songs]

# Marker color for each country display name. Every country gets its own color
# so that points from different countries can be told apart in the plot.
country_colors = {
    'Egypt': 'red',
    'United States': 'blue',
    'United Kingdom': 'green',  # previously 'blue', identical to United States
    'France': 'purple',
    'Germany': 'orange',
    'Italy': 'cyan',
    'Brazil': 'magenta',
    'Sweden': 'yellow',
    'Hungary': 'pink',
    'Philippines': 'brown',
    'New Zealand': 'lightgreen',
    'Kazakhstan': 'lightblue',
    'Ukraine': 'lightgray',
    'Pakistan': 'lightcoral',
    'Romania': 'violet',
    'Latvia': 'gold',
    'Netherlands': 'darkorange',
    'Belarus': 'darkviolet',
}

# Pick each song's marker color from its country; unmapped countries fall back to gray
colors = [country_colors.get(name, 'gray') for name in country_names]

# Gather the parallel per-song lists into named columns for Bokeh.
# Note that the 'duration_ms' column carries the already formatted "m:ss" strings.
plot_columns = dict(
    song_artist_country_names=song_artist_country_names,
    danceability=danceability_scores,
    duration_ms=durations,
    popularity=popularity_scores,
    country_name=country_names,
    color=colors,  # per-point marker color
)
source = ColumnDataSource(data=plot_columns)

# Build the figure; the song labels are passed as the x-axis range
p = figure(
    title="Top 20 Songs Based On Danceability In 73 Countries",
    x_axis_label='Songs, Artists',
    y_axis_label='Danceability',
    x_range=song_artist_country_names,
    width=700,
    height=700,
)

# Draw one marker per song, taking its fill color from the 'color' column
p.scatter(
    x='song_artist_country_names',
    y='danceability',
    size=10,
    color='color',
    alpha=0.7,
    source=source,
)

# Tooltip rows for the hover tool; each "@name" refers to a data source column
hover = HoverTool(tooltips=[
    ("Song Info", "@song_artist_country_names"),
    ("Danceability", "@danceability"),
    ("Duration", "@duration_ms"),
    ("Popularity", "@popularity"),
    ("Country", "@country_name"),  # full country name rather than the code
])
p.add_tools(hover)

# Tilt the x-axis tick labels so the long song labels stay readable
p.xaxis.major_label_orientation = 1

# Render the finished plot
show(p)