# A3: Project on complex networks
### Analyzing Spotify Data: Global Patterns of Musical Taste and Artist/Genre Clustering by Country
#### _By Saioa Elizondo, Marc Albesa and Maria Fité_
**Group Name:** A3L

**Objective:** Development of a Project in which you put into practice some of the knowledge acquired
during this course on Complex Networks.

**Kind of project:** Analysis. Obtain real data which can be put in terms of network(s),
and analyze the network(s) to answer any question or hypothesis
about them. It is not enough to calculate descriptors or communities,
you must try to answer relevant questions about the system's
functioning.

## 0. Preparing the environment

In [5]:
%pip install spotipy

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Code to import the needed modules/packages to run the notebook
import os
import ast
import pandas as pd # dataframes
import numpy as np # Mathematical operations
import matplotlib.pyplot as plt # Representation
import random as random
import networkx as nx # Network tools
import spotipy
from matplotlib import rcParams
from matplotlib import colors
from matplotlib.lines import Line2D
from matplotlib.patches import FancyArrowPatch
from collections import Counter
from sklearn.cluster import KMeans
from scipy.sparse.linalg import eigsh
from spotipy.oauth2 import SpotifyClientCredentials

# Get working directory (used later as the base folder for the input files)
wd = os.getcwd()
# Fix the seed of NumPy's global random generator so NumPy-based random
# operations are repeatable between runs.
# NOTE(review): only NumPy is seeded here; the standard-library `random`
# module imported above is not - confirm whether it also needs a seed.
np.random.seed(52)

In [7]:
# Set up credentials
# SECURITY: API credentials must never be hard-coded in the notebook. Any value
# that was previously committed here has to be treated as leaked and should be
# revoked/rotated.
# The client ID and client secret are read from the environment instead; the
# variable names below are the ones spotipy itself looks up, so export them
# before starting the kernel:
#   SPOTIPY_CLIENT_ID      - client ID of the app in the Spotify Developer Dashboard
#   SPOTIPY_CLIENT_SECRET  - client secret of that same app
# If a variable is unset the value is None and it is left to
# SpotifyClientCredentials to report the missing credential.
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')

# Authenticate (client-credentials flow: app-level access, no user login)
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

In [8]:
def get_playlist_tracks(playlist_id):
    """Collect basic information about every track of a playlist.

    The playlist is requested through the module-level client ``sp`` and all
    result pages are walked by following the ``'next'`` link of each page.

    Args:
        playlist_id: Identifier of the playlist, passed unchanged to
            ``sp.playlist_tracks``.

    Returns:
        A list with one dict per track, holding the keys ``'track_name'``,
        ``'artist_name'`` (list of names), ``'artist_id'`` (list of IDs),
        ``'album_name'`` and ``'track_popularity'``. Playlist items whose
        ``'track'`` entry is empty are left out.
    """
    collected = []
    page = sp.playlist_tracks(playlist_id)
    while page:
        for entry in page['items']:
            song = entry['track']
            if not song:
                # No track data for this playlist item: nothing to record
                continue
            collected.append({
                'track_name': song['name'],
                'artist_name': [performer['name'] for performer in song['artists']],
                'artist_id': [performer['id'] for performer in song['artists']],
                'album_name': song['album']['name'],
                'track_popularity': song['popularity'],  # Popularity score (0-100)
            })
        # Stop once the current page does not link to a following one
        if not page['next']:
            break
        page = sp.next(page)
    return collected

## 1. Importing data and constructing the network

In [None]:
# Example playlist IDs for top 50 in various countries
playlist_ids = {
    'USA': '37i9dQZEVXbLRQDuF5jeBp',
    'UK': '37i9dQZEVXbLnolsZ8PSNw',
    'India': '37i9dQZEVXbLZ52XmnySJg',
    # Add more countries here...
}

# One row per (country, track, artist) combination
all_data = []

# Artist details already fetched, keyed by artist ID. The same artist usually
# shows up in several tracks and in several countries' charts, so each artist
# is looked up only once instead of once per appearance.
artist_cache = {}

for country, playlist_id in playlist_ids.items():
    print(f"Fetching data for {country}...")
    tracks = get_playlist_tracks(playlist_id)
    for track in tracks:
        for artist_id in track['artist_id']:
            # NOTE(review): get_artist_info is not defined in the part of the
            # notebook shown here - confirm it is defined before this cell
            # runs. It is expected to return a dict with the keys
            # 'artist_name', 'followers', 'genres' and 'popularity'.
            if artist_id not in artist_cache:
                artist_cache[artist_id] = get_artist_info(artist_id)
            artist_info = artist_cache[artist_id]
            all_data.append({
                'country': country,
                'track_name': track['track_name'],
                'artist_name': artist_info['artist_name'],
                'followers': artist_info['followers'],
                'genres': artist_info['genres'],
                'track_popularity': track['track_popularity'],
                'artist_popularity': artist_info['popularity'],
            })

# Convert to DataFrame for analysis
df = pd.DataFrame(all_data)

In [None]:
# Load the edge list and the node table stored in the "network" folder
network_dir = os.path.join(wd, "network")
edges = pd.read_csv(os.path.join(network_dir, "edges.csv"))
nodes = pd.read_csv(os.path.join(network_dir, "nodes.csv"))

# Directed graph that will hold the network
G = nx.DiGraph()

# One graph node per entry of the "# index" column, labelled with its own name
nodenames = nodes["# index"].values

for node in nodenames:
    G.add_node(node, label=node)

# The regulation type of each row decides the direction of its edge(s):
#   1 -> source to target
#   2 -> target to source
#   3 -> both directions
# Rows with any other regulation type add no edge.
edge_rows = zip(
    edges["# source"].values,
    edges[" target"].values,
    edges[" regulation_type"].values,
)
for node1, node2, raw_regulation in edge_rows:
    regulation = int(raw_regulation)
    if regulation in (1, 3):
        G.add_edge(node1, node2, reg_type=str(regulation))
    if regulation in (2, 3):
        G.add_edge(node2, node1, reg_type=str(regulation))

# If the network is not (weakly) connected, the largest connected component
# (LCC) is used for the rest of the analysis. This LCC must have at least
# 200 nodes.

# All weakly connected components, then the one with the most nodes
weak_components = list(nx.weakly_connected_components(G))
largest_weak_component = max(weak_components, key=len)

# Independent copy of the subgraph induced by the LCC
G_lcc = G.subgraph(largest_weak_component).copy()

# Restrict the node table to LCC nodes, and the edge table to rows whose
# source and target both belong to the LCC
node_in_lcc = nodes["# index"].isin(G_lcc.nodes())
nodes_lcc = nodes[node_in_lcc]
source_in_lcc = edges['# source'].isin(G_lcc.nodes())
target_in_lcc = edges[' target'].isin(G_lcc.nodes())
edges_lcc = edges[source_in_lcc & target_in_lcc]

# Report whether the LCC is weakly connected
print("Largest connected component is weakly connected:",nx.is_weakly_connected(G_lcc))

# Report the node count (to be compared against the 200-node minimum)
print('The number of nodes is:', G_lcc.number_of_nodes())

In [None]:
# Save the collected chart data as CSV (relative path, so it is written to the
# current working directory); index=False leaves the DataFrame index out of
# the file.
df.to_csv('spotify_top50_data.csv', index=False)

In [None]:
# NOTE(review): this cell repeats the export of the preceding cell verbatim and
# simply rewrites the same file - it looks redundant; confirm and drop one.
df.to_csv('spotify_top50_data.csv', index=False)