In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import networkx as nx

import spotify_credentials

In [102]:
"""Get an OAuth token using the Client Credentials Flow."""
# Client Credentials Flow: the app's client ID/secret are sent as HTTP Basic
# auth and the endpoint answers with a bearer token used by every API helper.
token_url = 'https://accounts.spotify.com/api/token'
headers = {'Content-Type': 'application/x-www-form-urlencoded'}
data = {'grant_type': 'client_credentials'}
# requests applies no timeout unless one is passed, so a stalled connection
# would block the kernel forever; fail after 10 seconds instead.
response = requests.post(
    token_url,
    headers=headers,
    data=data,
    auth=(spotify_credentials.client_id, spotify_credentials.client_secret),
    timeout=10,
)
# Surface 4xx/5xx responses (e.g. bad credentials) here rather than as a
# KeyError on the missing 'access_token' field below.
response.raise_for_status()
token = response.json()['access_token']

In [103]:
def search_artist(artist_name, timeout=10):
    """Search Spotify for an artist by name and return the first match's ID.

    Uses the module-level ``token`` as the bearer token.

    Args:
        artist_name: Free-text query sent as the ``q`` search parameter.
        timeout: Seconds to wait for the HTTP request. requests applies no
            timeout by default, which would let a stalled connection hang
            the notebook.

    Returns:
        The ``id`` of the first artist in the search results, or ``None``
        when the search returns no artists.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request takes longer than ``timeout``.
    """
    search_url = 'https://api.spotify.com/v1/search'
    headers = {'Authorization': f'Bearer {token}'}
    # limit=1: only the first hit is ever used.
    params = {'q': artist_name, 'type': 'artist', 'limit': 1}
    response = requests.get(search_url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    results = response.json()['artists']['items']
    if results:
        return results[0]['id']
    return None

def get_artist_top_tracks(artist_id, market='US', timeout=10):
    """Fetch the top tracks of an artist.

    Uses the module-level ``token`` as the bearer token.

    Args:
        artist_id: Spotify artist ID inserted into the request URL.
        market: Value sent as the ``market`` query parameter.
        timeout: Seconds to wait for the HTTP request. requests applies no
            timeout by default, which would let a stalled connection hang
            the notebook.

    Returns:
        The value of the ``tracks`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request takes longer than ``timeout``.
    """
    url = f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks'
    headers = {'Authorization': f'Bearer {token}'}
    params = {'market': market}
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()['tracks']

def get_track_audio_features(track_id, timeout=10):
    """Fetch the audio-features document for a track.

    Uses the module-level ``token`` as the bearer token.

    Args:
        track_id: Spotify track ID inserted into the request URL.
        timeout: Seconds to wait for the HTTP request. requests applies no
            timeout by default, which would let a stalled connection hang
            the notebook.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request takes longer than ``timeout``.
    """
    url = f'https://api.spotify.com/v1/audio-features/{track_id}'
    headers = {'Authorization': f'Bearer {token}'}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def get_artist_name(artist_id, timeout=10):
    """Look up the display name of an artist from their Spotify ID.

    Uses the module-level ``token`` as the bearer token.

    Args:
        artist_id: Spotify artist ID inserted into the request URL.
        timeout: Seconds to wait for the HTTP request. requests applies no
            timeout by default, which would let a stalled connection hang
            the notebook.

    Returns:
        The ``name`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request takes longer than ``timeout``.
    """
    url = f'https://api.spotify.com/v1/artists/{artist_id}'
    headers = {'Authorization': f'Bearer {token}'}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()['name']

In [104]:
# Quick check of the search helper: resolve one artist name to a Spotify ID.
# The bare call on the last line makes the returned ID the cell's output.
artist_name = 'Taylor Swift'
search_artist(artist_name)

'06HL4z0CvFAxyc27GXpf02'

## Module 2 - Graph Analysis
In this module, Ryan analyzed connections between different artists on Spotify. From this analysis, we will use supervised learning to predict whether a song will be a hit or not.

https://medium.com/inst414-data-science-tech/a-groovy-network-analysis-how-features-on-rap-songs-benefit-everyone-50fda2bb91ee

In [105]:
# Seed artists whose top tracks define the collaboration network
artists = [
    "Westside Gunn", "Conway the Machine", "Freddie Gibbs", "Benny the Butcher", "Larry June", "Roc Marciano"]

# Undirected graph: nodes are Spotify artist IDs (with a 'name' attribute),
# edge 'weight' counts the distinct tracks two artists share.
g = nx.Graph()

if __name__ == '__main__':

    # A track featuring two seed artists can be returned in both of their
    # top-track lists. Remember which track IDs were already processed so
    # such a track contributes to the edge weights only once.
    seen_track_ids = set()

    # Loop through each artist, get their Spotify ID and top tracks
    for artist in artists:
        artist_id = search_artist(artist)
        if artist_id is None:
            print(f"Could not find Spotify data for {artist}")
            continue

        print(f"\nArtist: {artist} (ID: {artist_id})")
        top_tracks = get_artist_top_tracks(artist_id)

        for track in top_tracks:
            if track['id'] in seen_track_ids:
                continue
            seen_track_ids.add(track['id'])

            track_name = track['name']
            # Each track contains a list of artists (each a dict with 'id' and 'name')
            track_artists = track['artists']

            # Add every artist as a node (using their Spotify ID as the unique identifier)
            for a in track_artists:
                g.add_node(a['id'], name=a['name'])

            # Create an edge between every unordered pair of artists on the track
            for position, first in enumerate(track_artists):
                for second in track_artists[position + 1:]:
                    a_id = first['id']
                    b_id = second['id']

                    # If an edge already exists, increment the weight; otherwise, add a new edge with weight 1.
                    if g.has_edge(a_id, b_id):
                        g[a_id][b_id]['weight'] += 1
                    else:
                        g.add_edge(a_id, b_id, weight=1)

                    print(f"{first['name']} <-> {second['name']} on '{track_name}'")



Artist: Westside Gunn (ID: 0ABk515kENDyATUdpCKVfW)
Westside Gunn <-> Doechii on 'EGYPT - Remix'
Travis Scott <-> Westside Gunn on 'LOST FOREVER (feat. Westside Gunn)'
Westside Gunn <-> Joey Bada$$ on '327 (feat. Tyler, The Creator & Billie Essco)'
Westside Gunn <-> Tyler, The Creator on '327 (feat. Tyler, The Creator & Billie Essco)'
Westside Gunn <-> Billie Essco on '327 (feat. Tyler, The Creator & Billie Essco)'
Joey Bada$$ <-> Tyler, The Creator on '327 (feat. Tyler, The Creator & Billie Essco)'
Joey Bada$$ <-> Billie Essco on '327 (feat. Tyler, The Creator & Billie Essco)'
Tyler, The Creator <-> Billie Essco on '327 (feat. Tyler, The Creator & Billie Essco)'
Statik Selektah <-> Westside Gunn on 'The Louvre'
Statik Selektah <-> Joey Bada$$ on 'The Louvre'
Statik Selektah <-> Rome Streetz on 'The Louvre'
Statik Selektah <-> Stove God Cooks on 'The Louvre'
Westside Gunn <-> Joey Bada$$ on 'The Louvre'
Westside Gunn <-> Rome Streetz on 'The Louvre'
Westside Gunn <-> Stove God Cooks on

In [106]:
# Flatten the collaboration graph into an edge table: one row per edge with
# both endpoint artist IDs and the edge's 'weight' attribute.
edges = g.edges(data=True)
edge_list = [
    (src_id, dst_id, attrs['weight'])
    for src_id, dst_id, attrs in edges
]
collaboration_df = pd.DataFrame(
    data=edge_list,
    columns=['source', 'target', 'weight'],
)
collaboration_df

Unnamed: 0,source,target,weight
0,0ABk515kENDyATUdpCKVfW,4E2rKHVDssGJm2SCDOMMJB,1
1,0ABk515kENDyATUdpCKVfW,0Y5tJX1MQlPlqiwlOH1tJY,1
2,0ABk515kENDyATUdpCKVfW,2P5sC9cVZDToPxyomzF1UH,3
3,0ABk515kENDyATUdpCKVfW,4V8LLVI7PbaPR0K2TGSxFF,1
4,0ABk515kENDyATUdpCKVfW,53IIk03HWWJ2Z7QwQ9yFHh,2
...,...,...,...
71,5oifjQw72WO7Jut07fVWMy,6qgnBH6iDM91ipVXv28OMu,1
72,3Gm5F95VdRxW3mqCn8RPBJ,6qgnBH6iDM91ipVXv28OMu,1
73,4fpwOzxFRMVGfd197dKIdY,3A5tHz1SfngyOZM2gItYKu,1
74,1grN0519h2zYqpRtYbDZAl,7c0XG5cIJTrrAgEC3ULPiq,1


---

## Module 6 - Supervised Learning

In [107]:
# Collect one record per (seed artist, top track) and build the frame once at
# the end. Growing a frame column-by-column from dicts and transposing it
# forces every column to object dtype; building from records keeps the values'
# own types and still yields a 0..n-1 row index.
track_records = []
for artist in artists:
    artist_id = search_artist(artist)
    if artist_id is None:
        # search_artist returns None when nothing matches; skip instead of
        # requesting top tracks for a non-existent ID.
        print(f"Could not find Spotify data for {artist}")
        continue
    top_tracks = get_artist_top_tracks(artist_id)
    # For each track, record the track ID, name, and popularity
    for track in top_tracks:
        track_records.append({
            "artist_id": artist_id,
            "artist_name": artist,
            "track_id": track['id'],
            "track_name": track['name'],
            "popularity": track['popularity'],
        })
# Explicit columns keep the schema stable even when no tracks were collected,
# so later cells that reference 'artist_id' do not fail on a column-less frame.
artist_track_popularity_df = pd.DataFrame(
    track_records,
    columns=["artist_id", "artist_name", "track_id", "track_name", "popularity"],
)
artist_track_popularity_df

Unnamed: 0,artist_id,artist_name,track_id,track_name,popularity
0,0ABk515kENDyATUdpCKVfW,Westside Gunn,61b65LQ9rxhcMtDEusHPZL,EGYPT - Remix,67
1,0ABk515kENDyATUdpCKVfW,Westside Gunn,7EiUtdeoWcKqLi5ELZCjYf,LOST FOREVER (feat. Westside Gunn),66
2,0ABk515kENDyATUdpCKVfW,Westside Gunn,5sxRbu2Oi9lgmLO8taA3Rf,"327 (feat. Tyler, The Creator & Billie Essco)",64
3,0ABk515kENDyATUdpCKVfW,Westside Gunn,54blTc3AnAVPehzb7u3TDs,The Louvre,63
4,0ABk515kENDyATUdpCKVfW,Westside Gunn,5yD5MO0jgjIfKPXanAtRi6,EGYPT,62
5,0ABk515kENDyATUdpCKVfW,Westside Gunn,33YwdmM6I1hBOT0ZQHSQW6,Freestyle,62
6,0ABk515kENDyATUdpCKVfW,Westside Gunn,1zBPkwg2oEh760w20qbJ9E,Why I do em Like that (feat. Billie Essco),62
7,0ABk515kENDyATUdpCKVfW,Westside Gunn,6dA7M8QPaOH5nvsH1qf5DF,EINSTEIN KITCHEN,60
8,0ABk515kENDyATUdpCKVfW,Westside Gunn,1SJnJJmrqRyE8YSdhEtbPv,Brand New 911,59
9,0ABk515kENDyATUdpCKVfW,Westside Gunn,68xjJPxtRpCKxtrTtRYVF9,$500 Ounces (feat. Freddie Gibbs & Roc Marciano),59


In [108]:
# Attach the source artist's top tracks to every collaboration edge.
# NOTE(review): the join key is the artist only, so each edge is repeated once
# per top track of its source artist, whether or not the collaborator appears
# on that track. Edges whose source has no track rows are kept (left join).
popularity_collaboration_df = (
    collaboration_df
    .merge(
        artist_track_popularity_df,
        how='left',
        left_on='source',
        right_on='artist_id',
    )
    .drop(columns=['artist_id', 'weight'])
    .rename(columns={
        'source': 'source_artist_id',
        'target': 'collab_artist_id',
        'artist_name': 'source_artist',
        'track_id': 'source_track_id',
        'track_name': 'source_track_name',
        'popularity': 'source_popularity',
    })
    .drop_duplicates()
)
popularity_collaboration_df

Unnamed: 0,source_artist_id,collab_artist_id,source_artist,source_track_id,source_track_name,source_popularity
0,0ABk515kENDyATUdpCKVfW,4E2rKHVDssGJm2SCDOMMJB,Westside Gunn,61b65LQ9rxhcMtDEusHPZL,EGYPT - Remix,67
1,0ABk515kENDyATUdpCKVfW,4E2rKHVDssGJm2SCDOMMJB,Westside Gunn,7EiUtdeoWcKqLi5ELZCjYf,LOST FOREVER (feat. Westside Gunn),66
2,0ABk515kENDyATUdpCKVfW,4E2rKHVDssGJm2SCDOMMJB,Westside Gunn,5sxRbu2Oi9lgmLO8taA3Rf,"327 (feat. Tyler, The Creator & Billie Essco)",64
3,0ABk515kENDyATUdpCKVfW,4E2rKHVDssGJm2SCDOMMJB,Westside Gunn,54blTc3AnAVPehzb7u3TDs,The Louvre,63
4,0ABk515kENDyATUdpCKVfW,4E2rKHVDssGJm2SCDOMMJB,Westside Gunn,5yD5MO0jgjIfKPXanAtRi6,EGYPT,62
...,...,...,...,...,...,...
512,1grN0519h2zYqpRtYbDZAl,137W8MRPWKqSmrBGDBFSop,Larry June,1qUyn3s2q7ltOZ5HntTT8K,Cashed Out - feat. Larry June,62
513,1grN0519h2zYqpRtYbDZAl,137W8MRPWKqSmrBGDBFSop,Larry June,0LH7ijaQi0ybY6Oe4SCLo1,Watering My Plants,62
514,1grN0519h2zYqpRtYbDZAl,137W8MRPWKqSmrBGDBFSop,Larry June,499OiYFGkuTO74jb9tKv8Z,Bad Choices,59
515,1grN0519h2zYqpRtYbDZAl,137W8MRPWKqSmrBGDBFSop,Larry June,0iZNw4AkFwcglAsvaTLfmK,Ocean Sounds,61


In [114]:
# This table shows, for each (source artist, collaborator) pair, the mean
# popularity of the source-artist tracks joined to that pair.
popularity_collaboration_df = (
    popularity_collaboration_df
    .groupby(['source_artist_id', 'collab_artist_id'])
    .agg({'source_popularity': 'mean'})
    .reset_index()
    .drop_duplicates()
)
popularity_collaboration_df

Unnamed: 0,source_artist_id,collab_artist_id,source_popularity
0,0ABk515kENDyATUdpCKVfW,0Y4inQK6OespitzD6ijMwb,62.4
1,0ABk515kENDyATUdpCKVfW,0Y5tJX1MQlPlqiwlOH1tJY,62.4
2,0ABk515kENDyATUdpCKVfW,0eVyjRhzZKke2KFYTcDkeu,62.4
3,0ABk515kENDyATUdpCKVfW,2Fsw6Hh9wWybcZPZZEGYPj,62.4
4,0ABk515kENDyATUdpCKVfW,2P5sC9cVZDToPxyomzF1UH,62.4
5,0ABk515kENDyATUdpCKVfW,35fcckhFq2cF2u7hIG0fPv,62.4
6,0ABk515kENDyATUdpCKVfW,4E2rKHVDssGJm2SCDOMMJB,62.4
7,0ABk515kENDyATUdpCKVfW,4V8LLVI7PbaPR0K2TGSxFF,62.4
8,0ABk515kENDyATUdpCKVfW,53IIk03HWWJ2Z7QwQ9yFHh,62.4
9,0ABk515kENDyATUdpCKVfW,5Matrg5du62bXwer29cU5T,62.4


In [118]:
# Fit a logistic-regression classifier whose single feature is the mean source
# popularity and whose class label is the collaborator's artist ID, i.e. the
# model predicts *which collaborator*, not a popularity value.
X = popularity_collaboration_df[['source_popularity']]
y = popularity_collaboration_df['collab_artist_id']
# The solver previously stopped at the default cap of 100 iterations without
# converging (ConvergenceWarning), so give it more room.
# NOTE(review): if the warning persists, scale the feature as the warning suggests.
model = LogisticRegression(max_iter=1000)
# fit() returns the estimator itself, so `pred` is the fitted model (same
# object as `model`), not an array of predictions.
pred = model.fit(X, y)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [121]:
def predict_collab_popularity(artist_id_1, artist_id_2):
    """Report whether the fitted model pairs ``artist_id_1`` with ``artist_id_2``.

    Reads the module-level ``popularity_collaboration_df`` and ``model``, and
    resolves both artist names through ``get_artist_name``.

    Args:
        artist_id_1: Artist ID looked up in the ``source_artist_id`` column;
            the mean of its ``source_popularity`` rows is the model input.
        artist_id_2: Artist ID compared against the model's predicted label.

    Returns:
        A message string: the mean source popularity when the predicted
        collaborator equals ``artist_id_2``, otherwise a "does not predict"
        message naming both artists.

    Raises:
        ValueError: If ``artist_id_1`` has no rows in the dataset (the mean
            is NaN).
    """
    first_name = get_artist_name(artist_id_1)
    second_name = get_artist_name(artist_id_2)
    print(f"Predicting collaboration between {first_name} and {second_name}...")

    # Mean popularity over every row where the first artist is the source
    is_source = popularity_collaboration_df['source_artist_id'] == artist_id_1
    popularity_values = popularity_collaboration_df.loc[is_source, 'source_popularity']
    mean_popularity = popularity_values.astype(float).mean()

    # An empty selection averages to NaN, meaning the artist is unknown here
    if np.isnan(mean_popularity):
        raise ValueError(f"Artist ID {artist_id_1} not found in the dataset.")

    # One-row frame with the same column name the model was fitted on
    features = pd.DataFrame({'source_popularity': [mean_popularity]})

    # predict() yields one label per input row, so this array has one element
    predicted_ids = model.predict(features)
    print(f"Predicted collaboration artist IDs: {predicted_ids}")

    if artist_id_2 not in predicted_ids:
        return f"The model does not predict a collaboration between {first_name} and {second_name}."
    return f"The predicted collaboration popularity score is {mean_popularity}."

# Example usage: swap in any two Spotify artist IDs. The first must appear as a
# source artist in popularity_collaboration_df, otherwise a ValueError is raised.
artist_id_1, artist_id_2 = '0Y4inQK6OespitzD6ijMwb', '0Y5tJX1MQlPlqiwlOH1tJY'
predict_collab_popularity(artist_id_1, artist_id_2)

Predicting collaboration between Freddie Gibbs and Travis Scott...
Predicted collaboration artist IDs: ['67gqUXxHedeUGDTxwBzdjS']


'The model does not predict a collaboration between Freddie Gibbs and Travis Scott.'