In [44]:
import pandas as pd
import networkx as nx
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [30]:
# Read the semicolon-delimited Spotify export, then keep a reproducible
# 1000-row random sample (fixed seed) with a fresh positional index.
spotify_data = pd.read_csv("Spotify_Dataset_V3.csv", sep=";")
df = (
    spotify_data
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)


In [50]:
# Restrict the sample to songs whose Continent value is "Latin-America".
df_latin = df[df["Continent"] == "Latin-America"].copy()

# Undirected collaboration graph: nodes are artist names, and an edge's
# "weight" counts how many songs in df_latin credit both artists.
G = nx.Graph()

# Only the Artists column is needed here. Rows with a missing value are skipped,
# because calling .split on NaN would raise.
for artist_field in df_latin["Artists"].dropna():
    # De-duplicate names (order-preserving) so an artist repeated within one
    # credit list cannot create a self-loop, which would inflate their degree.
    artists = list(dict.fromkeys(name.strip() for name in artist_field.split(",")))

    # Every unordered pair of artists credited on the same song shares an edge;
    # repeat collaborations increment the existing edge's weight.
    for i, first in enumerate(artists):
        for second in artists[i + 1:]:
            if G.has_edge(first, second):
                G[first][second]["weight"] += 1
            else:
                G.add_edge(first, second, weight=1)



In [47]:
# Degree centrality for every artist node in the collaboration graph.
degree = nx.degree_centrality(G)

# Eigenvector centrality with the iteration cap raised to 500. weight=None is
# the networkx default, spelled out here to make clear that the "weight"
# attribute stored on the edges is not used by this computation.
eigen = nx.eigenvector_centrality(G, weight=None, max_iter=500)


In [54]:
# Build one feature record per song whose credited artists all appear in both
# centrality lookups.
rows = []

for song_idx, song in df_latin.iterrows():
    credited = [name.strip() for name in song["Artists"].split(",")]

    # Skip the song when any credited artist has no centrality value.
    if any(name not in degree or name not in eigen for name in credited):
        continue

    rows.append(
        dict(
            Index=song_idx,  # label of the song's row in df_latin
            Rank=song["Rank"],
            AvgDegree=np.mean([degree[name] for name in credited]),
            AvgEigen=np.mean([eigen[name] for name in credited]),
            NumArtists=len(credited),
        )
    )

# NOTE: this rebinds `df`, which previously held the 1000-row sample. The cell
# that builds df_latin from df["Continent"] cannot be re-run after this one
# without first re-running the sampling cell.
df = pd.DataFrame(rows)


In [56]:

# Binary target: 1 when the song's Rank is 50 or lower, otherwise 0.
df["Charts"] = df["Rank"].le(50).astype(int)

In [71]:
# Feature matrix (network features per song) and the binary Charts target.
feature_columns = ["AvgDegree", "AvgEigen", "NumArtists"]
x = df[feature_columns]
y = df["Charts"]

# Standardise the features before fitting the classifier.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

model = LogisticRegression(max_iter=200).fit(x_scaled, y)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (Charts == 1 when both labels are present). The scores are
# computed on the same rows the model was fitted on.
df["ProbChartsAgain"] = model.predict_proba(x_scaled)[:, 1]


In [72]:
# Song probabilities are now assigned to their respective artists
# Attach each modelled song's probability to its Artists string, then expand
# the comma-separated credit list into one row per (song, artist).
artist_df = (
    df_latin.loc[df["Index"], ["Artists"]]
    .assign(Prob=df["ProbChartsAgain"].values)  # positional: same row order as df
    .assign(Artist=lambda frame: frame["Artists"].str.split(","))
    .explode("Artist")
    .assign(Artist=lambda frame: frame["Artist"].str.strip())
)

# An artist's score is the mean probability over their songs, highest first.
prob_by_artist = artist_df.groupby("Artist")["Prob"].mean()
artist_probs = prob_by_artist.sort_values(ascending=False)

In [73]:
# Take the ten highest-scoring artists and label the two columns for display.
top10 = (
    artist_probs.head(10)
    .rename_axis("Artist")
    .reset_index(name="Probability of Charting Again")
)
print("\nTop 10 Artists by Probability of Charting in the Future:\n")
print(top10.to_string(index=False))



Top 10 Artists by Probability of Charting in the Future:

         Artist  Probability of Charting Again
        Cardi B                       0.340751
       Aventura                       0.332113
         Jhayco                       0.332113
  Bomba Estéreo                       0.332113
      Tony Dize                       0.332113
      Bad Bunny                       0.330353
     Liam Payne                       0.323262
Black Eyed Peas                       0.323262
         Camilo                       0.308745
       J Balvin                       0.305952
