In [138]:
import pandas as pd
import ast

In [139]:
# Read the clustered dataset from CSV into the working frame `df`.
df = pd.read_csv(filepath_or_buffer="dataset_clusters.csv")

In [140]:
# Step 1: Expand artists column to handle collabs.
# Only values that are still raw strings are split. The previous
# `df['artists'].str.split(', ')` overwrote the column with lists, so running
# this cell a second time turned every list into NaN and silently emptied the
# per-artist statistics computed below. Guarding on `str` keeps the cell
# idempotent: NaN and already-split lists pass through unchanged.
# NOTE(review): splitting on ', ' also breaks up any artist name that itself
# contains ', ' — confirm the delimiter convention of the dataset.
df['artists'] = df['artists'].apply(
    lambda names: names.split(', ') if isinstance(names, str) else names
)
# One row per (song, artist) pair.
df_exploded = df.explode('artists')

# Step 2: Per-artist statistics over the exploded rows:
#   avg_popularity -> mean of popularity_index
#   unique_tracks  -> number of distinct spotify_id values
artist_popularity = (
    df_exploded
    .groupby('artists')
    .agg(
        avg_popularity=pd.NamedAgg(column='popularity_index', aggfunc='mean'),
        unique_tracks=pd.NamedAgg(column='spotify_id', aggfunc='nunique'),
    )
    # Name the group key 'artist' before moving it back into a column.
    .rename_axis('artist')
    .reset_index()
)

# Step 3: Per-artist weight n / (n + alpha), where n is the artist's
# unique_tracks count. The weight is stored in the 'penalty' column.
alpha = 20  # tunable: larger values down-weight artists with few tracks more strongly
artist_popularity['penalty'] = artist_popularity['unique_tracks'].div(
    artist_popularity['unique_tracks'].add(alpha)
)

# Step 4: Scale each artist's mean popularity by its penalty weight.
artist_popularity['adjusted_popularity'] = artist_popularity['avg_popularity'].mul(
    artist_popularity['penalty']
)

# Step 5: Normalize adjusted_popularity to a 0–1 scale (min-max scaling).
min_score = artist_popularity['adjusted_popularity'].min()
max_score = artist_popularity['adjusted_popularity'].max()
score_range = max_score - min_score
# Degenerate case: when every artist has the same adjusted score (for example
# a single-artist dataset) the range is 0 and the plain formula computes 0/0,
# filling the whole column with NaN. Dividing by 1 instead maps every artist
# to 0.0. A NaN range (empty or all-NaN input) is left as-is, so the result
# stays NaN exactly as before.
divisor = score_range if score_range != 0 else 1
artist_popularity['normalized_popularity'] = (
    (artist_popularity['adjusted_popularity'] - min_score) / divisor
)

# Step 6: Tier cut-offs taken from the normalized score distribution.
# Note: despite its name, `q80` holds the 0.85 quantile; `q50` is the median.
quantiles = artist_popularity['normalized_popularity'].quantile(q=[0.85, 0.5])
q80 = quantiles.loc[0.85]
q50 = quantiles.loc[0.5]

def classify_tier(score):
    """Map a normalized popularity score to a lineup tier label.

    The score is compared against the module-level cut-offs ``q80`` and
    ``q50``, checked from the highest tier down.

    Returns:
        "Headliner" when ``score >= q80``; otherwise "Co-Headliner" when
        ``score >= q50``; otherwise "Support". A score that fails both
        comparisons (NaN included) therefore ends up as "Support".
    """
    if score >= q80:
        return "Headliner"
    if score >= q50:
        return "Co-Headliner"
    return "Support"

# Label every artist by running classify_tier over its normalized score.
artist_popularity['lineup_tier'] = (
    artist_popularity['normalized_popularity'].apply(func=classify_tier)
)

# Step 7 (optional): attach each artist's scores and tier to the exploded
# song-artist rows. The left join keeps every row of df_exploded; the key is
# 'artists' on the left and 'artist' on the right.
df_final = pd.merge(
    df_exploded,
    artist_popularity.loc[
        :,
        ['artist', 'avg_popularity', 'adjusted_popularity', 'normalized_popularity', 'lineup_tier'],
    ],
    how='left',
    left_on='artists',
    right_on='artist',
)

# Optional: persist the merged song-artist table (row index omitted).
df_final.to_csv(path_or_buf="us_lineup_tiers.csv", index=False)



In [141]:


# Optional: write the per-artist classification and the merged song-artist
# table to CSV (row index omitted in both files).
for frame, destination in (
    (artist_popularity, "artist_lineup_classification.csv"),
    (df_final, "clustered_data_with_lineup_tiers.csv"),
):
    frame.to_csv(destination, index=False)