In [1]:
from joblib import load

# Restore the model that was saved to disk earlier.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
model_path = 'trained_model.joblib'
model = load(model_path)

In [2]:
import numpy as np

def optimize_feature_for_song(model, song_df, song_index=0, delta=0.2,
                              n_candidates=10, bounds=(0.0, 1.0)):
    """
    Finds the single feature change that maximizes a song's predicted popularity.

    Every feature of the selected song is varied, one at a time, over an evenly
    spaced grid of `n_candidates` values within `delta` of its current value.
    All perturbed copies are scored with one batched `model.predict` call, and
    the feature/value pair with the largest predicted gain is returned.

    Parameters:
    - model: Trained regression model with a `predict` method that accepts a
      2D array of shape (n_rows, n_features).
    - song_df: DataFrame, where each row represents a song's features. Expects a column 'popularity'.
      Every other column is treated as a numeric model feature.
    - song_index: Positional index of the song in the DataFrame to optimize.
    - delta: Maximum allowed change for any feature (default: 0.2).
    - n_candidates: Number of evenly spaced values tried per feature (default: 10).
    - bounds: (low, high) range used to clip the candidates of features whose
      current value already lies inside that range (default: (0.0, 1.0)).
      Features whose current value is outside the range (e.g. a year or a
      tempo) are searched in [value - delta, value + delta] without clipping.
      Pass None to disable clipping entirely.

    Returns:
    - best_feature: The name of the feature to change (from song_df columns).
    - optimal_value: The new value for the selected feature.
    - predicted_increase: The expected increase in popularity.

    Raises:
    - ValueError: If `n_candidates` is smaller than 1.
    - KeyError: If `song_df` has no 'popularity' column.
    """
    if n_candidates < 1:
        raise ValueError("n_candidates must be at least 1")

    # Drop the target once and reuse the result for both names and values.
    features_df = song_df.drop('popularity', axis=1)
    feature_names = features_df.columns.tolist()

    # Work on a float copy: bool/int columns are handled uniformly, and writing
    # a fractional candidate into the vector can no longer be truncated to an
    # integer (which happened when every feature column had an integer dtype).
    original_features = np.asarray(features_df.iloc[song_index].values, dtype=float)
    n_features = original_features.shape[0]

    # One row of candidate values per feature.
    candidate_grid = np.empty((n_features, n_candidates), dtype=float)
    for j, original_value in enumerate(original_features):
        low = original_value - delta
        high = original_value + delta
        # Clip only features that live inside the bounded range. Clipping a
        # value such as year=2013 to [0, 1] would stretch the search window
        # far beyond `delta`.
        if bounds is not None and bounds[0] <= original_value <= bounds[1]:
            low = max(bounds[0], low)
            high = min(bounds[1], high)
        candidate_grid[j] = np.linspace(low, high, n_candidates)

    # Row 0 is the unmodified song; the remaining rows are perturbed copies,
    # grouped by feature, each differing from the original in one column.
    batch = np.tile(original_features, (1 + n_features * n_candidates, 1))
    for j in range(n_features):
        start = 1 + j * n_candidates
        batch[start:start + n_candidates, j] = candidate_grid[j]

    # A single predict call replaces one call per candidate.
    predictions = np.asarray(model.predict(batch)).ravel()
    increases = predictions[1:] - predictions[0]

    best_feature = None
    optimal_value = None
    max_increase = -np.inf

    # Strict '>' keeps the first best candidate on ties.
    for j in range(n_features):
        for k in range(n_candidates):
            increase = increases[j * n_candidates + k]
            if increase > max_increase:
                max_increase = increase
                best_feature = feature_names[j]
                optimal_value = candidate_grid[j, k]

    return best_feature, optimal_value, max_increase

In [14]:
import pandas as pd

# Read the Spotify dataset from disk
df_encoded = pd.read_csv('data/spotify_data_updated.csv')

# Pick the song to analyse: one random row by default.
# To analyse a specific track instead, filter on its id:
#song = df_encoded[df_encoded['track_id'] == "6k5DKQMC96daK6fSvAxdQv"]
# The identifier/label columns are removed before the row is handed to the optimizer.
song = df_encoded.sample(n=1).drop(
    columns=["Unnamed: 0", "artist_name", "track_name", "track_id", "genre", "popularity_category"]
)
print(song)

# Search for the single feature change with the largest predicted popularity gain
best_feature, optimal_value, predicted_increase = optimize_feature_for_song(model=model, song_df=song)

# One value per line: feature name, suggested value, predicted gain
print(best_feature, optimal_value, predicted_increase, sep="\n")

       popularity  year  danceability  energy  key  loudness  mode  \
84313          15  2013         0.354   0.945    2    -4.456     0   

       speechiness  acousticness  instrumentalness  liveness  valence  \
84313       0.0712        0.0139          0.000003    0.0738    0.127   

         tempo  duration_ms  time_signature  cluster_1  cluster_2  cluster_3  
84313  149.883       294800               4      False      False      False  




energy
0.7449999999999999
2.3809111803377974




In [None]:
import pandas as pd

# Batch job: for every song in the dataset, find the single-feature change that
# most increases the model's predicted popularity and store it next to the song.
# The job is resumable: rows whose 'best_feature' is already filled are skipped,
# so re-running the cell after an interruption continues where it stopped.
# Relies on `model` and `optimize_feature_for_song` defined in earlier cells.
data_path = 'data/spotify_data_updated.csv'
result_columns = ['best_feature', 'optimal_value', 'predicted_increase']
# Columns that are not passed to the optimizer: identifiers/labels plus the result columns.
non_feature_columns = ["Unnamed: 0", "artist_name", "track_name", "track_id", "genre", "popularity_category"] + result_columns
# Number of processed rows between two intermediate saves. Rewriting the whole
# CSV after every single row makes the job quadratic in the number of songs.
checkpoint_every = 500

# Load the data
df = pd.read_csv(data_path)

# Create the result columns on the first run
for column in result_columns:
    if column not in df.columns:
        df[column] = None

# When resuming from a CSV whose 'best_feature' column is still completely
# empty, pandas reads it back as float64 (all NaN). Cast it to object so that
# feature names (strings) can be stored without a dtype conflict.
df['best_feature'] = df['best_feature'].astype(object)

# Only rows without a stored result (NaN or empty string) need to be processed
pending = df.index[df['best_feature'].isna() | (df['best_feature'] == '')]

for processed, index in enumerate(pending, start=1):
    # Single-row DataFrame holding 'popularity' plus the song's features
    song = df.loc[[index]].drop(columns=non_feature_columns)

    # Optimize the feature for the song
    best_feature, optimal_value, predicted_increase = optimize_feature_for_song(
        model=model,
        song_df=song
    )

    # Update the DataFrame with the results
    df.at[index, 'best_feature'] = best_feature
    df.at[index, 'optimal_value'] = optimal_value
    df.at[index, 'predicted_increase'] = predicted_increase

    # Periodic checkpoint so an interrupted run loses at most `checkpoint_every` rows
    if processed % checkpoint_every == 0:
        df.to_csv(data_path, index=False)

# Final save (also persists the newly created columns when nothing was pending)
df.to_csv(data_path, index=False)

# Reload from disk so `df` reflects exactly what was persisted
df = pd.read_csv(data_path)
