In [20]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# Render every column of wide frames instead of eliding the middle ones
pd.options.display.max_columns = None

In [22]:
# Read TikTok Data
# The path is relative, so it is resolved against the kernel's working directory.
# The file name suggests this is a TikTok/Spotify merge — TODO confirm provenance.
data = pd.read_csv('TikTokSpotifyMerged.csv')

In [23]:
# Drop unnecessary columns (identifiers, labels and metadata) in a single call.
# Each label is listed exactly once: the original list named "target" twice and
# chained two drops, copying the frame twice for the same result.
data = data.drop(
    columns=[
        'track_id', 'target', 'popularity', 'sections', 'chorus_hit', 'time_signature',
        'tiktok', 'artist', 'spotify', 'track', 'sm_target',
    ]
)
# Summarise the remaining columns, their dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43808 entries, 0 to 43807
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   duration_ms        43808 non-null  float64
 1   danceability       43808 non-null  float64
 2   energy             43808 non-null  float64
 3   key                43808 non-null  object 
 4   loudness           43808 non-null  float64
 5   mode               43808 non-null  object 
 6   speechiness        43808 non-null  float64
 7   acousticness       43808 non-null  float64
 8   instrumentalness   43808 non-null  float64
 9   liveness           43808 non-null  float64
 10  valence            43808 non-null  float64
 11  tempo              43808 non-null  float64
 12  era                43808 non-null  object 
 13  main_parent_genre  43808 non-null  object 
dtypes: float64(10), object(4)
memory usage: 4.7+ MB


In [24]:
# One-hot encode the object columns (key, mode, era, main_parent_genre);
# the numeric columns pass through unchanged.
# NOTE(review): the resulting column set presumably has to match what the
# pickled model was trained on — confirm before changing the encoding.
data = pd.get_dummies(data)

In [25]:
# Preview the encoded frame; the rendered output is truncated to the first and last rows
data

Unnamed: 0,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key_A,key_A# / Bb,key_B,key_C,key_C# / Db,key_D,key_D# / Eb,key_E,key_F,key_F# / Gb,key_G,key_G# / Ab,mode_major,mode_minor,era_00s,era_10s,era_20s,era_60s,era_70s,era_80s,era_90s,main_parent_genre_Blues and Jazz,main_parent_genre_Classical and Opera,main_parent_genre_Country and Folk,main_parent_genre_Electronic Music and Dance,main_parent_genre_Other,main_parent_genre_Pop,main_parent_genre_Rap and Hip Hop,main_parent_genre_Reggae and Ska,main_parent_genre_Rock,main_parent_genre_World Music
0,173533.0,0.417,0.620,-7.727,0.0403,0.4900,0.000000,0.0779,0.845,185.655,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,213613.0,0.498,0.505,-12.475,0.0337,0.0180,0.107000,0.1760,0.797,101.801,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,223960.0,0.657,0.649,-13.392,0.0380,0.8460,0.000004,0.1190,0.908,115.940,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3,157907.0,0.590,0.545,-12.058,0.1040,0.7060,0.024600,0.0610,0.967,105.592,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,245600.0,0.515,0.765,-3.515,0.1240,0.8570,0.000872,0.2130,0.906,114.617,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43803,173123.0,0.639,0.546,-5.382,0.0407,0.0837,0.000000,0.1220,0.149,122.179,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
43804,209426.0,0.595,0.689,-6.107,0.1480,0.2540,0.000005,0.1290,0.213,168.112,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
43805,202440.0,0.591,0.818,-3.532,0.0730,0.1720,0.000000,0.1260,0.574,108.107,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
43806,160346.0,0.788,0.473,-12.744,0.0328,0.4890,0.254000,0.1090,0.810,105.429,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0


Loading the Model: You'll need to load the pre-trained prediction model. If it's pickled, you can load it using pickle.

In [26]:
# Load the saved random forest classifier from the file.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
filename = 'randomforest_model.pkl'
# The context manager closes the file handle even if unpickling fails
# (the bare open(...) call left it to the garbage collector).
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Use the loaded model for predictions on the encoded feature frame
predictions = model.predict(data)

Creating a Function to Adjust Features: This function will receive a feature value and a percentage change and return the adjusted value.

In [27]:
def adjust_feature(value, percentage):
    """Return ``value`` shifted by ``percentage`` percent of itself.

    A positive percentage adds that share of ``value`` and a negative one
    removes it. The shift is proportional to ``value``: zero stays zero, and a
    negative value moves further from zero for a positive percentage.
    """
    delta = value * percentage / 100
    return value + delta

Testing Feature Adjustments: For each feature of a song, you'll want to test how increasing and decreasing it affects the model's prediction.

In [28]:
def test_adjustments(song_features, percentage_range):
    optimal_changes = {}
    for feature, value in song_features.items():
        max_prob = model.predict_proba([song_features])[0][1]  # probability of being a hit
        optimal_change = 0  # no change

        for percentage in percentage_range:
            # Increase feature
            adjusted_features = song_features.copy()
            adjusted_features[feature] = adjust_feature(value, percentage)
            prob = model.predict_proba([adjusted_features])[0][1]
            if prob > max_prob:
                max_prob = prob
                optimal_change = percentage

            # Decrease feature
            adjusted_features = song_features.copy()
            adjusted_features[feature] = adjust_feature(value, -percentage)
            prob = model.predict_proba([adjusted_features])[0][1]
            if prob > max_prob:
                max_prob = prob
                optimal_change = -percentage

        optimal_changes[feature] = optimal_change

    return optimal_changes


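Note: test_adjustments already evaluates both +p and −p for every p in percentage_range, so a positive range such as range(5, 51, 5) is enough to test each feature with decreases and increases in 5% increments, up to 50%. A symmetric range like range(-50, 51, 5) covers the same adjustments and adds nothing new.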

Providing Recommendations: Use the test_adjustments function to provide recommendations on how to improve a song.

In [30]:
# The model was applied to every column of `data`, so a song must supply all of
# them. Start from a real row and override the features being experimented with.
example_overrides = {'acousticness': 0.5, 'danceability': 0.7, 'energy': 0.6}  # example song features
song_features = {**data.iloc[0].to_dict(), **example_overrides}

# test_adjustments tries +p and -p for each p, so only positive steps are listed
recommendations = test_adjustments(song_features, range(5, 51, 5))

# Report only the features this example set explicitly
for feature in example_overrides:
    change = recommendations[feature]
    if change > 0:
        print(f"Increase {feature} by {change}% for a higher probability of being a hit.")
    elif change < 0:
        print(f"Decrease {feature} by {-change}% for a higher probability of being a hit.")
    else:
        print(f"{feature} is optimal as it is.")




TypeError: float() argument must be a string or a number, not 'dict'

This will provide specific, actionable recommendations on how to adjust each song feature for a higher probability of being a hit, according to your prediction model. However, keep in mind that the model's performance and recommendations will only be as good as the data it was trained on. Always use a variety of data sources and constantly update your model for the best results.