In [31]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [32]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [33]:
# Read the cleaned track data; note the file lives in the 'Spotify Data' folder
data_path = 'Spotify Data/data-clean.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

In [34]:
# Derive the track length in seconds from the millisecond duration column
data = data.assign(track_seconds=data['duration_ms'] / 1000)

In [35]:
# Preview the first five rows, including the derived track_seconds column
data.head(n=5)

Unnamed: 0,track_id,time_signature,chorus_hit,sections,target,popularity,sm_target,tiktok,spotify,track,artist,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,era,main_parent_genre,track_seconds
0,1dtKN6wwlolkM8XZy2y9C1,3.0,32.94975,9.0,1.0,,0.0,0,1,Jealous Kind Of Fella,Garland Green,173533.0,0.417,0.62,D# / Eb,-7.727,major,0.0403,0.49,0.0,0.0779,0.845,185.655,60s,Blues and Jazz,173.533
1,5hjsmSnUefdUqzsDogisiX,4.0,48.8251,10.0,0.0,,0.0,0,1,Initials B.B.,Serge Gainsbourg,213613.0,0.498,0.505,D# / Eb,-12.475,major,0.0337,0.018,0.107,0.176,0.797,101.801,60s,Rock,213.613
2,6uk8tI6pwxxdVTNlNOJeJh,4.0,37.22663,12.0,0.0,,0.0,0,1,Melody Twist,Lord Melody,223960.0,0.657,0.649,F,-13.392,major,0.038,0.846,4e-06,0.119,0.908,115.94,60s,Other,223.96
3,7aNjMJ05FvUXACPWZ7yJmv,4.0,24.75484,8.0,0.0,,0.0,0,1,Mi Bomba Sonó,Celia Cruz,157907.0,0.59,0.545,G,-12.058,minor,0.104,0.706,0.0246,0.061,0.967,105.592,60s,World Music,157.907
4,1rQ0clvgkzWr001POOPJWx,4.0,21.79874,14.0,0.0,,0.0,0,1,Uravu Solla,P. Susheela,245600.0,0.515,0.765,B,-3.515,minor,0.124,0.857,0.000872,0.213,0.906,114.617,60s,Other,245.6


In [36]:
# Remove the identifier, target/popularity and text columns so that only the
# numeric feature columns remain, then summarise what is left.
columns_to_drop = [
    "track_id", "era", "target", "sm_target", "popularity",
    "tiktok", "spotify", "track", "artist", "duration_ms",
    "key", "mode", "main_parent_genre",
]
data = data.drop(columns=columns_to_drop)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40560 entries, 0 to 40559
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time_signature    40560 non-null  float64
 1   chorus_hit        40560 non-null  float64
 2   sections          40560 non-null  float64
 3   danceability      40560 non-null  float64
 4   energy            40560 non-null  float64
 5   loudness          40560 non-null  float64
 6   speechiness       40560 non-null  float64
 7   acousticness      40560 non-null  float64
 8   instrumentalness  40560 non-null  float64
 9   liveness          40560 non-null  float64
 10  valence           40560 non-null  float64
 11  tempo             40560 non-null  float64
 12  track_seconds     40560 non-null  float64
dtypes: float64(13)
memory usage: 4.0 MB


In [37]:
#data = pd.get_dummies(data)

In [38]:
# Display the frame that remains after the column drop (pandas truncates the middle rows)
data

Unnamed: 0,time_signature,chorus_hit,sections,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_seconds
0,3.0,32.94975,9.0,0.417,0.620,-7.727,0.0403,0.4900,0.000000,0.0779,0.8450,185.655,173.533
1,4.0,48.82510,10.0,0.498,0.505,-12.475,0.0337,0.0180,0.107000,0.1760,0.7970,101.801,213.613
2,4.0,37.22663,12.0,0.657,0.649,-13.392,0.0380,0.8460,0.000004,0.1190,0.9080,115.940,223.960
3,4.0,24.75484,8.0,0.590,0.545,-12.058,0.1040,0.7060,0.024600,0.0610,0.9670,105.592,157.907
4,4.0,21.79874,14.0,0.515,0.765,-3.515,0.1240,0.8570,0.000872,0.2130,0.9060,114.617,245.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40555,4.0,24.30824,7.0,0.172,0.358,-14.430,0.0342,0.8860,0.966000,0.3140,0.0361,72.272,150.857
40556,4.0,32.53856,8.0,0.910,0.366,-9.954,0.0941,0.0996,0.000000,0.2610,0.7400,119.985,152.000
40557,4.0,20.73371,7.0,0.719,0.804,-4.581,0.0355,0.0132,0.000003,0.1390,0.6050,119.999,227.760
40558,4.0,21.65301,14.0,0.600,0.177,-16.070,0.0561,0.9890,0.868000,0.1490,0.5600,120.030,213.387


Loading the Model: You'll need to load the pre-trained prediction model. If it's pickled, you can load it using pickle.

In [39]:
#X_test = [[danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature, chorus_hit, sections]]

#Create a df with the feature names and X Test as first row
#feature_names = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness','valence', 'tempo', 'time_signature', 'chorus_hit', 'sections']
#X_test_df = pd.DataFrame(X_test, columns=feature_names)
    
#load pickle and test if it works
#xgb_model_loaded = pickle.load(open('xgb_model.pkl', 'rb'))
#print(xgb_model_loaded.predict(X_test_df))
# Make predictions using the loaded model
#prediction = xgb_model_loaded.predict(X_test_df)

In [40]:
# Load the pre-trained classifier from disk. The context manager closes the
# file handle as soon as the model has been read.
# NOTE: despite the file name, the traceback further down reports the object
# as a GradientBoostingClassifier (not an XGBoost or random-forest model).
# NOTE(review): the links printed under this cell point at scikit-learn's
# model-persistence limitations, which suggests the pickle was written by a
# different scikit-learn version than the one installed - TODO confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
filename = 'xgb_model.pkl'
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Use the loaded model for predictions
#predictions = model.predict(data)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [41]:
# Tally the missing values in each column
nan_counts = data.isnull().sum()

# Show the per-column totals
print(nan_counts)

time_signature      0
chorus_hit          0
sections            0
danceability        0
energy              0
loudness            0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
track_seconds       0
dtype: int64


In [42]:
# Display the feature frame again; nothing has modified it since the previous display
data

Unnamed: 0,time_signature,chorus_hit,sections,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_seconds
0,3.0,32.94975,9.0,0.417,0.620,-7.727,0.0403,0.4900,0.000000,0.0779,0.8450,185.655,173.533
1,4.0,48.82510,10.0,0.498,0.505,-12.475,0.0337,0.0180,0.107000,0.1760,0.7970,101.801,213.613
2,4.0,37.22663,12.0,0.657,0.649,-13.392,0.0380,0.8460,0.000004,0.1190,0.9080,115.940,223.960
3,4.0,24.75484,8.0,0.590,0.545,-12.058,0.1040,0.7060,0.024600,0.0610,0.9670,105.592,157.907
4,4.0,21.79874,14.0,0.515,0.765,-3.515,0.1240,0.8570,0.000872,0.2130,0.9060,114.617,245.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40555,4.0,24.30824,7.0,0.172,0.358,-14.430,0.0342,0.8860,0.966000,0.3140,0.0361,72.272,150.857
40556,4.0,32.53856,8.0,0.910,0.366,-9.954,0.0941,0.0996,0.000000,0.2610,0.7400,119.985,152.000
40557,4.0,20.73371,7.0,0.719,0.804,-4.581,0.0355,0.0132,0.000003,0.1390,0.6050,119.999,227.760
40558,4.0,21.65301,14.0,0.600,0.177,-16.070,0.0561,0.9890,0.868000,0.1490,0.5600,120.030,213.387


Creating a Function to Adjust Features: This function will receive a feature value and a percentage change and return the adjusted value.

In [43]:
def adjust_feature(value, percentage):
    """Return ``value`` moved up or down by ``percentage`` percent of its magnitude.

    Parameters
    ----------
    value : float
        Current feature value.
    percentage : float
        Signed change in percent. Positive values always raise ``value`` and
        negative values always lower it.

    Returns
    -------
    float
        The adjusted value. A ``value`` of 0 stays 0 for any percentage.
    """
    # Scale by the magnitude, not the signed value: with a negative feature
    # (e.g. loudness = -7.727) the signed form would turn a "+10%" request
    # into a *smaller* number, contradicting the "Increase ..." message that
    # is later printed for positive percentages.
    return value + (abs(value) * percentage / 100)

Testing Feature Adjustments: For each feature of a song, you'll want to test how increasing and decreasing it affects the model's prediction.

In [44]:
def test_adjustments(song_features, percentage_range):
    optimal_changes = {}
    for feature, value in song_features.items():
        input_data = np.array(list(song_features.values()))
        max_prob = model.predict_proba([input_data])[0][1]  # probability of being a hit
        optimal_change = 0  # no change

        for percentage in percentage_range:
            # Increase feature
            adjusted_features = song_features.copy()
            adjusted_features[feature] = adjust_feature(value, percentage)
            prob = model.predict_proba([adjusted_features])[0][1]
            if prob > max_prob:
                max_prob = prob
                optimal_change = percentage

            # Decrease feature
            adjusted_features = song_features.copy()
            adjusted_features[feature] = adjust_feature(value, -percentage)
            prob = model.predict_proba([adjusted_features])[0][1]
            if prob > max_prob:
                max_prob = prob
                optimal_change = -percentage

        optimal_changes[feature] = optimal_change

    return optimal_changes


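Note: The percentage_range parameter should contain positive step sizes only, for example range(5, 51, 5). For each step the function already tests both an increase and a decrease of that size, so this range tries every feature at ±5%, ±10%, … up to ±50%. A signed range such as range(-50, 51, 5) would test every magnitude twice and include a no-op 0% step.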

Providing Recommendations: Use the test_adjustments function to provide recommendations on how to improve a song.

In [46]:
# Example song. The keys follow the column order of `data`, because
# test_adjustments passes the values to the model positionally.
# NOTE(review): presumably this is also the column order the model was
# trained on - TODO confirm.
song_features = {
    'time_signature': 3, 'chorus_hit': 31, 'sections': 10,
    "danceability": 0.417, "energy": 0.620, "loudness": -7.727,
    "speechiness": 0.0403, "acousticness": 0.4900, "instrumentalness": 0.0000,
    "liveness": 0.0779, "valence": 0.8450, "tempo": 185.655,
    "track_seconds": 173.533,  # was "track_second"; now matches the column name
}

# Step sizes tried in each direction: 1%, 21%, 41%, 61% and 81%
recommendations = test_adjustments(song_features, range(1, 99, 20))

for feature, change in recommendations.items():
    if change > 0:
        print(f"Increase {feature} by {change}% for a higher probability of being a hit.")
    elif change < 0:
        print(f"Decrease {feature} by {-change}% for a higher probability of being a hit.")
    else:
        print(f"{feature} is optimal as it is.")


AttributeError: 'GradientBoostingClassifier' object has no attribute 'loss_'

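Once the cell runs successfully, this will provide specific, actionable recommendations on how to adjust each song feature for a higher probability of being a hit, according to your prediction model. Note that the run recorded above stopped with `AttributeError: 'GradientBoostingClassifier' object has no attribute 'loss_'`; this most likely means the pickled model was saved with a different scikit-learn version than the one installed, so re-save the model with the current version (or install the matching one) before relying on the output. Also note that each feature is varied on its own, so the individual recommendations are not guaranteed to add up when applied together. Finally, keep in mind that the model's performance and recommendations will only be as good as the data it was trained on. Always use a variety of data sources and constantly update your model for the best results.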