## 1. Prepare data to run model

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

In [7]:
# show all columns
pd.set_option('display.max_columns', None)

In [8]:
# Load the cleaned track dataset (audio features plus the tiktok/spotify flag columns) from the "Spotify Data" folder
data = pd.read_csv('Spotify Data/data-clean.csv')

In [9]:
data['track_seconds'] = data['duration_ms'] / 1000

In [10]:
data.head()

Unnamed: 0,track_id,time_signature,chorus_hit,sections,target,popularity,sm_target,tiktok,spotify,track,artist,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,era,main_parent_genre,track_seconds
0,1dtKN6wwlolkM8XZy2y9C1,3.0,32.94975,9.0,1.0,,0.0,0,1,Jealous Kind Of Fella,Garland Green,173533.0,0.417,0.62,D# / Eb,-7.727,major,0.0403,0.49,0.0,0.0779,0.845,185.655,60s,Blues and Jazz,173.533
1,5hjsmSnUefdUqzsDogisiX,4.0,48.8251,10.0,0.0,,0.0,0,1,Initials B.B.,Serge Gainsbourg,213613.0,0.498,0.505,D# / Eb,-12.475,major,0.0337,0.018,0.107,0.176,0.797,101.801,60s,Rock,213.613
2,6uk8tI6pwxxdVTNlNOJeJh,4.0,37.22663,12.0,0.0,,0.0,0,1,Melody Twist,Lord Melody,223960.0,0.657,0.649,F,-13.392,major,0.038,0.846,4e-06,0.119,0.908,115.94,60s,Other,223.96
3,7aNjMJ05FvUXACPWZ7yJmv,4.0,24.75484,8.0,0.0,,0.0,0,1,Mi Bomba Sonó,Celia Cruz,157907.0,0.59,0.545,G,-12.058,minor,0.104,0.706,0.0246,0.061,0.967,105.592,60s,World Music,157.907
4,1rQ0clvgkzWr001POOPJWx,4.0,21.79874,14.0,0.0,,0.0,0,1,Uravu Solla,P. Susheela,245600.0,0.515,0.765,B,-3.515,minor,0.124,0.857,0.000872,0.213,0.906,114.617,60s,Other,245.6


In [11]:
# Remove identifiers, labels and categorical columns so that only the numeric
# features and the `target` column remain.
data = data.drop(
    columns=[
        "track_id", "era", "sm_target", "popularity", "tiktok", "spotify",
        "track", "artist", "duration_ms", "key", "mode", "main_parent_genre",
    ]
)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40560 entries, 0 to 40559
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time_signature    40560 non-null  float64
 1   chorus_hit        40560 non-null  float64
 2   sections          40560 non-null  float64
 3   target            40560 non-null  float64
 4   danceability      40560 non-null  float64
 5   energy            40560 non-null  float64
 6   loudness          40560 non-null  float64
 7   speechiness       40560 non-null  float64
 8   acousticness      40560 non-null  float64
 9   instrumentalness  40560 non-null  float64
 10  liveness          40560 non-null  float64
 11  valence           40560 non-null  float64
 12  tempo             40560 non-null  float64
 13  track_seconds     40560 non-null  float64
dtypes: float64(14)
memory usage: 4.3 MB


In [12]:
#data = pd.get_dummies(data)

In [13]:
data

Unnamed: 0,time_signature,chorus_hit,sections,target,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_seconds
0,3.0,32.94975,9.0,1.0,0.417,0.620,-7.727,0.0403,0.4900,0.000000,0.0779,0.8450,185.655,173.533
1,4.0,48.82510,10.0,0.0,0.498,0.505,-12.475,0.0337,0.0180,0.107000,0.1760,0.7970,101.801,213.613
2,4.0,37.22663,12.0,0.0,0.657,0.649,-13.392,0.0380,0.8460,0.000004,0.1190,0.9080,115.940,223.960
3,4.0,24.75484,8.0,0.0,0.590,0.545,-12.058,0.1040,0.7060,0.024600,0.0610,0.9670,105.592,157.907
4,4.0,21.79874,14.0,0.0,0.515,0.765,-3.515,0.1240,0.8570,0.000872,0.2130,0.9060,114.617,245.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40555,4.0,24.30824,7.0,0.0,0.172,0.358,-14.430,0.0342,0.8860,0.966000,0.3140,0.0361,72.272,150.857
40556,4.0,32.53856,8.0,1.0,0.910,0.366,-9.954,0.0941,0.0996,0.000000,0.2610,0.7400,119.985,152.000
40557,4.0,20.73371,7.0,1.0,0.719,0.804,-4.581,0.0355,0.0132,0.000003,0.1390,0.6050,119.999,227.760
40558,4.0,21.65301,14.0,0.0,0.600,0.177,-16.070,0.0561,0.9890,0.868000,0.1490,0.5600,120.030,213.387


## 2. Get Model to predict Hit/Flop

Loading the Model: You'll need to load the pre-trained prediction model. If it's pickled, you can load it using pickle.

In [14]:
# import model
import pickle

# Load the model from the .pkl file
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load model files that you created yourself or otherwise trust.
# NOTE(review): the warnings recorded under this cell link to scikit-learn's
# model-persistence limitations, which suggests the pickle was written with a
# different scikit-learn version — TODO confirm and re-export the model if so.
with open('xgb_model_genre.pkl', 'rb') as file:
    model = pickle.load(file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
import itertools

In [16]:
# Separate the predictors from the label column
X = data.drop(columns=['target'])
y = data['target']

# Hold out 20 % of all rows as the test set, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Split the remaining 80 % once more: 25 % of it (i.e. 20 % of all rows)
# becomes the validation set X2_test / y2_test
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)


In [17]:
# Gradient-boosted trees from scikit-learn (despite the "xgb" prefix this is
# GradientBoostingClassifier, not the xgboost library).
# random_state is fixed because max_features=4 makes every split look at a
# random subset of the features; without a seed the fitted model and all
# scores derived from it change on every run.
xgb2 = GradientBoostingClassifier(
    max_depth=3,
    max_features=4,
    max_leaf_nodes=None,
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
# fit() returns the estimator itself, so xgb2b refers to the same fitted object as xgb2
xgb2b = xgb2.fit(X2_train, y2_train)

# Predicted labels and class-1 probabilities on the validation split
grad_ypred = xgb2.predict(X2_test)
grad_yproba = xgb2.predict_proba(X2_test)[:, 1]

In [18]:
# Score the fitted classifier on the held-out test split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = xgb2.predict(X_test)

# Print every metric under its own heading: accuracy, confusion matrix, per-class report
for heading, metric in (
    ('Accuracy:', accuracy_score),
    ('Confusion Matrix:\n', confusion_matrix),
    ('Classification Report:\n', classification_report),
):
    print(heading, metric(y_test, y_pred))

Accuracy: 0.7794625246548323
Confusion Matrix:
 [[2888 1221]
 [ 568 3435]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.70      0.76      4109
         1.0       0.74      0.86      0.79      4003

    accuracy                           0.78      8112
   macro avg       0.79      0.78      0.78      8112
weighted avg       0.79      0.78      0.78      8112



In [19]:
data

Unnamed: 0,time_signature,chorus_hit,sections,target,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_seconds
0,3.0,32.94975,9.0,1.0,0.417,0.620,-7.727,0.0403,0.4900,0.000000,0.0779,0.8450,185.655,173.533
1,4.0,48.82510,10.0,0.0,0.498,0.505,-12.475,0.0337,0.0180,0.107000,0.1760,0.7970,101.801,213.613
2,4.0,37.22663,12.0,0.0,0.657,0.649,-13.392,0.0380,0.8460,0.000004,0.1190,0.9080,115.940,223.960
3,4.0,24.75484,8.0,0.0,0.590,0.545,-12.058,0.1040,0.7060,0.024600,0.0610,0.9670,105.592,157.907
4,4.0,21.79874,14.0,0.0,0.515,0.765,-3.515,0.1240,0.8570,0.000872,0.2130,0.9060,114.617,245.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40555,4.0,24.30824,7.0,0.0,0.172,0.358,-14.430,0.0342,0.8860,0.966000,0.3140,0.0361,72.272,150.857
40556,4.0,32.53856,8.0,1.0,0.910,0.366,-9.954,0.0941,0.0996,0.000000,0.2610,0.7400,119.985,152.000
40557,4.0,20.73371,7.0,1.0,0.719,0.804,-4.581,0.0355,0.0132,0.000003,0.1390,0.6050,119.999,227.760
40558,4.0,21.65301,14.0,0.0,0.600,0.177,-16.070,0.0561,0.9890,0.868000,0.1490,0.5600,120.030,213.387


## 3. Build code to give recommendation for a feature

In [20]:
# Second copy of the dataset that keeps `track_id`, so a song can be looked up by its id
data_tofindtrack = (
    pd.read_csv('Spotify Data/data-clean.csv')
    # derive the track length in seconds from duration_ms
    .assign(track_seconds=lambda frame: frame['duration_ms'] / 1000)
    # remove label and categorical columns; track_id and target are kept
    .drop(
        columns=[
            "era", "sm_target", "popularity", "tiktok", "spotify", "track",
            "artist", "duration_ms", "key", "mode", "main_parent_genre",
        ]
    )
)
data_tofindtrack

Unnamed: 0,track_id,time_signature,chorus_hit,sections,target,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_seconds
0,1dtKN6wwlolkM8XZy2y9C1,3.0,32.94975,9.0,1.0,0.417,0.620,-7.727,0.0403,0.4900,0.000000,0.0779,0.8450,185.655,173.533
1,5hjsmSnUefdUqzsDogisiX,4.0,48.82510,10.0,0.0,0.498,0.505,-12.475,0.0337,0.0180,0.107000,0.1760,0.7970,101.801,213.613
2,6uk8tI6pwxxdVTNlNOJeJh,4.0,37.22663,12.0,0.0,0.657,0.649,-13.392,0.0380,0.8460,0.000004,0.1190,0.9080,115.940,223.960
3,7aNjMJ05FvUXACPWZ7yJmv,4.0,24.75484,8.0,0.0,0.590,0.545,-12.058,0.1040,0.7060,0.024600,0.0610,0.9670,105.592,157.907
4,1rQ0clvgkzWr001POOPJWx,4.0,21.79874,14.0,0.0,0.515,0.765,-3.515,0.1240,0.8570,0.000872,0.2130,0.9060,114.617,245.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40555,4t1TljQWJ6ZuoSY67zVvBI,4.0,24.30824,7.0,0.0,0.172,0.358,-14.430,0.0342,0.8860,0.966000,0.3140,0.0361,72.272,150.857
40556,2MShy1GSSgbmGUxADNIao5,4.0,32.53856,8.0,1.0,0.910,0.366,-9.954,0.0941,0.0996,0.000000,0.2610,0.7400,119.985,152.000
40557,55qBw1900pZKfXJ6Q9A2Lc,4.0,20.73371,7.0,1.0,0.719,0.804,-4.581,0.0355,0.0132,0.000003,0.1390,0.6050,119.999,227.760
40558,4o9npmYHrOF1rUxxTVH8h4,4.0,21.65301,14.0,0.0,0.600,0.177,-16.070,0.0561,0.9890,0.868000,0.1490,0.5600,120.030,213.387


In [21]:
# Multipliers applied to a feature, tried in this order:
# 1 (unchanged) first, then alternately smaller and larger scalings.
values = [1, 0.8, 1.2, 0.6, 1.4, 0.4, 1.6, 0.2]


def checkfeature(insert_feature, song_df, multipliers=None, estimator=None):
    """Scale one feature of a song until the classifier predicts a hit.

    Each multiplier is applied to a fresh copy of ``song_df`` (the input frame
    is never modified) and the first row of the prediction is inspected.

    Parameters
    ----------
    insert_feature : str
        Name of the column in ``song_df`` to scale.
    song_df : pandas.DataFrame
        Frame holding the song's feature columns; only the first predicted
        row is evaluated.
    multipliers : iterable of float, optional
        Scaling factors to try, in order. Defaults to the module-level ``values``.
    estimator : object with ``predict``, optional
        Classifier to query. Defaults to the notebook's global ``model``.

    Returns
    -------
    float or None
        The first multiplier for which the prediction is positive, or ``None``
        if no multiplier produces a hit. Because the default list starts with
        1, a song that is already predicted as a hit returns 1.
    """
    # Fall back to the notebook globals only when no explicit override is given,
    # so existing two-argument calls behave exactly as before.
    if multipliers is None:
        multipliers = values
    if estimator is None:
        estimator = model

    for value in multipliers:
        song_copy = song_df.copy()  # keep the caller's frame untouched
        song_copy[insert_feature] = song_copy[insert_feature] * value
        pred = estimator.predict(song_copy)

        if pred[0] > 0:  # a positive label means HIT
            print("HIT reached")
            print("You have to change " + str(insert_feature) + " by " + str(value))
            return value

    # No multiplier turned the prediction into a hit
    return None

In [22]:
# Turn the result of checkfeature into a 1/0 flag
def success(insert_feature, song_df):
    """Return 1 if scaling ``insert_feature`` can produce a hit for ``song_df``, else 0.

    Delegates the search to ``checkfeature`` and prints whether it succeeded.
    """
    found_multiplier = checkfeature(insert_feature, song_df)
    if found_multiplier is not None:
        print("success")
        return 1
    print("no success with " + str(insert_feature))
    return 0


In [23]:
# Predict the label for a single song selected by its row position.
# NOTE(review): the recorded run of this cell failed with
# "X has 13 features, but GradientBoostingClassifier is expecting 15 features":
# the loaded `model` was fitted on two more columns than this frame provides.
# The inputs need to be aligned with the model before this prediction can work.
songs_df = data.drop(columns="target")
selected_song_df = songs_df.iloc[40555:40556]
pred = model.predict(selected_song_df)
print(pred[0])



ValueError: X has 13 features, but GradientBoostingClassifier is expecting 15 features as input.

In [None]:
# Look up a song's feature row by its track_id
def id_to_df(track_id, catalog=None):
    """Return the model-input columns of the song with the given ``track_id``.

    Parameters
    ----------
    track_id : str
        Value to match against the ``track_id`` column.
    catalog : pandas.DataFrame, optional
        Table to search; must contain ``track_id`` and ``target`` columns.
        Defaults to the notebook's global ``data_tofindtrack``.

    Returns
    -------
    pandas.DataFrame
        All matching rows with the ``target`` and ``track_id`` columns removed.

    Raises
    ------
    KeyError
        If no row has the requested ``track_id``. Previously an empty frame
        was returned, which only failed later inside the model call.
    """
    if catalog is None:
        catalog = data_tofindtrack
    track_df = catalog[catalog['track_id'] == track_id]
    if track_df.empty:
        raise KeyError(f"track_id {track_id!r} not found in the song table")
    return track_df.drop(columns=['target', 'track_id'])

#id_to_df("4t1TljQWJ6ZuoSY67zVvBI")

In [None]:
tuningfeatures = ["loudness", "danceability", "acousticness","chorus_hit","sections", 
                  "energy", "speechiness","instrumentalness","liveness",
                  "valence","tempo"]
# "track_seconds"

# Entry point: pass a track id and get feedback on which feature to tune
def test(track_id):
    """Try the features in ``tuningfeatures`` one by one until one yields a hit.

    Returns 1 (after printing a confirmation) as soon as ``success`` reports a
    hit for a feature; returns ``None`` when no feature in the list succeeds.
    """
    song_df = id_to_df(track_id)
    # any() stops at the first feature for which success() returns 1
    hit_found = any(success(name, song_df) == 1 for name in tuningfeatures)
    if not hit_found:
        return None
    print("you have reached a HIT")
    return 1

test ("4t1TljQWJ6ZuoSY67zVvBI")

no success with loudness
no success with danceability
no success with acousticness
no success with chorus_hit
no success with sections
no success with energy
no success with speechiness
no success with instrumentalness
no success with liveness
no success with valence
no success with tempo


In [None]:
#checkfeature ("acousticness")

In [None]:
#checkfeature ("loudness")

In [None]:
#checkfeature ("danceability")

## Rest

Creating a Function to Adjust Features: This function will receive a feature value and a percentage change and return the adjusted value.

In [None]:
def adjust_feature(value, percentage):
    """Return ``value`` changed by ``percentage`` percent (negative values decrease it)."""
    delta = value * percentage / 100
    return value + delta

Testing Feature Adjustments: For each feature of a song, you'll want to test how increasing and decreasing it affects the model's prediction.

In [None]:
#def test_adjustments(song_features, percentage_range):
optimal_changes = {}
    for feature, value in song_features.items():
        input_data = np.array(list(song_features.values()))
        max_prob = predict_proba_gb(xgb2, [input_data])[0][1]  # probability of being a hit
        optimal_change = 0  # no change

        for percentage in percentage_range:
            # Increase feature
            # Increase feature
            adjusted_features = song_features.copy()
            adjusted_features[feature] = adjust_feature(value, percentage)
            input_data = np.array(list(adjusted_features.values()))
            prob = predict_proba_gb(xgb2, [input_data])[0][1]
            
            if prob > max_prob:
                max_prob = prob
            optimal_change = percentage
            
            #adjusted_features = song_features.copy()
            #adjusted_features[feature] = adjust_feature(value, percentage)
            #prob = model.predict_proba([adjusted_features])[0][1]
            #if prob > max_prob:
                #max_prob = prob
                #optimal_change = percentage

            # Decrease feature
            adjusted_features = song_features.copy()
            adjusted_features[feature] = adjust_feature(value, -percentage)
            input_data = np.array(list(adjusted_features.values()))
            prob = predict_proba_gb(xgb2, [input_data])[0][1]
            if prob > max_prob:
                max_prob = prob
            optimal_change = -percentage

            #adjusted_features = song_features.copy()
            #adjusted_features[feature] = adjust_feature(value, -percentage)
            #prob = model.predict_proba([adjusted_features])[0][1]
            #if prob > max_prob:
                #max_prob = prob
                #optimal_change = -percentage

        optimal_changes[feature] = optimal_change

    return optimal_changes


IndentationError: unexpected indent (1133524161.py, line 3)

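Note: The percentage_range parameter should contain non-negative percentages, for example range(5, 51, 5). The function already tries every percentage both as an increase and as a decrease, so this tests each feature in 5% steps up to ±50%. A range such as range(-50, 51, 5) would only repeat the same adjustments.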

Providing Recommendations: Use the test_adjustments function to provide recommendations on how to improve a song.

In [None]:
# Example song: the feature values of dataset row 40555 (track_id 4t1TljQWJ6ZuoSY67zVvBI),
# listed in the same order as the feature columns used for training.
# An all-zero example cannot be used here: a percentage change of 0 is still 0,
# so no adjustment would ever alter the prediction.
song_features = {
    "time_signature": 4.0, "chorus_hit": 24.30824, "sections": 7.0,
    "danceability": 0.172, "energy": 0.358, "loudness": -14.430,
    "speechiness": 0.0342, "acousticness": 0.8860, "instrumentalness": 0.966,
    "liveness": 0.314, "valence": 0.0361, "tempo": 72.272, "track_seconds": 150.857,
}

# Test changes of 0 %, 5 %, 10 %, 15 % and 20 % in both directions for every feature
recommendations = test_adjustments(song_features, range(0, 21, 5))

# A positive value means "increase", a negative one "decrease", 0 means no tested change helped
for feature, change in recommendations.items():
    if change > 0:
        print(f"Increase {feature} by {change}% for a higher probability of being a hit.")
    elif change < 0:
        print(f"Decrease {feature} by {-change}% for a higher probability of being a hit.")
    else:
        print(f"{feature} is optimal as it is.")


Decrease time_signature by 20% for a higher probability of being a hit.
Decrease chorus_hit by 20% for a higher probability of being a hit.
Decrease sections by 20% for a higher probability of being a hit.
Decrease danceability by 20% for a higher probability of being a hit.
Decrease energy by 20% for a higher probability of being a hit.
Decrease loudness by 20% for a higher probability of being a hit.
Decrease speechiness by 20% for a higher probability of being a hit.
Decrease acousticness by 20% for a higher probability of being a hit.
Decrease instrumentalness by 20% for a higher probability of being a hit.
Decrease liveness by 20% for a higher probability of being a hit.
Decrease valence by 20% for a higher probability of being a hit.
Decrease tempo by 20% for a higher probability of being a hit.
Decrease track_second by 20% for a higher probability of being a hit.




This will provide specific, actionable recommendations on how to adjust each song feature for a higher probability of being a hit, according to your prediction model. However, keep in mind that the model's performance and recommendations will only be as good as the data it was trained on. Always use a variety of data sources and constantly update your model for the best results.