# TikTok Performance predictor 

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# show all columns
pd.set_option('display.max_columns', None)

In [29]:
# Read TikTok Data
tik = pd.read_csv('tiktok.csv')

In [30]:
# merge spotify data to one file and add a column "era"
df60 = pd.read_csv('Spotify Data/dataset-of-60s.csv')
df60['era'] = '60s'

df70 = pd.read_csv('Spotify Data/dataset-of-70s.csv')
df70['era'] = '70s'

df80 = pd.read_csv('Spotify Data/dataset-of-80s.csv')
df80['era'] = '80s'

df90 = pd.read_csv('Spotify Data/dataset-of-90s.csv')
df90['era'] = '90s'

df00 = pd.read_csv('Spotify Data/dataset-of-00s.csv')
df00['era'] = '00s'

df10 = pd.read_csv('Spotify Data/dataset-of-10s.csv')
df10['era'] = '10s'

df20 = pd.read_csv('billboard_hot_100_audio_features.csv')
df20['era'] = '20s'

spotify = pd.concat([df60, df70, df80, df90, df00, df10], ignore_index=True)

In [31]:
# adapting 'release_date' to show "era" in the tik dataframe
tik['release_year'] = tik['release_date'].apply(lambda x: x.split('-')[0])

era_ranges = [(1960, 1969, "60s"),
              (1970, 1979, "70s"),
              (1980, 1989, "80s"),
              (1990, 1999, "90s"),
              (2000, 2009, "00s"),
              (2010, 2019, "10s"),
              (2020, 2029, "20s")]

# Function to assign era based on release year
def assign_era(year):
    for era_range in era_ranges:
        if era_range[0] <= year <= era_range[1]:
            return era_range[2]
    return "Unknown"  # In case the year doesn't fall into any specified range

# Apply the assign_era function to create the era column
tik['era'] = tik['release_year'].apply(lambda x: assign_era(int(x)))

tik.drop('release_year', axis=1, inplace=True)
tik.drop('release_date', axis=1, inplace=True)

In [32]:
# checking if there is "-1" value for "key"
if -1 in spotify['key'].values:
    print("There is a value of -1 in the column")
else:
    print("There is no value of -1 in the column")

if -1 in tik['key'].values:
    print("There is a value of -1 in the column")
else:
    print("There is no value of -1 in the column")

There is no value of -1 in the column
There is no value of -1 in the column


In [33]:
# define key_mapping dictionary
key_mapping = {
    0: 'C',
    1: 'C# / Db',
    2: 'D',
    3: 'D# / Eb',
    4: 'E',
    5: 'F',
    6: 'F# / Gb',
    7: 'G',
    8: 'G# / Ab',
    9: 'A',
    10: 'A# / Bb',
    11: 'B'
}

# use dictionary to replace numerical values with string values
spotify['key'] = spotify['key'].replace(key_mapping)
tik['key'] = tik['key'].replace(key_mapping)


In [34]:
# define mode_mapping dictionary
mode_mapping = {0: 'minor', 1: 'major'}

# use dictionary to replace numerical values with string values
spotify['mode'] = spotify['mode'].replace(mode_mapping)
tik['mode'] = tik['mode'].replace(mode_mapping)

In [35]:
# Formatting spotify uri by removing "spotify:track:"
spotify['uri'] = spotify['uri'].str.replace('spotify:track:', '')

# Umbenenen der Splate uri in track_id
spotify.rename(columns={'uri': 'track_id'}, inplace=True)

In [36]:
tik.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,playlist_id,playlist_name,duration_mins,genre,era
0,6kVuF2PYLuvl9T85XjNbaO,Lay It Down Gmix - Main,1Xfmvd48oOhEWkscWyEbh9,Lloyd,43C6GVlhXG4KfZuEbxty3r,302186,28,0.597,0.8,C# / Db,-5.423,minor,0.312,0.0461,0.0,0.18,0.565,155.932,6kVuF2PYLuvl9T85XjNbaO,6kVuF2PYLuvl9T85XjNbaO,5.036433,TIKTOK DANCE,10s
1,1RGIjMFMgJxkZHMDXVYzOJ,Bartender (feat. Akon),3aQeKQSyrW4qWr35idm0cy,T-Pain,6CrSEKCF4TYrbSIitegb3h,238800,75,0.832,0.391,G# / Ab,-8.504,major,0.0628,0.0564,0.0,0.224,0.436,104.961,1RGIjMFMgJxkZHMDXVYzOJ,1RGIjMFMgJxkZHMDXVYzOJ,3.98,TIKTOK DANCE,00s
2,1RGIjMFMgJxkZHMDXVYzOJ,Bartender (feat. Akon),3aQeKQSyrW4qWr35idm0cy,T-Pain,6CrSEKCF4TYrbSIitegb3h,238800,75,0.832,0.391,G# / Ab,-8.504,major,0.0628,0.0564,0.0,0.224,0.436,104.961,1RGIjMFMgJxkZHMDXVYzOJ,1RGIjMFMgJxkZHMDXVYzOJ,3.98,TIKTOK DANCE,00s
3,1dIWPXMX4kRHj6Dt2DStUQ,Chosen (feat. Ty Dolla $ign),4qXC0i02bSFstECuXP2ZpL,Blxst,7AwrgenNcTAJlJF3pKL0Qr,161684,76,0.571,0.767,D,-5.16,major,0.287,0.336,0.0,0.0809,0.605,93.421,1dIWPXMX4kRHj6Dt2DStUQ,1dIWPXMX4kRHj6Dt2DStUQ,2.694733,TIKTOK DANCE,20s
4,4QVS8YCpK71R4FsxSMCjhP,Tie Me Down (with Elley Duhé),2ZRQcIgzPCVaT9XKhXZIzh,Gryffin,69t8rpgBN1ov5kCU6LDMuR,218295,72,0.548,0.839,F# / Gb,-2.371,major,0.0644,0.135,0.0,0.102,0.314,98.932,4QVS8YCpK71R4FsxSMCjhP,4QVS8YCpK71R4FsxSMCjhP,3.63825,TIKTOK DANCE,10s


In [37]:
spotify.head()

Unnamed: 0,track,artist,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,era
0,Jealous Kind Of Fella,Garland Green,1dtKN6wwlolkM8XZy2y9C1,0.417,0.62,D# / Eb,-7.727,major,0.0403,0.49,0.0,0.0779,0.845,185.655,173533,3,32.94975,9,1,60s
1,Initials B.B.,Serge Gainsbourg,5hjsmSnUefdUqzsDogisiX,0.498,0.505,D# / Eb,-12.475,major,0.0337,0.018,0.107,0.176,0.797,101.801,213613,4,48.8251,10,0,60s
2,Melody Twist,Lord Melody,6uk8tI6pwxxdVTNlNOJeJh,0.657,0.649,F,-13.392,major,0.038,0.846,4e-06,0.119,0.908,115.94,223960,4,37.22663,12,0,60s
3,Mi Bomba Sonó,Celia Cruz,7aNjMJ05FvUXACPWZ7yJmv,0.59,0.545,G,-12.058,minor,0.104,0.706,0.0246,0.061,0.967,105.592,157907,4,24.75484,8,0,60s
4,Uravu Solla,P. Susheela,1rQ0clvgkzWr001POOPJWx,0.515,0.765,B,-3.515,minor,0.124,0.857,0.000872,0.213,0.906,114.617,245600,4,21.79874,14,0,60s


In [38]:
#rename features, to later be able to link the data easily
column_name_changes = {
    'track_name': 'track',
    'artist_name': 'artist',
    'duration': 'duration_ms',
    # Add more column name changes here if needed
}
# Rename the columns in df2
tik.rename(columns=column_name_changes, inplace=True)

features_to_delete = ['artist_id', 'album_id', "playlist_id", "playlist_name", "duration_mins"]
# Add more feature names to be deleted as needed

# Delete the features from df2
tik.drop(features_to_delete, axis=1, inplace=True)

In [39]:
tik.head()

Unnamed: 0,track_id,track,artist,duration_ms,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre,era
0,6kVuF2PYLuvl9T85XjNbaO,Lay It Down Gmix - Main,Lloyd,302186,28,0.597,0.8,C# / Db,-5.423,minor,0.312,0.0461,0.0,0.18,0.565,155.932,TIKTOK DANCE,10s
1,1RGIjMFMgJxkZHMDXVYzOJ,Bartender (feat. Akon),T-Pain,238800,75,0.832,0.391,G# / Ab,-8.504,major,0.0628,0.0564,0.0,0.224,0.436,104.961,TIKTOK DANCE,00s
2,1RGIjMFMgJxkZHMDXVYzOJ,Bartender (feat. Akon),T-Pain,238800,75,0.832,0.391,G# / Ab,-8.504,major,0.0628,0.0564,0.0,0.224,0.436,104.961,TIKTOK DANCE,00s
3,1dIWPXMX4kRHj6Dt2DStUQ,Chosen (feat. Ty Dolla $ign),Blxst,161684,76,0.571,0.767,D,-5.16,major,0.287,0.336,0.0,0.0809,0.605,93.421,TIKTOK DANCE,20s
4,4QVS8YCpK71R4FsxSMCjhP,Tie Me Down (with Elley Duhé),Gryffin,218295,72,0.548,0.839,F# / Gb,-2.371,major,0.0644,0.135,0.0,0.102,0.314,98.932,TIKTOK DANCE,10s


In [40]:
spotify.head()

Unnamed: 0,track,artist,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,era
0,Jealous Kind Of Fella,Garland Green,1dtKN6wwlolkM8XZy2y9C1,0.417,0.62,D# / Eb,-7.727,major,0.0403,0.49,0.0,0.0779,0.845,185.655,173533,3,32.94975,9,1,60s
1,Initials B.B.,Serge Gainsbourg,5hjsmSnUefdUqzsDogisiX,0.498,0.505,D# / Eb,-12.475,major,0.0337,0.018,0.107,0.176,0.797,101.801,213613,4,48.8251,10,0,60s
2,Melody Twist,Lord Melody,6uk8tI6pwxxdVTNlNOJeJh,0.657,0.649,F,-13.392,major,0.038,0.846,4e-06,0.119,0.908,115.94,223960,4,37.22663,12,0,60s
3,Mi Bomba Sonó,Celia Cruz,7aNjMJ05FvUXACPWZ7yJmv,0.59,0.545,G,-12.058,minor,0.104,0.706,0.0246,0.061,0.967,105.592,157907,4,24.75484,8,0,60s
4,Uravu Solla,P. Susheela,1rQ0clvgkzWr001POOPJWx,0.515,0.765,B,-3.515,minor,0.124,0.857,0.000872,0.213,0.906,114.617,245600,4,21.79874,14,0,60s


In [41]:
same_tracks_df = pd.merge(spotify, tik, on='track_id', how='inner')
track_ids = same_tracks_df['track_id'].values

In [42]:
#create merge
data = pd.merge(spotify, tik, on='track_id', how='outer')

In [43]:
data

Unnamed: 0,track_x,artist_x,track_id,danceability_x,energy_x,key_x,loudness_x,mode_x,speechiness_x,acousticness_x,instrumentalness_x,liveness_x,valence_x,tempo_x,duration_ms_x,time_signature,chorus_hit,sections,target,era_x,track_y,artist_y,duration_ms_y,popularity,danceability_y,energy_y,key_y,loudness_y,mode_y,speechiness_y,acousticness_y,instrumentalness_y,liveness_y,valence_y,tempo_y,genre,era_y
0,Jealous Kind Of Fella,Garland Green,1dtKN6wwlolkM8XZy2y9C1,0.417,0.620,D# / Eb,-7.727,major,0.0403,0.490,0.000000,0.0779,0.845,185.655,173533.0,3.0,32.94975,9.0,1.0,60s,,,,,,,,,,,,,,,,,
1,Initials B.B.,Serge Gainsbourg,5hjsmSnUefdUqzsDogisiX,0.498,0.505,D# / Eb,-12.475,major,0.0337,0.018,0.107000,0.1760,0.797,101.801,213613.0,4.0,48.82510,10.0,0.0,60s,,,,,,,,,,,,,,,,,
2,Melody Twist,Lord Melody,6uk8tI6pwxxdVTNlNOJeJh,0.657,0.649,F,-13.392,major,0.0380,0.846,0.000004,0.1190,0.908,115.940,223960.0,4.0,37.22663,12.0,0.0,60s,,,,,,,,,,,,,,,,,
3,Mi Bomba Sonó,Celia Cruz,7aNjMJ05FvUXACPWZ7yJmv,0.590,0.545,G,-12.058,minor,0.1040,0.706,0.024600,0.0610,0.967,105.592,157907.0,4.0,24.75484,8.0,0.0,60s,,,,,,,,,,,,,,,,,
4,Uravu Solla,P. Susheela,1rQ0clvgkzWr001POOPJWx,0.515,0.765,B,-3.515,minor,0.1240,0.857,0.000872,0.2130,0.906,114.617,245600.0,4.0,21.79874,14.0,0.0,60s,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47542,,,4uqh9bualXNHXXwO2wPorc,,,,,,,,,,,,,,,,,,Daisies,Katy Perry,173123.0,71.0,0.639,0.546,C# / Db,-5.382,major,0.0407,0.0837,0.000000,0.122,0.149,122.179,_TIKTOK,20s
47543,,,03f7xZmt2wHCIDJBFPK8G4,,,,,,,,,,,,,,,,,,Soap,Melanie Martinez,209426.0,68.0,0.595,0.689,F# / Gb,-6.107,minor,0.1480,0.2540,0.000005,0.129,0.213,168.112,_TIKTOK,10s
47544,,,4kIpBfvK44bxqX7zo8K1oP,,,,,,,,,,,,,,,,,,시작,Gaho,202440.0,67.0,0.591,0.818,E,-3.532,major,0.0730,0.1720,0.000000,0.126,0.574,108.107,_TIKTOK,20s
47545,,,1lNHWPDvKEbamKezpLq7HW,,,,,,,,,,,,,,,,,,no song without you,HONNE,160346.0,64.0,0.788,0.473,C,-12.744,major,0.0328,0.4890,0.254000,0.109,0.810,105.429,_TIKTOK,20s


In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47547 entries, 0 to 47546
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   track_x             41560 non-null  object 
 1   artist_x            41560 non-null  object 
 2   track_id            47547 non-null  object 
 3   danceability_x      41560 non-null  float64
 4   energy_x            41560 non-null  float64
 5   key_x               41560 non-null  object 
 6   loudness_x          41560 non-null  float64
 7   mode_x              41560 non-null  object 
 8   speechiness_x       41560 non-null  float64
 9   acousticness_x      41560 non-null  float64
 10  instrumentalness_x  41560 non-null  float64
 11  liveness_x          41560 non-null  float64
 12  valence_x           41560 non-null  float64
 13  tempo_x             41560 non-null  float64
 14  duration_ms_x       41560 non-null  float64
 15  time_signature      41560 non-null  float64
 16  chor

In [45]:
# prepare merged dataset, so feature values are merged as well
features_to_combine = ["track", "artist", "duration_ms", 'danceability', 'energy', 'key', "loudness", "mode",
                       "speechiness", "acousticness", "instrumentalness","liveness", "valence", "tempo", 
                        "duration_ms", "era" ]
# Add more feature names to be combined as needed

# Combine the features from spotify and tik into the merged dataset
for feature in features_to_combine:
    data[feature] = data[f'{feature}_x'].fillna(data[f'{feature}_y'])

# Drop the duplicate features from merged_df
data.drop([f'{feature}_x' for feature in features_to_combine] + [f'{feature}_y' for feature in features_to_combine], axis=1, inplace=True)



In [46]:
# Export Spotify data to csv
data.to_csv('Spotify_TT.csv', index=False)

In [47]:
data

Unnamed: 0,track_id,time_signature,chorus_hit,sections,target,popularity,genre,track,artist,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,era
0,1dtKN6wwlolkM8XZy2y9C1,3.0,32.94975,9.0,1.0,,,Jealous Kind Of Fella,Garland Green,173533.0,0.417,0.620,D# / Eb,-7.727,major,0.0403,0.4900,0.000000,0.0779,0.845,185.655,60s
1,5hjsmSnUefdUqzsDogisiX,4.0,48.82510,10.0,0.0,,,Initials B.B.,Serge Gainsbourg,213613.0,0.498,0.505,D# / Eb,-12.475,major,0.0337,0.0180,0.107000,0.1760,0.797,101.801,60s
2,6uk8tI6pwxxdVTNlNOJeJh,4.0,37.22663,12.0,0.0,,,Melody Twist,Lord Melody,223960.0,0.657,0.649,F,-13.392,major,0.0380,0.8460,0.000004,0.1190,0.908,115.940,60s
3,7aNjMJ05FvUXACPWZ7yJmv,4.0,24.75484,8.0,0.0,,,Mi Bomba Sonó,Celia Cruz,157907.0,0.590,0.545,G,-12.058,minor,0.1040,0.7060,0.024600,0.0610,0.967,105.592,60s
4,1rQ0clvgkzWr001POOPJWx,4.0,21.79874,14.0,0.0,,,Uravu Solla,P. Susheela,245600.0,0.515,0.765,B,-3.515,minor,0.1240,0.8570,0.000872,0.2130,0.906,114.617,60s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47542,4uqh9bualXNHXXwO2wPorc,,,,,71.0,_TIKTOK,Daisies,Katy Perry,173123.0,0.639,0.546,C# / Db,-5.382,major,0.0407,0.0837,0.000000,0.1220,0.149,122.179,20s
47543,03f7xZmt2wHCIDJBFPK8G4,,,,,68.0,_TIKTOK,Soap,Melanie Martinez,209426.0,0.595,0.689,F# / Gb,-6.107,minor,0.1480,0.2540,0.000005,0.1290,0.213,168.112,10s
47544,4kIpBfvK44bxqX7zo8K1oP,,,,,67.0,_TIKTOK,시작,Gaho,202440.0,0.591,0.818,E,-3.532,major,0.0730,0.1720,0.000000,0.1260,0.574,108.107,20s
47545,1lNHWPDvKEbamKezpLq7HW,,,,,64.0,_TIKTOK,no song without you,HONNE,160346.0,0.788,0.473,C,-12.744,major,0.0328,0.4890,0.254000,0.1090,0.810,105.429,20s


## Data Preparation

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47547 entries, 0 to 47546
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          47547 non-null  object 
 1   time_signature    41560 non-null  float64
 2   chorus_hit        41560 non-null  float64
 3   sections          41560 non-null  float64
 4   target            41560 non-null  float64
 5   popularity        6764 non-null   float64
 6   genre             6764 non-null   object 
 7   track             47547 non-null  object 
 8   artist            47547 non-null  object 
 9   duration_ms       47547 non-null  float64
 10  danceability      47547 non-null  float64
 11  energy            47547 non-null  float64
 12  key               47547 non-null  object 
 13  loudness          47547 non-null  float64
 14  mode              47547 non-null  object 
 15  speechiness       47547 non-null  float64
 16  acousticness      47547 non-null  float6

In [49]:
# Transform the data to prepare it for usage in a machine learning model

# Clean duplicates based on track_id and popularity
data = tik.drop_duplicates(subset=['track_id', 'popularity'], keep='first')

# Drop unnecessary columns
data = data.drop(['track_id', "target" ], axis=1)

data.info()

KeyError: "['target'] not found in axis"

In [None]:
# Speichern von data als csv
#data.to_csv("Spotify_TT.csv")

## Data Modelling

In [None]:
import xgboost as xgb  
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error


In [None]:
data = pd.get_dummies(data)

In [None]:
# Filter the data for the specific product
song_data = data
    
# Split the data into features (X) and target (y)
X = song_data.drop(['popularity'], axis=1)
y = song_data['popularity']
    


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
# Define the XGBoost model
model = RandomForestRegressor()
    
# Train the model on the training data
model.fit(X_train, y_train)  


RandomForestRegressor()

In [None]:
# Make predictions on the testing data
y_pred = model.predict(X_test)
    
# Calculate the mean squared error of the model
MAPE = mean_absolute_percentage_error(y_test, y_pred)
R2 = model.score(X_test, y_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
print (MAPE, R2, RMSE)

1.1026242938151858e+16 0.27630786685438113 21.006619956003583
