In [None]:
import sys
sys.path.append('..')

import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from src.plot_utils import hist, count, scatter
from dotenv import load_dotenv

In [None]:
# Load the cleaned dataset into `df` and print its structural summary.
# Basic data analysis - see https://www.kaggle.com/code/antoniosabatini/tiktok-popularity-track-eda-ml-models/notebook
# For info on audio features see https://developer.spotify.com/documentation/web-api/reference/get-audio-features

# Relative path: resolved against the directory the notebook kernel runs in.
CSV_PATH = '../data/chartex_clean.csv'

df = pd.read_csv(CSV_PATH)
# info() prints the column names, dtypes and non-null counts.
df.info()

In [None]:
# Discard the string-type features so that only the remaining columns take
# part in the statistics below.
# errors='ignore' keeps this cell re-runnable: `df` is rebound in place, so on
# a second execution the columns are already gone and a plain drop would raise
# KeyError.
STRING_COLS = ['track_name', 'artist', 'album', 'id', 'song_name', 'artist_name']
df = df.drop(columns=STRING_COLS, errors='ignore')

# basic statistics
print(df.describe())

In [None]:
# Fraction of distinct values per column (unique count / row count),
# sorted ascending so the lowest-cardinality columns come first.
n_rows = len(df)
unique_ratio = df.nunique().div(n_rows).sort_values()
print(unique_ratio)

## Distribution of the numerical and categorical columns

In [None]:
# Distribution of each numerical column: one histogram panel per column,
# laid out on a 4x3 grid.

cols = [
    'artist_pop', 'track_pop', 'danceability', 'energy',
    'loudness', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'duration_ms',
]

fig, axs = plt.subplots(4, 3, figsize=(15, 15))

# axs.flat walks the grid row by row, so panels fill in the order of `cols`.
for col, panel in zip(cols, axs.flat):
    hist(df, col, panel)

fig.suptitle("Histograms of numeric columns of the Dataset", fontsize="xx-large", y=0.92)

plt.show()


In [None]:
# Distribution of each categorical column: one count panel per column,
# side by side in a single row.

cols = ['key', 'mode', 'time_signature']

fig, ax = plt.subplots(1, 3, figsize=(30, 7))

# Pair every column with its own panel, left to right.
for col, panel in zip(cols, ax):
    count(df, col, panel)

fig.suptitle("Count of values for categorical columns", size="xx-large")

plt.show()

We can see that, for the most part, the songs in this dataset, which are all relatively popular on TikTok (more than 100k videos), are:
- Energetic, high tempo, loud, and danceable
- Mainly music and not speech
- Under 5 minutes in length
- Recorded in a Studio (not live)
- Not entirely acoustic
- Overwhelmingly not instrumental (contain words)
- In 4/4 time signature

But are these features enough to distinguish the songs that go extremely viral from all the rest?
Of course, "extremely viral" is subjective: for example, we could use 10 million videos made as the threshold, or 1 million, or 50 million.
For reference, the most popular song on TikTok has ~35 million videos.

## Correlation between all pairs:

In [None]:
# Heatmap of the pairwise correlation between the columns of `df`.
fig, ax = plt.subplots(figsize=(12, 12))

corr = df.corr()

# vmin/vmax pin the colour scale to the full [-1, 1] correlation range.
sns.heatmap(corr, annot=True, ax=ax, square=True, linewidth=.5, vmin=-1, vmax=1, fmt=".2f")

# Title the figure so it can be read on its own when the notebook is skimmed.
ax.set_title("Correlation between all pairs of columns", fontsize="xx-large")

# Explicit show(), as in the other plotting cells; it also keeps the Axes repr
# returned by sns.heatmap from being echoed as the cell output.
plt.show()
    

A few takeaways:

* artist_pop has the highest correlation with track_pop.
* total_likes_count has the highest correlation with number_of_videos. However, if we know the total number of likes, it is easy to estimate the number of videos and vice versa, meaning both are equally hard to predict.
* length and duration_ms have the maximum correlation (which makes sense).
* All other pairs have very low correlation.

# Useless with the heatmap above:

## Correlation of number of tiktoks with spotify track popularity and spotify artist popularity


In [None]:
# One scatter panel per Spotify popularity column, each paired with
# number_of_videos, placed side by side.
cols = ['track_pop', 'artist_pop']
fig, axs = plt.subplots(1, 2, figsize=(40, 15))

for col, ax in zip(cols, axs):
    scatter(df, 'number_of_videos', col, ax)

plt.show()
    

Interestingly, we can see that there is no strong correlation between popularity on Spotify and popularity on TikTok.

## Correlation of audio features with tiktok popularity


In [None]:
# One scatter panel per audio feature, each paired with number_of_videos,
# laid out on a 5x2 grid.
cols = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
]

fig, axs = plt.subplots(5, 2, figsize=(15, 40))

# axs.flat walks the grid row by row, so panels fill in the order of `cols`.
for col, ax in zip(cols, axs.flat):
    scatter(df, 'number_of_videos', col, ax)

plt.show()
    

We can see that the correlations are rather weak and not enough to help distinguish viral songs from the rest.