In [1]:
import pandas as pd
# Location of the TikTok dataset inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/tiktok/tiktok_dataset.csv'

# Read the CSV, using its '#' column as the row index.
df = pd.read_csv(DATA_PATH, index_col=['#'])

# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0_level_0,claim_status,video_id,video_duration_sec,video_transcription_text,verified_status,author_ban_status,video_view_count,video_like_count,video_share_count,video_download_count,video_comment_count
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,claim,7017666017,59,someone shared with me that drone deliveries a...,not verified,under review,343296.0,19425.0,241.0,1.0,0.0
2,claim,4014381136,32,someone shared with me that there are more mic...,not verified,active,140877.0,77355.0,19034.0,1161.0,684.0
3,claim,9859838091,31,someone shared with me that american industria...,not verified,active,902185.0,97690.0,2858.0,833.0,329.0
4,claim,1866847991,25,someone shared with me that the metro of st. p...,not verified,active,437506.0,239954.0,34812.0,1234.0,584.0
5,claim,7105231098,19,someone shared with me that the number of busi...,not verified,active,56167.0,34987.0,4110.0,547.0,152.0


In [2]:
# Print each column's dtype and non-null count, which shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19382 entries, 1 to 19382
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   claim_status              19084 non-null  object 
 1   video_id                  19382 non-null  int64  
 2   video_duration_sec        19382 non-null  int64  
 3   video_transcription_text  19084 non-null  object 
 4   verified_status           19382 non-null  object 
 5   author_ban_status         19382 non-null  object 
 6   video_view_count          19084 non-null  float64
 7   video_like_count          19084 non-null  float64
 8   video_share_count         19084 non-null  float64
 9   video_download_count      19084 non-null  float64
 10  video_comment_count       19084 non-null  float64
dtypes: float64(5), int64(2), object(4)
memory usage: 1.8+ MB


In [3]:
# Count distinct values per column to separate categorical columns from high-cardinality ones.
df.nunique()

claim_status                    2
video_id                    19382
video_duration_sec             56
video_transcription_text    19012
verified_status                 2
author_ban_status               3
video_view_count            15632
video_like_count            12224
video_share_count            9231
video_download_count         4336
video_comment_count          2424
dtype: int64

In [4]:
from plotly.express import bar
# Draw one bar chart of category frequencies for each low-cardinality column.
for column in ['claim_status', 'verified_status', 'author_ban_status']:
    # value_counts() skips missing values by default, so rows with a null
    # claim_status do not appear in the chart.
    # rename_axis/reset_index(name=...) pins the labels explicitly so the frame
    # is always [column, 'count'], regardless of how the installed pandas names
    # the value_counts() result (the naming changed in pandas 2.0).
    counts = df[column].value_counts().rename_axis(column).reset_index(name='count')
    bar(data_frame=counts, x=column, y='count').show()

In [5]:
from plotly.express import histogram
# Histogram every engagement count (log-scaled y axis) and the duration
# (linear y axis), split by claim status.
for column in df.columns:
    is_count_column = 'count' in column
    if not (is_count_column or 'duration' in column):
        continue
    # Only the count columns get a logarithmic y axis; duration stays linear.
    figure = histogram(data_frame=df, x=column, log_y=is_count_column, color='claim_status')
    figure.show()

The durations are essentially uniformly distributed, while the various counts are heavily concentrated near zero. We also see the ordering that we expect, very broadly speaking, in social media data: views > likes > comments. This is the first time I've seen download data, but I guess it's not surprising that shares > downloads > comments; generally, we expect that the more effort an action takes, the fewer people will do it.

The fact that the videos with claim_status = opinion are all concentrated near the zero-engagement end of each histogram suggests they are being throttled somehow.

In [6]:
from plotly.express import imshow
# Select the engagement-count columns plus the duration column by name.
selected_columns = [
    name for name in df.columns
    if any(token in name for token in ('count', 'duration'))
]
# Pairwise correlation matrix of the selected columns, rendered as a heatmap.
correlations = df[selected_columns].corr()
imshow(img=correlations)

The correlations among the engagement counts are pretty high; I think the lowest is .55, which surprises me, even if in hindsight it shouldn't.

In [7]:
from plotly.express import scatter
# Plot a fixed-seed sample of 2500 rows so the figure stays responsive and reproducible.
views_likes_sample = df.sample(n=2500, random_state=2023)

# Views against likes; marker size encodes duration, colour encodes verification status.
scatter(
    data_frame=views_likes_sample,
    x='video_view_count',
    y='video_like_count',
    size='video_duration_sec',
    color='verified_status',
    hover_name='video_id',
    opacity=0.2,
)

In [8]:
# Display the column labels of the frame.
df.columns

Index(['claim_status', 'video_id', 'video_duration_sec',
       'video_transcription_text', 'verified_status', 'author_ban_status',
       'video_view_count', 'video_like_count', 'video_share_count',
       'video_download_count', 'video_comment_count'],
      dtype='object')

The views-versus-likes scatter plot above is consistent with what we see on other social media platforms: most of the content, and in particular most of the high-engagement content, is made by unverified users. This may be why the platforms don't make more of an effort to eliminate anonymous users.

In [9]:
# Plot a fixed-seed sample of 2500 rows so the figure stays responsive and reproducible.
shares_downloads_sample = df.sample(n=2500, random_state=2023)

# Shares against downloads on log-log axes with a single OLS trendline across all points.
# NOTE(review): log_x/log_y presumably only change the axis scaling; the OLS fit is
# likely computed on the raw values unless trendline_options sets log_x/log_y — confirm.
scatter(
    data_frame=shares_downloads_sample,
    x='video_share_count',
    y='video_download_count',
    size='video_duration_sec',
    color='verified_status',
    hover_name='video_id',
    opacity=0.2,
    log_x=True,
    log_y=True,
    trendline='ols',
    trendline_scope='overall',
)

In [10]:
# Plot a fixed-seed sample of 2500 rows so the figure stays responsive and reproducible.
shares_comments_sample = df.sample(n=2500, random_state=2023)

# Shares against comments on log-log axes with a single OLS trendline across all points.
# NOTE(review): log_x/log_y presumably only change the axis scaling; the OLS fit is
# likely computed on the raw values unless trendline_options sets log_x/log_y — confirm.
scatter(
    data_frame=shares_comments_sample,
    x='video_share_count',
    y='video_comment_count',
    size='video_duration_sec',
    color='verified_status',
    hover_name='video_id',
    opacity=0.2,
    log_x=True,
    log_y=True,
    trendline='ols',
    trendline_scope='overall',
)

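Engagements such as likes are typically dependent on views (a video has to be viewed before it can be liked), so the views-versus-likes scatter shows a triangular pattern; we do not see this when the other engagement metrics (shares, downloads, comments) are plotted against each other.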