In [None]:
#pip install wordsegment

In [1]:
import numpy as np
import sqlite3
import pandas as pd
import re

import wordsegment
from wordsegment import load, segment
load()

from plotly import express as px

In [111]:
conn = sqlite3.connect("tiktok.db")

cmd = \
    f"""
    SELECT id, video_title, creator_nickname, upload_time, sound_transcribed, like, share, comment, view
    FROM tiktok
    """ 
    
tiktoks = pd.read_sql_query(cmd, conn)
    
conn.close()

In [110]:
def clean_tiktok_df(tiktoks):
    
    #fix column 'upload_time' -> 'video_url'
    tiktoks = tiktoks.rename(columns = {'upload_time' : 'video_url'})
    
    #remove duplicates
    dups = tiktoks['id'].duplicated(keep = 'last') #keep the most recent version of the tiktok
    tiktoks = tiktoks[np.invert(dups)]
    tiktoks = tiktoks.reset_index(drop = True)
    
    #remove rows with no video_title and no sound_transcribed
    tiktoks = tiktoks[(tiktoks["sound_transcribed"] != 'NA') | (tiktoks["video_title"] != '')]
    
    #replace 'NA' in sound_transcribed with ''
    tiktoks["sound_transcribed"] = [sound if sound != 'NA' else '' for sound in tiktoks["sound_transcribed"]]
    tiktoks = tiktoks.reset_index(drop = True)
    
    #replace hashtag phrase with predicted phrase with spaces added
    tiktoks['hashtags'] = [re.findall(r"#(\w+)",x) for x in tiktoks['video_title'] ] #create col for list of hashtag phrases
    tiktoks['predicted_hashtag_words'] = [segment(' '.join(x)) for x in tiktoks['hashtags']] #create col for list of predicted phrases of each hashtag
    tiktoks['predicted_hashtag_words'] = [' '.join(x) for x in tiktoks['predicted_hashtag_words']] #list -> string
    tiktoks['original_video_title'] = tiktoks['video_title'] #keep original title in new column
    tiktoks['video_title'] = [re.sub("#[A-Za-z0-9_]+","", x) for x in tiktoks['video_title']] #remove hashtagged phrases from title
    tiktoks['video_title'] = tiktoks['video_title'] + tiktoks['predicted_hashtag_words'] #new title where hashtag phrases are replaced with their predicted words
    
    return tiktoks

In [112]:
tiktoks = clean_tiktok_df(tiktoks)

In [87]:
tiktoks.head(10)

Unnamed: 0,id,video_title,creator_nickname,video_url,sound_transcribed,like,share,comment,view,hashtags,predicted_hashtag_words
0,7061263400738802990,trending fyp manga anime kill ua anime ti...,K,https://v16-webapp.tiktok.com/3459683bb68c8979...,okay right,22000,319,97,91600,[],
1,7047633830370364719,Reply to @whosaleece N <3,user,https://v16-webapp.tiktok.com/85adc4b700a00c26...,,6500000,68800,157000,31600000,[],
2,7055138775889906991,Reply to @that_one_killer Why does Adrian loo...,Nic Suarez,https://v16-webapp.tiktok.com/1fcb88b9fe227ed2...,,537800,1123,2815,4800000,[],
3,7060751888899378478,Follow for more 😂❤️,Pain Ezra🔥,https://v16-webapp.tiktok.com/86345f8907915192...,,19200,1872,6180,244000,[],
4,7048757416300645637,How 2022 started….,Gashi,https://v16-webapp.tiktok.com/fd08417e18500465...,,6300000,168500,110700,52200000,[],
5,7059114604395482415,f naf vanny,Jake Fellman,https://v16-webapp.tiktok.com/374e04a7da44abc6...,,567100,1645,4037,9400000,[],
6,7061443118813515054,February 9th game day . winter olympics olym...,maddie_mastro,https://v16-webapp.tiktok.com/22a1f986f81857e5...,,104100,167,495,1100000,[],
7,7060467798178286895,First qualies of the Games 🔥 fyp for you...,U.S. Ski & Snowboard,https://v16-webapp.tiktok.com/d6764d781a9ed574...,good,271500,471,760,2600000,[],
8,7060861444266708271,When you make it to your second Olympics and y...,maddie_mastro,https://v16-webapp.tiktok.com/05d6d165a0289994...,,39800,88,316,508700,[],
9,7042688768872123649,Frying Real Human Eggs 🍳 funny videos japan...,memes,https://v16-webapp.tiktok.com/05230af57fddf570...,,1600000,38300,5649,38600000,[],


In [78]:
tiktoks.shape

(389, 11)

In [7]:
#examples of word segementing:
print(segment('fyp bts btsarmy btsconcert2021 taehyung ptdonstage btsv')) #doesn't work that well here
print(segment('sinkspetic plumbersoftiktok')) #doesn't work well on first hashtag, works well on second
print(segment('SephoraLipLooks')) #works well
print(segment('singing loveisgone')) #works well

['fypbtsbtsarmybts', 'concert', '2021taehyungptdonstagebt', 'sv']
['sinks', 'pet', 'ic', 'plumbers', 'of', 'tik', 'tok']
['sephora', 'lip', 'looks']
['singing', 'love', 'is', 'gone']


In [227]:
segment('fyp bts btsarmy btsconcert2021 taehyung ptdonstage btsv')

['fypbtsbtsarmybts', 'concert', '2021taehyungptdonstagebt', 'sv']

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


# Sentiment Analysis

In [16]:
#pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [23]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

def get_sentiment(tiktoks):
    #combine title and sound_transcribed
    tiktoks['title_and_sound'] = tiktoks['video_title'] + ' ' + tiktoks['sound_transcribed']
    sid = SentimentIntensityAnalyzer()

    tiktoks['Negative Sentiment'] = tiktoks['title_and_sound'].apply(lambda x: sid.polarity_scores(x)['neg'])
    tiktoks['Neutral Sentiment'] = tiktoks['title_and_sound'].apply(lambda x: sid.polarity_scores(x)['neu'])
    tiktoks['Positive Sentiment'] = tiktoks['title_and_sound'].apply(lambda x: sid.polarity_scores(x)['pos'])
    tiktoks['Compound Sentiment'] = tiktoks['title_and_sound'].apply(lambda x: sid.polarity_scores(x)['compound'])
    return tiktoks


In [113]:
tiktoks = get_sentiment(tiktoks)

In [10]:
tiktoks.head(10)

Unnamed: 0,id,video_title,creator_nickname,sound_transcribed,like,share,comment,view,hashtags,predicted_hashtag_words,title_and_sound,Negative Sentiment,Neutral Sentiment,Positive Sentiment,Compound Sentiment
0,7061263400738802990,trending fyp manga anime kill ua anime ti...,K,okay right,22000,319,97,91600,"[trending, fyp, manga, anime, killua, animetik...",trending fyp manga anime kill ua anime tik tok,trending fyp manga anime kill ua anime ti...,0.301,0.577,0.122,-0.5859
1,7047633830370364719,Reply to @whosaleece N <3,user,,6500000,68800,157000,31600000,[],,Reply to @whosaleece N <3,0.0,0.508,0.492,0.4404
2,7055138775889906991,Reply to @that_one_killer Why does Adrian loo...,Nic Suarez,,537800,1123,2815,4800000,"[trend, brother, edit, mirror, CloseYourRings,...",trend brother edit mirror close your rings not...,Reply to @that_one_killer Why does Adrian loo...,0.0,1.0,0.0,0.0
3,7060751888899378478,Follow for more 😂❤️,Pain Ezra🔥,,19200,1872,6180,244000,[],,Follow for more 😂❤️,0.0,1.0,0.0,0.0
4,7048757416300645637,How 2022 started….,Gashi,,6300000,168500,110700,52200000,[],,How 2022 started….,0.0,1.0,0.0,0.0
5,7059114604395482415,f naf vanny,Jake Fellman,,567100,1645,4037,9400000,"[FNaF, Vanny]",f naf vanny,f naf vanny,0.0,1.0,0.0,0.0
6,7061443118813515054,February 9th game day . winter olympics olym...,maddie_mastro,,104100,167,495,1100000,"[winterolympics, olympicspirit, womeninsports]",winter olympics olympic spirit women in sports,February 9th game day . winter olympics olym...,0.0,0.855,0.145,0.1779
7,7060467798178286895,First qualies of the Games 🔥 fyp for you...,U.S. Ski & Snowboard,good,271500,471,760,2600000,"[fyp, foryou, skitok, beijing2022, DuetDoWet, ...",fyp for you ski to kbeijing2022duetdowet winte...,First qualies of the Games 🔥 fyp for you...,0.0,0.828,0.172,0.4404
8,7060861444266708271,When you make it to your second Olympics and y...,maddie_mastro,,39800,88,316,508700,"[winterolympics, olympicspirit]",winter olympics olympic spirit,When you make it to your second Olympics and y...,0.0,0.85,0.15,0.4019
9,7042688768872123649,Frying Real Human Eggs 🍳 funny videos japan...,memes,,1600000,38300,5649,38600000,"[funnyvideos, japan, comedy, viral, amazing]",funny videos japan comedy viral amazing,Frying Real Human Eggs 🍳 funny videos japan...,0.0,0.432,0.568,0.8481


In [92]:
tiktoks.shape[0]

389

In [114]:
fig = px.scatter(tiktoks,
                 x = 'Positive Sentiment',
                 y = 'Negative Sentiment',
                 color = 'Compound Sentiment',
                 color_continuous_scale = ['rgb(255,0,80)', 'rgb(255,255,255)', 'rgb(0,242,234)'],
                 range_color = (-1.1,1.1),
                 template = 'plotly_dark',
                 hover_name = 'original_video_title',
                 hover_data = ['Neutral Sentiment', 'Compound Sentiment', 'like', 'sound_transcribed'],
                 size = 'like',
                 size_max = 10,
                 width = 800,
                 height = 600)
# fig.update_xaxes(range=[-0.1,1.1])
# fig.update_yaxes(range=[-0.1,1.1])
fig.update_traces(marker_sizemin = 3)

## Add video links to points
# plotAnnotes = []
# for i in np.arange(tiktoks.shape[0]):
#     plotAnnotes.append(dict(x = tiktoks['Positive Sentiment'][i],
#                             y = tiktoks['Negative Sentiment'][i],
#                             text = """<a href="{}">{}</a>""".format(tiktoks['video_url'][i]," "),
#                             showarrow = False,
#                             xanchor = 'center',
#                             yanchor = 'bottom'))
#fig.update_layout(annotations = plotAnnotes)

fig.show()

In [14]:
for title in tiktoks["video_title"][:10]:
    sid = SentimentIntensityAnalyzer()
    print("----------")
    print(title)
    ss = sid.polarity_scores(title)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

----------
     trending fyp manga anime kill ua anime tik tok
compound: -0.6908, neg: 0.37, neu: 0.63, pos: 0.0, 
----------
Reply to @whosaleece N <3
compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42, 
----------
Reply to @that_one_killer  Why does Adrian look cooler than me in this tho... @thaflex      trend brother edit mirror close your rings not viral
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
----------
Follow for more 😂❤️
compound: 0.3892, neg: 0.155, neu: 0.573, pos: 0.272, 
----------
How 2022 started….
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
----------
 f naf vanny
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
----------
February 9th game day .   winter olympics olympic spirit women in sports
compound: 0.1779, neg: 0.0, neu: 0.866, pos: 0.134, 
----------
First qualies of the Games 🔥       fyp for you ski to kbeijing2022duetdowet winter olympics moguls
compound: -0.34, neg: 0.146, neu: 0.854, pos: 0.0, 
----------
When you make it to your second Olympics and you are 

In [15]:
sid.polarity_scores("💋 and 😁")

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.7003}

In [22]:
sid.polarity_scores("❤️")

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [26]:
sid.polarity_scores("😂")

{'neg': 0.218, 'neu': 0.345, 'pos': 0.437, 'compound': 0.4404}

In [115]:
sid.polarity_scores("😂😂😂😂😂")

{'neg': 0.218, 'neu': 0.345, 'pos': 0.437, 'compound': 0.926}

In [27]:
sid.polarity_scores("🔥")

{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.34}