# TikTok Data Preprocessing

In this notebook, the TikTok data for all three datasets (stimulus check, unemployment, and USPS) is imported from JSON, cleaned and tidied up, and each resulting DataFrame is saved as both a pickle file and a CSV file.

In [1]:
import json
from nltk import *
import re
import pandas as pd
import numpy as np

In [2]:
'''
@author osamah Abdelhaq last edit: 6/17/2020
'''

'\n@author osamah Abdelhaq last edit: 6/17/2020\n'

In [3]:
#delete every "New Post" entry (the code removes every other element, starting with the first)
def NewPost(data):
    '''
    removes the elements at positions 0, 2, 4, ... from data
    the list is changed in place and the same list object is returned
    '''
    # an extended-slice delete drops the same elements the index loop used to remove
    del data[::2]
    return data

In [4]:
def splits(data):
    '''
    deletes unwanted data and splits every post into its lines

    for each post the labels 'Suggested accounts', 'Report', 'Follow', 'Hide',
    'videos' and 'View more' are removed (replaced with an empty string), the
    text is stripped and split on newlines, and then the first 3 and the
    last 50 lines of the post (arbitrary page labeling) are dropped

    data: list of strings, one raw post per element; each element is
          overwritten in place with its cleaned text
    returns: list of lists of strings
             result[0] is the first post
             result[0][1] is the second remaining line of the first post
    '''
    # labels removed verbatim, in this order
    labels = ('Suggested accounts', 'Report', 'Follow', 'Hide', 'videos', 'View more')

    # Raw string with exactly the same pattern value the old non-raw literal
    # produced; the old spelling relied on invalid escape sequences, which
    # newer Python versions warn about.
    # NOTE(review): this pattern matches a backslash, then five literal 'w'
    # characters, then an optional word character. If the intent was "a
    # backslash followed by 5-6 word characters", the pattern would have to be
    # r'\\\w{5}\w?'. It is left unchanged here because changing what gets
    # removed could empty lines and shift the hard-coded positions used later
    # in this notebook - confirm against the data before changing it.
    escape_pattern = r'\\w{5}\w?'

    posts = []
    for i in range(len(data)):
        text = data[i]
        for label in labels:
            text = text.replace(label, '')
        text = re.sub(escape_pattern, '', text)
        # keep the in-place update of the input list that callers already see
        data[i] = text
        posts.append(text.strip().split('\n'))

    for lines in posts:
        #ignore arbitrary page labeling
        del lines[0:3]
        del lines[-50:]

    return posts

In [5]:
#deleting arbitrary entries
def arbitrary(splits):
    '''
    returns the same list after removing the entries that originally sat at
    positions 10, 12, 14, 17 and 26 (arbitrary website information)
    '''
    original_positions = (10, 12, 14, 17, 26)
    for already_removed, position in enumerate(original_positions):
        # every earlier deletion shifts the remaining entries one slot to the left
        del splits[position - already_removed]
    return splits

In [6]:
def videoInformation(post):
    '''
    returns the video information of one post: every line up to and including
    the first line that contains a 'K'
    includes: creator username, post date, caption, and view count
    this function assumes the first K in the post belongs to the view count,
    exceptions have been fixed manually
    raises IndexError when no line of the post contains a 'K'
    '''
    lines_with_k = []
    for position, line in enumerate(post):
        if 'K' in line:
            lines_with_k.append(position)
    first_k = lines_with_k[0]
    return post[:first_k + 1]

In [7]:
def getComments(splits,video_information):
    '''
    separates video information from the comments
    for every post, skips as many leading lines as that post's entry in
    video_information has and returns the remaining lines, one list per post
    '''
    return [
        post[len(video_information[position]):]
        for position, post in enumerate(splits)
    ]

In [8]:
def emptyStrings(all_comments):
    '''
    flattens the per-post comment lists into one list and removes empty strings
    '''
    return [
        entry
        for post_comments in all_comments
        for entry in post_comments
        if entry != ''
    ]

In [9]:
def pd_Frame(all_content):
    '''
    returns dataframe by separating into name, comment, and date

    all_content: flat list of strings laid out as repeating
                 (name, comment, date) records
    returns: DataFrame with the columns 'name', 'text' and 'date', one row
             per record
    raises: ValueError when the list length is not a multiple of three

    a record is skipped entirely when any of its three fields contains
    'View more' or 'View replies'; dropping the whole record keeps the three
    columns aligned
    '''
    markers = ('View more', 'View replies')

    # an incomplete trailing record would give columns of different lengths
    if len(all_content) % 3 != 0:
        raise ValueError(
            'all_content must hold complete (name, comment, date) records; '
            'got %d entries' % len(all_content)
        )

    name = []
    comment = []
    date = []

    for start in range(0, len(all_content), 3):
        record = all_content[start:start + 3]
        # test both markers explicitly; "'View more' and x" only ever tests x
        if any(marker in field for field in record for marker in markers):
            continue
        name.append(record[0])
        comment.append(record[1])
        date.append(record[2])

    frame = pd.DataFrame({'name':name,'text':comment,'date':date})

    return frame

In [11]:
#read in data: the whole file is parsed as JSON into `content`
with open('Tik_Tok_Stimulus_Check') as f:
    content = json.load(f)

#the bare string below is evaluated and discarded; it documents what this cell produces
'''
returns data frame of data with three columns, name, text, and date
returns video_information list that contains information on each video
first dimension index returns video information of that post. ie. video_information[0] = post[0]
includes: creator username, post date, caption, and view count

'''
    
#delete where 'New Post' is marked (NewPost drops every other entry, in place)
content = NewPost(content)

#parse data: strip the boilerplate labels and split every post into a list of lines
content = splits(content)

#remove unintentionally collected, irrelevant data (fixed positions, see arbitrary)
content = arbitrary(content)

#gathering video information: for every post, the lines up to and including
#the first line that contains a 'K'
stimulus_check_video_information = []

i=0

for i in range(len(content)):
    inf=videoInformation(content[i])
    stimulus_check_video_information.append(inf)
    
#parse comments from video information: whatever follows the video information in each post
comments = getComments(content,stimulus_check_video_information)

#remove empty strings; this also flattens all posts into one list
all_content = emptyStrings(comments)

#correcting data parsed incorrectly
#NOTE(review): all positions below are hard-coded; presumably they were found by
#inspecting this particular scrape and are only valid for that exact input file - confirm
#merge entries 4050..4052 into a single string
all_content[4050] = ' '.join(all_content[4050:4053])

#joining a one-element slice gives back the same text, so this assignment changes nothing
all_content[4050] = ' '.join(all_content[4050:4051])

#drop the two entries that were merged into 4050 (the second del hits the entry that slid into 4051)
del all_content[4051]
del all_content[4051]

#deleting any incidents where view replies was not clicked during the scraping process
#positions are collected first; `counter` offsets each one by the number of
#entries already deleted, since every deletion shifts the rest one slot left
idx=[i for i, s in enumerate(all_content) if 'View replies' in s]
counter = 0
for dex in idx:
    del all_content[dex-counter]
    counter+=1
    
#correcting parsing errors (positions refer to the list after the deletions above)
#merge entries 7186 and 7187 into one string
all_content[7186] = ' '.join(all_content[7186:7188])

#NOTE(review): only one entry was merged above but two entries are deleted here;
#presumably the second one is an unwanted entry - confirm against the data
del all_content[7187]
del all_content[7187]

#drop two consecutive entries starting at 6357
del all_content[6357]
del all_content[6357]

#merge entries 6490 and 6491, then drop the merged-in entry
all_content[6490] = ' '.join(all_content[6490:6492])

del all_content[6491]

#remaining deletions of single entries or two-entry slices
del all_content[7320:7322]

del all_content[7625]

del all_content[7659]

del all_content[7848:7850]

#build the name/text/date dataframe from the flat list
stimulus_check_frame = pd_Frame(all_content)

#save dataframe as pickle object (relative path, so it is written to the current working directory)
stimulus_check_frame.to_pickle('stimulus_tiktok_dataframe.p')

'''
emojis do not translate into csv, become converted into special characters
'''
#NOTE(review): absolute, machine-specific Windows path; this line fails on any
#machine where that directory does not exist
stimulus_check_frame.to_csv(r'C:\Users\osama\Desktop\Stimulus_Check_DF.csv')

In [12]:
# DataFrame.info() writes its summary to stdout and returns None, so the
# printed values are "None" followed by the first five rows of the frame.
print(stimulus_check_frame.info(), stimulus_check_frame.head(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2913 entries, 0 to 2912
Data columns (total 3 columns):
name    2913 non-null object
text    2913 non-null object
date    2913 non-null object
dtypes: object(3)
memory usage: 68.4+ KB
None
                   name                                               text  \
0  HI if you’re viewing  lmao bruh I’m smacked and I watched this and c...   
1         Ed Edd n Eddy                                     I felt that 💀💀   
2                    <3                                      @thugg_waffle   
3                     🦈                             This is the best video   
4            Ian Daniel                                               Same   

   date  
0  4-19  
1  4-21  
2  4-27  
3  4-27  
4  4-28  


In [13]:
#read in data: the whole file is parsed as JSON into `data`
with open('Tik_Tok_Unemployment') as f:
    data = json.load(f)
    

#the bare string below is evaluated and discarded; it documents what this cell produces
'''
returns data frame of data with three columns, name, text, and date
returns video_information list that contains information on each video
first dimension index returns video information of that post. ie. video_information[0] = post[0]
includes: creator username, post date, caption, and view count

'''
   
#delete where 'New Post' is marked (NewPost drops every other entry, in place)
data = NewPost(data)

#parse the data: strip the boilerplate labels and split every post into a list of lines
data = splits(data)

#remove unintentionally collected, irrelevant data
#the entries originally at positions 1, 3, 6, 11, 16 and 21 are dropped; each
#subtraction accounts for the entries already deleted before it
del data[1]
del data[3-1]
del data[6-2]
del data[11-3]
del data[16-4]
del data[21-5]

#gathering video information
#somewhat automated but errors in data collection forced me to manually slice some of the data
unemployment_video_information = []

#posts 0..8: lines up to and including the first line that contains a 'K'
for i in range(9):
    inf=videoInformation(data[i])
    unemployment_video_information.append(inf)
    
#manual slices: idx+1 keeps the first 7 lines of a post, idx keeps the first 6
idx = 6
unemployment_video_information.append(data[9][:idx+1])
unemployment_video_information.append(data[10][:idx])

#posts 11..15: automated again
for i in range(11,16):
    inf=videoInformation(data[i])
    unemployment_video_information.append(inf)

#NOTE(review): the next two lines slice the list literals [16] and [17], not
#data[16] and data[17], so the values appended are [16] and [17]. getComments
#only uses the length of each entry (1 here), so for posts 16 and 17 everything
#after the first line is treated as comments. This looks like a missing `data`,
#but the hard-coded positions further down were presumably chosen with this
#behaviour in place - confirm against the data before changing either.
unemployment_video_information.append([16][:idx])
unemployment_video_information.append([17][:idx])
unemployment_video_information.append(data[18][:idx])
unemployment_video_information.append(data[19][:idx])
unemployment_video_information.append(data[20][:idx+1])
unemployment_video_information.append(data[21][:idx+1])
unemployment_video_information.append(data[22][:idx])
unemployment_video_information.append(data[23][:idx])

#parse comments from the video information: whatever follows the video information in each post
comment = getComments(data,unemployment_video_information)

#remove empty strings; this also flattens all posts into one list
all_cont = emptyStrings(comment)

#deleting any incidents where view replies was not clicked during the scraping process
#positions are collected first; `count` offsets each one by the number of
#entries already deleted, since every deletion shifts the rest one slot left
indx=[i for i, s in enumerate(all_cont) if 'View replies' in s]

count = 0

for dex in indx:
    del all_cont[dex-count]
    count+=1
    
#correcting parsing errors
#NOTE(review): all positions below are hard-coded; presumably they were found by
#inspecting this particular scrape and are only valid for that exact input file - confirm
del all_cont[434]

#merge entries 2341 and 2342 into one string, then drop the merged-in entry
all_cont[2341] = ' '.join(all_cont[2341:2343])

del all_cont[2342]

#drop runs of unwanted entries (4, 2 and 4 entries respectively)
del all_cont[2997:3001]

del all_cont[3045:3047]

del all_cont[3081:3085]

#merge entries 3226 and 3227, then drop the merged-in entry
all_cont[3226] = ' '.join(all_cont[3226:3228])

del all_cont[3227]

#merge entries 3259..3261, then drop the two merged-in entries
all_cont[3259] = ' '.join(all_cont[3259:3262])

del all_cont[3260:3262]

del all_cont[3541]

#convert into dataframe with the columns name, text and date
unemployment_frame = pd_Frame(all_cont)

#save dataframe as pickle object (relative path, so it is written to the current working directory)
unemployment_frame.to_pickle('unemployment_tiktok_dataframe.p')

'''
emojis do not translate into csv, become converted into special characters
'''
#NOTE(review): absolute, machine-specific Windows path; this line fails on any
#machine where that directory does not exist
unemployment_frame.to_csv(r'C:\Users\osama\Desktop\Unemployment_DF.csv')

In [14]:
# DataFrame.info() writes its summary to stdout and returns None, so the
# printed values are "None" followed by the first five rows of the frame.
print(unemployment_frame.info(), unemployment_frame.head(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 3 columns):
name    1898 non-null object
text    1898 non-null object
date    1898 non-null object
dtypes: object(3)
memory usage: 44.6+ KB
None
                          name  \
0         Heather MacIvor Moul   
1                        Peace   
2         Heather MacIvor Moul   
3  Cathryn Patterson · Creator   
4            user1156407460130   

                                                text  date  
0                         cannot hear you bc lashes.   4-8  
1  If you close your eyes they turn all the way d...   4-8  
2                                          good tip!  4-13  
3                                         thank you!  4-13  
4  Lol stop she’s giving good tips and looks so g...  4-19  


In [15]:
#read in data: the whole file is parsed as JSON into `collection`
with open('Tik_Tok_USPS') as f:
    collection = json.load(f)
    
#the bare string below is evaluated and discarded; it documents what this cell produces
''''
returns data frame of data with three columns, name, text, and date
returns video_information list that contains information on each video
first dimension index returns video information of that post. ie. video_information[0] = post[0]
includes: creator username, post date, caption, and view count
'''

#parse the data: drop every other entry (NewPost), then strip the boilerplate
#labels and split every post into a list of lines (splits)
collection = NewPost(collection)
collection = splits(collection)

#remove unintentionally collected, irrelevant data
#first the leading line of the second post is removed, then the posts originally
#at positions 0, 13, 17, 20 and 26 are dropped; each subtraction accounts for the
#posts already deleted before it
del (collection[1][0])
del collection[0]
del collection[13-1]
del collection[17-2]
del collection[20-3]
del collection[26-4]

#gathering video information
#somewhat automated but errors in data collection forced me to manually slice some of the data
#automated entries use videoInformation (lines up to and including the first line
#containing a 'K'); manual entries keep the first 6 lines of the post
usps_video_information = []

usps_video_information.append(videoInformation(collection[0]))
usps_video_information.append(collection[1][:6])

#posts 2..4: automated
for i in range(2,5):
    inf=videoInformation(collection[i])
    usps_video_information.append(inf)

#posts 5..7: manual
usps_video_information.append(collection[5][:6])
usps_video_information.append(collection[6][:6])
usps_video_information.append(collection[7][:6])

#posts 8..10: automated
for i in range(8,11):
    inf=videoInformation(collection[i])
    usps_video_information.append(inf)

#post 11: manual
usps_video_information.append(collection[11][:6])

#posts 12..13: automated
for i in range(12,14):
    inf=videoInformation(collection[i])
    usps_video_information.append(inf)

#post 14: manual
usps_video_information.append(collection[14][:6])

#posts 15..21: automated
for i in range(15,22):
    inf=videoInformation(collection[i])
    usps_video_information.append(inf)

#posts 22..24: manual
usps_video_information.append(collection[22][:6])
usps_video_information.append(collection[23][:6])
usps_video_information.append(collection[24][:6])

#parse comments from the video information: whatever follows the video information in each post
comment = getComments(collection,usps_video_information)

#remove empty strings; this also flattens all posts into one list
all_collection = emptyStrings(comment)

#deleting any incidents where view replies was not clicked during the scraping process
#positions are collected first; `count` offsets each one by the number of
#entries already deleted, since every deletion shifts the rest one slot left
idex=[i for i, s in enumerate(all_collection) if 'View replies' in s]

count = 0

for i in idex:
    del all_collection[i-count]
    count+=1
    
#correcting parsing errors
#NOTE(review): all positions below are hard-coded; presumably they were found by
#inspecting this particular scrape and are only valid for that exact input file - confirm
del all_collection[351:353]

#merge entries 2758 and 2759 into one string, then drop the merged-in entry
all_collection[2758] = ' '.join(all_collection[2758:2760])
del all_collection[2759]

#drop three pairs of unwanted entries
del all_collection[2925:2927]

del all_collection[3210:3212]

del all_collection[3255:3257]

#convert into dataframe with the columns name, text and date
usps_frame = pd_Frame(all_collection)

#save dataframe as pickle object (relative path, so it is written to the current working directory)
usps_frame.to_pickle('usps_tiktok_dataframe.p')

'''
emojis do not translate into csv, become converted into special characters
'''
#NOTE(review): absolute, machine-specific Windows path; this line fails on any
#machine where that directory does not exist
usps_frame.to_csv(r'C:\Users\osama\Desktop\USPS_DF.csv')

In [16]:
# DataFrame.info() writes its summary to stdout and returns None, so the
# printed values are "None" followed by the first five rows of the frame.
print(usps_frame.info(), usps_frame.head(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1335 entries, 0 to 1334
Data columns (total 3 columns):
name    1335 non-null object
text    1335 non-null object
date    1335 non-null object
dtypes: object(3)
memory usage: 31.4+ KB
None
                  name                                               text  \
0  U.S. Postal Service  Alright that’s pretty cool, how do we apply th...   
1                Alvez                 isn't Trump gonna dissolve y'all??   
2                C.Erk                                             Dead 😂   
3    UhaveAlotOfTalent  Hopefully pos postal service private is way be...   
4                 Andy  Honestly if showed this to trump supporters th...   

   date  
0  5-13  
1  5-14  
2  5-14  
3  5-14  
4  5-14  
