# TikTok scraper
Scrape the TikTok trending page and extract information about the posts and the users who published them.

## Imports

In [103]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time
import pandas as pd

## Setting up the dataframes

In [104]:
# Posts table: one row per scraped post.
posts_columns = [
    'user_id',
    'post_desc',
    'song',
    'nb_likes',
    'nb_comments',
    'nb_shares',
]
posts_df = pd.DataFrame(columns=posts_columns)

# Users table: one row per visited user profile.
users_columns = [
    'user_id',
    'user_desc',
    'nb_followings',
    'nb_followers',
    'nb_likes',
]
users_df = pd.DataFrame(columns=users_columns)

## Setting up the scraper

In [105]:
# Browser configuration. The headless / fixed-window-size switches are kept
# here, disabled, so the browser window stays visible while scraping.
chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent='Applebot'")

# Start Chrome through the chromedriver binary located next to the notebook,
# then load the trending feed that the following cells scroll and parse.
driver = webdriver.Chrome(executable_path=r"./chromedriver", options=chrome_options)
driver.get("https://www.tiktok.com/trending")

## Scraper

### Scroll

In [None]:
SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll before re-measuring
MAX_SCROLLS = 3        # upper bound on the number of scroll attempts

# Page height before any scrolling; used to detect whether a scroll made the
# page grow.
last_height = driver.execute_script("return document.body.scrollHeight")

# Scroll to the bottom at most MAX_SCROLLS times, stopping early as soon as a
# scroll no longer increases the page height.
for i in range(MAX_SCROLLS):
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Compare the new page height with the previous one
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
    print(i)

### Picking elements

In [None]:
def _scrape_post(post):
    """Read one feed item and return its fields keyed by the posts_df columns.

    `post` is the WebElement of a single feed item. The description is the
    space-joined text of the <strong> elements under the item title, or ''
    when the item has no title element. Any other missing element raises
    NoSuchElementException.
    """
    user_id = post.find_element(By.CLASS_NAME, 'author-uniqueId').text
    try:
        title = post.find_element(By.CLASS_NAME, 'item-meta-title')
    except NoSuchElementException:
        post_desc = ''
    else:
        post_desc = ' '.join(el.text for el in title.find_elements(By.XPATH, './/strong'))
    return {
        'user_id': user_id,
        'post_desc': post_desc,
        'song': post.find_element(By.CLASS_NAME, 'music-title-decoration').text,
        'nb_likes': post.find_element(By.CSS_SELECTOR, "[title^='like']").text,
        'nb_comments': post.find_element(By.CSS_SELECTOR, "[title^='comment']").text,
        'nb_shares': post.find_element(By.CSS_SELECTOR, "[title^='share']").text,
    }


def _open_user_window(driver, user_id, main_window):
    """Open the profile page of `user_id` in a second window and switch to it."""
    # The URL is handed over as a script argument instead of being formatted
    # into the JavaScript source, so a scraped id cannot break the script.
    driver.execute_script(
        "window.open(arguments[0], 'new_window')",
        'http://www.tiktok.com/@{}'.format(user_id),
    )
    WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
    # Pick the handle that is not the main window rather than relying on the
    # order of window_handles.
    user_window = next(h for h in driver.window_handles if h != main_window)
    driver.switch_to.window(user_window)


def _scrape_user(driver, user_id):
    """Read the profile in the current window; return None if it never loads.

    Waits up to 20 seconds for the 'share-desc' element. On success returns a
    dict keyed by the users_df columns.
    """
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "share-desc"))
        )
    except TimeoutException:
        return None

    try:
        user_desc = driver.find_element(By.CLASS_NAME, 'share-desc').text
    except NoSuchElementException:
        user_desc = ''

    # Try the 'Followings' title first and fall back to 'Following'.
    try:
        nb_followings = driver.find_element(By.CSS_SELECTOR, "[title^='Followings']").text
    except NoSuchElementException:
        nb_followings = driver.find_element(By.CSS_SELECTOR, "[title^='Following']").text

    return {
        'user_id': user_id,
        'user_desc': user_desc,
        'nb_followings': nb_followings,
        'nb_followers': driver.find_element(By.CSS_SELECTOR, "[title^='Followers']").text,
        'nb_likes': driver.find_element(By.CSS_SELECTOR, "[title^='Likes']").text,
    }


main_window = driver.current_window_handle
items = driver.find_elements(By.CLASS_NAME, 'video-feed-item')

# Rows are collected in plain lists and turned into DataFrames once, instead
# of growing the frames row by row.
post_records = []
user_records = []
try:
    for i, post in enumerate(items, start=1):
        # Picking post elements
        post_record = _scrape_post(post)
        post_records.append(post_record)

        # Switching to the user page
        _open_user_window(driver, post_record['user_id'], main_window)
        try:
            user_record = _scrape_user(driver, post_record['user_id'])
        finally:
            # Always close the user page and return to the feed, even when a
            # lookup on the profile page fails.
            driver.close()
            driver.switch_to.window(main_window)

        if user_record is not None:
            user_records.append(user_record)
        print(i)
finally:
    # Build the tables even if the loop aborts, so rows scraped so far are
    # kept. Re-running this cell replaces the tables rather than appending.
    posts_df = pd.DataFrame(post_records, columns=posts_df.columns)
    users_df = pd.DataFrame(user_records, columns=users_df.columns)

In [None]:
# Show the posts table as the cell output.
posts_df

In [None]:
# Show the users table as the cell output.
users_df

In [None]:
# End the whole WebDriver session: quit() closes every window and stops the
# chromedriver process, whereas close() only closes the current window.
driver.quit()