# <center>Scraping TwitchTracker for the top 500 streamers for the top 10 games</center>
***

In [1]:
import pandas as pd
import numpy as np
import bs4
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Display floats with no decimal places when DataFrames are rendered.
pd.options.display.float_format = '{:.0f}'.format
# Configure Chrome to run headless (no visible browser window).
chrome_options = Options()
chrome_options.add_argument('--headless')
# Location of the chromedriver binary on the machine running the notebook.
path_to_driver = '/usr/bin/chromedriver'
# NOTE(review): passing the driver path positionally looks like the
# Selenium 3 calling convention; newer Selenium releases expect a Service
# object instead -- confirm against the installed version.
chrome = webdriver.Chrome(path_to_driver, options=chrome_options)

In [2]:
def save_df_as_csv(df, path=r'./data/top_streamers.csv'):
    """Write *df* to a CSV file, replacing any file already at *path*.

    Args:
        df: pandas DataFrame to persist. It is written without its index.
        path: Destination file. Defaults to ``./data/top_streamers.csv``,
            the location that used to be hard-coded. Missing parent
            directories are created.

    Returns:
        ``'file overriden'`` when a file already existed at *path*,
        otherwise ``'saved'``. The spelling of the first value is kept
        as-is because callers may compare against it.
    """
    import os

    directory = os.path.dirname(path)
    if directory:
        # makedirs(exist_ok=True) copes with nested paths and avoids the
        # check-then-create race of os.path.exists() + os.mkdir().
        os.makedirs(directory, exist_ok=True)

    existed = os.path.isfile(path)
    # to_csv opens the target in write mode, which already truncates an
    # existing file, so no separate os.remove() is required.
    df.to_csv(path, index=False)
    return 'file overriden' if existed else 'saved'

In [3]:
def load_csv(path):
    """Read the CSV file at *path* into a pandas DataFrame.

    Args:
        path: Location of an existing CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: if *path* is not an existing regular file.
            This is a subclass of ``Exception``, so callers that caught
            the previous generic ``Exception`` keep working.
    """
    import os

    if not os.path.isfile(path):
        # Include the offending path so the failure is actionable.
        raise FileNotFoundError('path does not exist: {}'.format(path))
    return pd.read_csv(path)

In [9]:
def get_top_n_most_subscribed_streamers(driver=None, n=10):
    """Scrape the TwitchTracker subscriber ranking page by page.

    Args:
        driver: Selenium-style web driver; only ``get`` and
            ``execute_script`` are used.
        n: Number of ranking pages to fetch, starting at page 1.

    Returns:
        DataFrame with the string columns ``'#'`` (rank with the leading
        ``#`` removed) and ``'streamer_name'``. Twenty rows are read
        from every page.

    Raises:
        Exception: if *driver* is ``None``.
        IndexError: if a page's table has fewer rows or cells than
            expected.
    """
    if driver is None:
        raise Exception('You have to provide a web driver')

    headers = ['#', 'streamer_name']
    url = 'https://twitchtracker.com/subscribers?page='
    rows = []
    for page in range(1, n + 1):
        print('extracting page ' + str(page))
        driver.get(url + str(page))
        html = driver.execute_script('return document.body.innerHTML;')
        soup = bs(html, 'lxml')
        # Collect the table rows once per page instead of re-running
        # find_all('tr') twice for every streamer.
        table_rows = soup.table.find_all('tr')
        for row_index in range(1, 22):
            # Row 0 is never read and row 11 is skipped, as before --
            # presumably a header and a non-streamer row (TODO confirm
            # against the live page).
            if row_index == 11:
                continue
            cells = table_rows[row_index].find_all('td')
            position = cells[0].text.replace('#', '')
            streamer_name = cells[3].a.text
            rows.append([position, streamer_name])
        # Pause between page loads.
        time.sleep(3)
    # Build the frame straight from the row list; the previous
    # np.array(...).squeeze() round-trip collapsed the wrong dimensions
    # when there were zero or one rows (e.g. n=0).
    return pd.DataFrame(rows, columns=headers)

In [10]:
def get_streamer_monthly_info(driver=None, names=''):
    """Scrape the monthly performance panel for each streamer in *names*.

    For every name the streamer's TwitchTracker page is opened, the
    second option of the ``select-performance`` switch is clicked and
    the first eight values of the ``performance-panel`` block are read.

    Args:
        driver: Selenium-style web driver; ``get``, ``find_element`` and
            ``execute_script`` are used.
        names: Iterable of streamer names. A single non-empty string is
            treated as one name rather than iterated per character.

    Returns:
        DataFrame with one row per successfully loaded streamer and the
        columns listed in ``headers``. Values are the raw strings from
        the page; a value that could not be found is ``None`` (it used
        to abort the whole scrape with ``IndexError``). Streamers whose
        page or period switch could not be loaded are skipped.

    Raises:
        Exception: if *driver* is ``None``.
    """
    if driver is None:
        raise Exception('You have to provide a web driver')

    headers = ['streamer_name', 'hours_streamed', 'average_viewers',
               'peak_viewers', 'hours_watched', 'followers_gained',
               'followers_per_hour', 'viewers_gained', 'active_days']

    url = 'https://twitchtracker.com/'
    button_xpath = '//*[@id="select-performance"]/span[2]'
    value_attrs = {'class': 'g-x-s-value g-x-s-contrast'}
    metric_count = len(headers) - 1  # every column except streamer_name

    if isinstance(names, str):
        names = [names] if names else []

    rows = []
    for name in names:
        print('extracting: ' + name)
        try:
            driver.get(url + name)
            # find_element(by, value) exists in both Selenium 3 and 4;
            # find_element_by_xpath was removed in Selenium 4.3.
            month_button = driver.find_element('xpath', button_xpath)
            month_button.click()
        except Exception as exc:
            # Best effort: skip this streamer, but say why. A bare
            # ``except`` would also swallow KeyboardInterrupt.
            print('skipping ' + name + ': ' + repr(exc))
            continue
        html = driver.execute_script('return document.body.innerHTML;')
        soup = bs(html, 'lxml')

        # Locate the panel and its value cells once per streamer.
        panel = soup.find('div', {'id': 'performance-panel'})
        cells = panel.find_all('div', value_attrs) if panel is not None else []

        row = [name]
        for index in range(metric_count):
            span = cells[index].find('span') if index < len(cells) else None
            value = span.text if span is not None else None
            if value is not None and index == metric_count - 1:
                # Only the part before '/' is kept for the last metric.
                value = value.split('/')[0]
            row.append(value)
        rows.append(row)
        # Pause between streamer pages.
        time.sleep(3)
    return pd.DataFrame(rows, columns=headers)

In [None]:
path = r'./data/top_streamers.csv'
# The ranking scrape below is disabled; the CSV at `path` is loaded instead.
#df = get_top_n_most_subscribed_streamers(driver=chrome, n=50)
#save_df_as_csv(df)
df = load_csv(path)
streamer_names = np.array(df['streamer_name'])
# Collect monthly stats for every streamer listed in the CSV.
hui = get_streamer_monthly_info(chrome, streamer_names)

In [12]:
# Persist the monthly stats (without the index) under ./data.
hui.to_csv('./data/hui.csv', index=False)

In [None]:
# Show the monthly stats DataFrame as the cell output.
hui

In [14]:
def get_streamer_weekly_info(driver=None, names=''):
    """Scrape the weekly performance panel for each streamer in *names*.

    For every name the streamer's TwitchTracker page is opened, the
    first option of the ``select-performance`` switch is clicked and the
    first eight values of the ``performance-panel`` block are read.

    Args:
        driver: Selenium-style web driver; ``get``, ``find_element`` and
            ``execute_script`` are used.
        names: Iterable of streamer names. A single non-empty string is
            treated as one name rather than iterated per character.

    Returns:
        DataFrame with one row per successfully loaded streamer and the
        columns listed in ``headers``. Values are the raw strings from
        the page; a value that could not be found is ``None``, so every
        row keeps all nine columns (a dropped value used to make the
        final reshape fail). Streamers whose page or period switch could
        not be loaded are skipped.

    Raises:
        Exception: if *driver* is ``None``.
    """
    if driver is None:
        raise Exception('You have to provide a web driver')

    headers = ['streamer_name', 'hours_streamed', 'average_viewers',
               'peak_viewers', 'hours_watched', 'followers_gained',
               'followers_per_hour', 'viewers_gained', 'active_days']

    url = 'https://twitchtracker.com/'
    button_xpath = '//*[@id="select-performance"]/span[1]'
    value_attrs = {'class': 'g-x-s-value g-x-s-contrast'}
    metric_count = len(headers) - 1  # every column except streamer_name

    if isinstance(names, str):
        names = [names] if names else []

    rows = []
    for name in names:
        print('extracting: ' + name)
        try:
            driver.get(url + name)
            # find_element(by, value) exists in both Selenium 3 and 4;
            # find_element_by_xpath was removed in Selenium 4.3.
            week_button = driver.find_element('xpath', button_xpath)
            week_button.click()
        except Exception as exc:
            # Best effort: skip this streamer, but say why. A bare
            # ``except`` would also swallow KeyboardInterrupt.
            print('skipping ' + name + ': ' + repr(exc))
            continue
        html = driver.execute_script('return document.body.innerHTML;')
        soup = bs(html, 'lxml')

        # Locate the panel and its value cells once per streamer.
        panel = soup.find('div', {'id': 'performance-panel'})
        cells = panel.find_all('div', value_attrs) if panel is not None else []

        row = [name]
        for index in range(metric_count):
            span = cells[index].find('span') if index < len(cells) else None
            value = span.text if span is not None else None
            if value is not None and index == metric_count - 1:
                # Only the part before '/' is kept for the last metric.
                value = value.split('/')[0]
            # Missing values are stored as None, not dropped, so the row
            # always lines up with ``headers``.
            row.append(value)
        rows.append(row)
        # Pause between streamer pages.
        time.sleep(3)
    return pd.DataFrame(rows, columns=headers)

In [None]:
# Collect weekly stats for the same streamer list used for the monthly run.
kur = get_streamer_weekly_info(chrome, streamer_names)

In [None]:
# Show the last rows of the weekly stats as the cell output.
kur.tail()

In [None]:
# Persist the weekly stats (without the index) under ./data.
kur.to_csv('./data/weekly_kur.csv', index=False)