In [1]:
import os
import pandas as pd
import datetime as dt
import requests
from bs4 import BeautifulSoup as bs
import json
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

In [109]:
def get_main_html_source(show_alias):

    site = 'https://www.instagram.com/{0}/channel/?hl=en'.format(show_alias)
    
    browser = webdriver.Chrome(executable_path=r"C:\Users\Fang\Desktop\Python Trading\Trading\chromedriver.exe")

    browser.get(site)

    source_list = []

    source_list.append(browser.page_source)

    SCROLL_PAUSE_TIME = 0.5

    # Get scroll height
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:

        curr_page_source = browser.page_source
        source_list.append(curr_page_source)

        # Scroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    browser.quit()
    
    return source_list

def ig_pull_init_function(source):
    curr_post_divs = bs(source, 'lxml').find('main').find_all("a")

    post_init_info = {'post_title': [],
                      'post_url': [],
                      'duration': []}

    for a in curr_post_divs:
        href = a['href'] 
        text = a.text
        if '/tv/' in href:
            try: 
                post_init_info['post_url'].append('https://www.instagram.com{}'.format(href))
                post_init_info['post_title'].append(a.find('div', attrs={'class': 'pu1E0'}).text)
                post_init_info['duration'].append(a.find('div', attrs={'class': 'zncDM'}).text)
            except:
                post_init_info['post_url'].append('https://www.instagram.com{}'.format(href))
                post_init_info['post_title'].append('N/A')
                post_init_info['duration'].append('N/A')

    return pd.DataFrame(post_init_info)
    
def get_post_info(curr_url):

    curr_post = bs(requests.get(curr_url).text, 'lxml')

    curr_post_json = json.loads(curr_post.find('script', type = 'application/ld+json').text.strip())

    alternate_name = curr_post_json['author']['alternateName']
    commentCount = curr_post_json['commentCount']
    
    try:
        caption = curr_post_json['caption']
    except:
        caption = ''
        
    likes = curr_post_json['description'].split(' Likes, ')[0].replace(',', '')
    views = curr_post_json['interactionStatistic']['userInteractionCount']
    uploadDate = curr_post_json['uploadDate']

    curr_post_df = pd.DataFrame({'alternate_name': alternate_name,
                                 'commentCount': commentCount,
                                 'caption': caption,
                                 'likes': likes,
                                 'views': views,
                                 'uploadDate': uploadDate}, index = [0])
    return curr_post_df

def get_post_info_alternate(curr_url, show_alias): 

    curr_post = bs(requests.get(curr_url).text, 'lxml')

    raw_dict_string = str(curr_post.find_all('body')[0].find('script')).split('_sharedData = ')[1].replace(';</script>','')
    raw_json = json.loads(raw_dict_string)
    post_json = raw_json['entry_data']['PostPage'][0]['graphql']['shortcode_media']

    curr_post_df = pd.DataFrame({'alternate_name': show_alias,
                                 'commentCount': post_json['edge_media_preview_comment']['count'],
                                 'caption': post_json['edge_media_to_caption']['edges'][0]['node']['text'],
                                 'likes': post_json['edge_media_preview_like']['count'],
                                 'views': post_json['video_view_count'],
                                 'uploadDate': 'N/A'}, index = [0])
    
    return curr_post_df

def get_show_info(show_alias):

    source_list = get_main_html_source(show_alias)

    source_post_links_df = pd.concat([ig_pull_init_function(source) for source in source_list], axis = 0).drop_duplicates().reset_index(drop = True)

    show_df = []

    for idx, row in source_post_links_df.iterrows():
        duration = row.duration
        title = row.post_title

        curr_url = row['post_url']
        
        try:
            curr_post_df = get_post_info(curr_url)
        except:
            try:
                curr_post_df = get_post_info_alternate(curr_url, show_alias)
            except:
                curr_post_df = pd.DataFrame({'alternate_name': show_alias,
                                             'commentCount': 'N/A',
                                             'caption': 'N/A',
                                             'likes': 'N/A',
                                             'views': 'N/A',
                                             'uploadDate': 'N/A'}, index = [0])

        curr_post_df['duration'] = duration
        curr_post_df['title'] = title
        curr_post_df['post_url'] = curr_url

        show_df.append(curr_post_df)

        print("Current Post: {0} --- Completed {1}%".format(title, round(idx/len(source_post_links_df)*100,2)))
        time.sleep(3)

    show_df = pd.concat(show_df, axis = 0).reset_index(drop = True)
    
    return show_df


In [113]:
##################### ADD SHOW NAMES TO LIST HERE #########################

show_aliases = ['fallontonight', 'agt', 'nbcsongland', 'ninjawarrior', 'nbcbluffcitylaw', 'nbcbringthefunny', 
                'nbccouncilofdads', 'nbcindebted', 'nbclincoln','nbcperfectharmony','nbcsongland','nbcsunnyside',
                'nbctheinbetween',
               'nbckenanshow','zoeysplaylist','nbcsnl','nbcbrooklyn99','nbcworldofdance',
               'nbcthisisus','nbcthegoodplace','nbcsvu','latenightseth','nbcsuperstore']

#################### LOOP TO CREATE DATAFRAME OF ALL DATA ##############################

all_shows_df = []

failed_list = []

for i, show in enumerate(show_aliases):
    
    try:
        curr_show_data_df = get_show_info(show)
        all_shows_df.append(curr_show_data_df)
    except:
        failed_list.append(show)
    print("Current Show: {0} --- Completed {1}%".format(show, round(i/len(show_aliases)*100,2)))
    time.sleep(5)

print("Completed 100%")

all_shows_df = pd.concat(all_shows_df, axis = 0).reset_index(drop = True)

currdate = dt.datetime.today().strftime('%Y-%m-%d')

################### SAVE TO CSV ############################
all_shows_df.to_csv('igtv_extract_{}.csv'.format(currdate))

Current Post: Ron Burgundy (Will Ferrell) has beef with Shawn Mendes --- Completed 0.0%
Current Post: Jimmy and @queereye’s Fab Five ride The BEAST --- Completed 1.96%
Current Post: Jimmy & @postmalone go head-to-head in Beer Pong --- Completed 3.92%
Current Post: Cue Card Cold Read w/ @juliannemoore --- Completed 5.88%
Current Post: Jimmy addresses the El Paso and Dayton tragedies --- Completed 7.84%
Current Post: We Test How Well @QueerEye’s Fab Five Know Each Other --- Completed 9.8%
Current Post: #SummerSongs with @camila_cabello --- Completed 11.76%
Current Post: Summer Science with Resident Science Expert Kevin Delaney --- Completed 13.73%
Current Post: Wheel of Opinions with @davidspade --- Completed 15.69%
Current Post: Mad Lib Theater With @Kenanthompson And @Joemanganiello --- Completed 17.65%
Current Post: Jimmy and @stephenathome Perform the “NeverEnding Story” Duet --- Completed 19.61%
Current Post: @jackblack gives us a taste of the Sax-A-Boom --- Completed 21.57%
Current

Current Post: Second chances can change lives. --- Completed 47.06%
Current Post: Nine dogs. Four humans. One wild ride. #AGT is all-new TUESDAY 8/7c on @NBC --- Completed 48.24%
Current Post: Nine dogs. Four humans. One wild ride. #AGT is all-new TONIGHT 8/7c on @NBC --- Completed 49.41%
Current Post: @tylerbutlerfigueroaviolinist, earned his place at the live shows! ⭐️ #AGT --- Completed 50.59%
Current Post: Our grandpas are stronger than your grandpas. 💪👨‍🦳 #AGT --- Completed 51.76%
Current Post: @DomjChambers: voted best person to go to the bar with. 🍻 --- Completed 52.94%
Current Post: @charlottesummersofficial put a spell on us! 😍 #AGT --- Completed 54.12%
Current Post: @itsjosephallen had the audience on their feet right from the start! 🔥#AGT --- Completed 55.29%
Current Post: @LBKidsOfficial lit up the stage! 🔥🔥 #AGT --- Completed 56.47%
Current Post: @beniciobryant hit the judges with raw talent and authenticity. 👏 #AGT --- Completed 57.65%
Current Post: Coconuts: ✅ Blindfold:

Current Post: These Ninjas are straight fire 🔥. #AmericanNinjaWarrior --- Completed 52.78%
Current Post: Make way for the KING!🤴#AmericanNinjaWarrior --- Completed 54.17%
Current Post: @sparklyninja is BACK and better than ever! 💪 #AmericanNinjaWarrior --- Completed 55.56%
Current Post: @tmortimer08 is one of the many reasons why we celebrate our dads today. --- Completed 56.94%
Current Post: FEAR. THE. BEARD #AmericanNinjaWarrior --- Completed 58.33%
Current Post: We're constantly impressed with the endurance these Ninjas bring. --- Completed 59.72%
Current Post: He's called the @reallifeninja for a reason. 👏 #AmericanNinjaWarrior --- Completed 61.11%
Current Post: @ninjajessclay is proof that anything is possible. #AmericanNinjaWarrior --- Completed 62.5%
Current Post: May the best pet win! --- Completed 63.89%
Current Post: @imalexweber is LITERALLY crashing the course! Go Alex go! --- Completed 65.28%
Current Post: @TheChadFlexington, @mrsuccessfu1, and @burkinator500 blew us away!

Current Post: LEAKED: World of Dance: Embodiment Qualifiers Performance --- Completed 93.33%
Current Post: World of Dance: Eva Igo Qualifiers Performance --- Completed 96.67%
Current Show: nbcworldofdance --- Completed 73.91%
Current Show: nbcthisisus --- Completed 78.26%
Current Post: The Good Place SDCC Panel 2019 --- Completed 0.0%
Current Post: The Good Place Season 3 Gag Reel --- Completed 8.33%
Current Post: Jason: Your Sweetest, Dumbest Friend --- Completed 16.67%
Current Post: All of Eleanor's Realizations --- Completed 25.0%
Current Post: #TheGoodPlace is the feel good moments we experienced along the way. ❤️ --- Completed 33.33%
Current Post: Tahani's Humblebrags --- Completed 41.67%
Current Post: Our Favorite Ride-or-Die Couple --- Completed 50.0%
Current Post: Eleanor and Chidi: A Love Story --- Completed 58.33%
Current Post: Janet's Finest Moments - The Good Place (Mashup) --- Completed 66.67%
Current Post: Behind The Scenes Of “Janet(s)” --- Completed 75.0%
Current Post: 

In [2]:
show_aliases = ['fallontonight', 'agt', 'nbcsongland', 'ninjawarrior', 'nbcbluffcitylaw', 'nbcbringthefunny', 
                'nbccouncilofdads', 'nbcindebted', 'nbclincoln','nbcperfectharmony','nbcsongland','nbcsunnyside',
                'nbctheinbetween',
               'nbckenanshow','zoeysplaylist','nbcsnl','nbcbrooklyn99','nbcworldofdance',
               'nbcthisisus','nbcthegoodplace','nbcsvu','latenightseth','nbcsuperstore']

In [4]:
pd.DataFrame(index = show_aliases).to_csv('shows_list.csv')