<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href="#Initialization" data-toc-modified-id="Initialization-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Initialization</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Specify-Global-Variables" data-toc-modified-id="Specify-Global-Variables-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Specify Global Variables</a></span></li><li><span><a href="#Functions-and-Classes" data-toc-modified-id="Functions-and-Classes-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Functions and Classes</a></span></li><li><span><a href="#System-dependent-Configuration" data-toc-modified-id="System-dependent-Configuration-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>System-dependent Configuration</a></span></li></ul></li><li><span><a href="#Data-Import-and-Preprocessing" data-toc-modified-id="Data-Import-and-Preprocessing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Data Import and Preprocessing</a></span><ul class="toc-item"><li><span><a href="#Scrape-Twitter-Replies" data-toc-modified-id="Scrape-Twitter-Replies-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Scrape Twitter Replies</a></span></li></ul></li></ul></div>

# Introduction

<p> This playbook scrapes Twitter replies for a given post. </p>

# Initialization


<p> The imports, function and class definitions, global variables, and system-dependent configuration are in this section. </p>

<p> The system-dependent configuration should be carefully reviewed and configured for each system (e.g., Linux vs. Windows, or the path of an external program) since the playbook will most likely fail without proper configuration. </p>

## Imports

In [1]:
### This cell imports necessary Python modules and performs initial configuration

### Data manipulation libraries
# import json
import pandas as pd 
import csv

### Visualization and Interaction
# import matplotlib.pyplot as plt
# plt.style.use('ggplot')

from IPython.display import set_matplotlib_formats, display, clear_output, HTML
set_matplotlib_formats('retina')

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import VBox, HBox, Button, HTML

### Computation libraries 
import numpy as np
import re
import random

### Graph analysis
# import networkx as nx
# import community

### System related
# import sys
# import warnings;
# warnings.filterwarnings('ignore')

import io
# from joblib import Parallel, delayed

### Datetime libraries
from datetime import datetime
import time
from pytz import timezone

### NLP dependencies
# import spacy
# from spacy.tokenizer import Tokenizer
# nlp = spacy.load('en')
# tokenizer = Tokenizer(nlp.vocab)

# from langdetect import detect

### Scraping libraries
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup

### Machine learning libraries
# from sklearn import datasets
# from sklearn import linear_model
# from sklearn.feature_selection import f_regression, mutual_info_regression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report

### Logging
import logging 
logging.basicConfig(level=logging.INFO)

#import spacy
# nlp = spacy.load('en')

## Specify Global Variables

In [2]:
### Global parameters shared by the later cells of the playbook

# Directory path for raw data.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
RAW_DATA_DIRECTORY = "../data/raw/"

# Upper bound on the number of scroll steps the scraping loop performs
MAX_SCROLLS = 100

# True  -> Chrome opens with a visible window, so the scraping can be watched
# False -> Chrome is started with the --headless flag
WATCH_SCRAPING = True

## Functions and Classes

In [3]:
### This cell defines functions and classes used throughout the playbook

## System-dependent Configuration

In [4]:
### System-dependent settings (values that differ between e.g. Linux and Windows)

# Columns, in order, that are written to the exported CSV file
COLUMNS = ["commenter_id", "comment_text"]

# Path handed to Selenium as the chromedriver executable.
# The value assumes a particular directory structure and a Linux-based system.
# As of Sep 2, 2019, the chromedriver is version 76.X
EXECUTABLE_PATH = "../WebDriver/chromedriver"

# Data Import and Preprocessing


In [5]:
### This cell launches the Chrome browser (through Selenium) that the scraping cell drives

# Create the driver
chrome_options = webdriver.ChromeOptions()
if not WATCH_SCRAPING:
    # No visible window unless the user asked to watch the scraping
    chrome_options.add_argument('--headless')
chrome_options.add_argument('--incognito')

# Bound up-front so that a failed launch leaves `driver` as None instead of undefined
driver = None
try:
    driver = webdriver.Chrome(options=chrome_options, executable_path=EXECUTABLE_PATH)
    logging.info("Chrome launched")
except Exception:
    # `except Exception` lets KeyboardInterrupt/SystemExit through; exc_info keeps the
    # underlying Selenium error in the log so the cause of the failure is visible.
    logging.critical("Chrome could not be launched. Check if EXECUTABLE_PATH is configured correcely. If it is, check if the Chromedriver supports the version of the browser.", exc_info=True)
    

INFO:root:Chrome launched


## Scrape Twitter Replies

In [6]:
### This cell builds the input form of the scraper: a text box for the link of the
### tweet whose replies are wanted, and a button that starts the retrieval.

# Sizing goes through `layout` (the Text widget has no `width` trait), and
# `description_width='initial'` keeps the long label from being cut off.
text = widgets.Text(
    description="Twitter link (whole link): ",
    style={"description_width": "initial"},
    layout=widgets.Layout(width="600px"),
)
button = widgets.Button(description="Retrieve")
fb_selection = HBox([text, button])
display(fb_selection)

# Output area that receives everything printed by the button callback
out = widgets.Output()
display(out)

# NOTE(review): these three lists are not referenced by any code visible in this
# notebook; they are kept so that anything else relying on them keeps working.
comment_array_of_arrays = []
comment_array = []
text_of_page_arrays = []

# Matches the user name that follows an "@" sign (the "@" itself is not captured)
_TWITTER_HANDLE = re.compile(
    r'''
    (?<=@)
    ([\w\d_]+)       # username
    ''',
    re.UNICODE | re.VERBOSE)

_TIMELINE_XPATH = '//div[@aria-label="Timeline: Conversation"]'


def _join_text_nodes(nodes):
    """Return the given text nodes concatenated, each one prefixed with a single space."""
    return "".join(" " + str(node) for node in nodes)


def _launch_throwaway_driver():
    """Start a headless, incognito Chrome instance and quit it right away.

    Per the original author's note, an interaction with the browser is required
    to open up the tweets, and this was the easiest way to get one.
    NOTE(review): why this helps is not visible from the code - confirm it is still needed.
    """
    throwaway_options = webdriver.ChromeOptions()
    throwaway_options.add_argument('--headless')
    throwaway_options.add_argument('--incognito')
    throwaway_driver = webdriver.Chrome(options=throwaway_options, executable_path=EXECUTABLE_PATH)
    throwaway_driver.quit()


def _click_show_buttons(timeline):
    """Click every button inside ``timeline`` whose text contains "Show".

    Any failure (no buttons, element gone stale after a click, ...) ends the pass
    with a printed note; it never aborts the surrounding scrape.
    """
    try:
        # The leading "." keeps the search inside the timeline element; an XPath that
        # starts with "//" is evaluated against the whole document even when it is
        # called on an element.
        for candidate in timeline.find_elements_by_xpath('.//div[@role="button"]'):
            label_nodes = BeautifulSoup(candidate.get_attribute("innerHTML"), "html.parser").find_all(text=True, recursive=True)
            if "Show" in _join_text_nodes(label_nodes):
                candidate.click()
    except Exception:
        print("Button not found...")


def _parse_tweet(article):
    """Extract one CSV row from an ``<article>`` element of the conversation.

    Returns a dict with the keys "commenter_id" and "comment_text", or None when
    no user handle can be found (not every article holds a reply).
    "comment_text" is None when the text container is missing.
    """
    container = article.find("div", class_="r-1mi0q7o", recursive=True)
    if container is None:
        return None
    sections = container.find_all("div", class_="css-1dbjc4n", recursive=False)

    # Get user_id: the first section carries the display name and the @handle
    try:
        header_text = _join_text_nodes(sections[0].find_all(text=True, recursive=True)).replace("\n", " ")
        commenter_id = _TWITTER_HANDLE.findall(header_text)[0]
    except (IndexError, AttributeError):
        return None

    # Get the text: third-from-last child div of the second section
    comment_text = None
    try:
        body_parts = sections[1].find_all("div", recursive=False)
        comment_text = _join_text_nodes(body_parts[-3].find_all(text=True, recursive=True)).replace("\n", " ")
    except (IndexError, AttributeError):
        pass

    return {"commenter_id": commenter_id, "comment_text": comment_text}


def on_button_clicked(b):
    """Scrape the replies of the tweet whose link is in the text box and save them as CSV.

    The page is opened in the shared ``driver`` and scrolled in steps of 500 pixels,
    at most ``MAX_SCROLLS`` times. On every step the "Show ..." buttons of the
    conversation are clicked and the ``<article>`` elements currently in the timeline
    are collected. On every fourth step the loop checks whether new articles have
    appeared and stops when none have. The collected articles are then parsed into
    (commenter_id, comment_text) rows, de-duplicated, and written to
    ``TWITTER-POST-<last part of the link>-<timestamp>.csv`` in the working directory.

    Parameters
    ----------
    b : the button widget passed in by ``Button.on_click``; not used.

    Side effects
    ------------
    Rebinds the notebook globals ``tweet_list`` (parsed article elements, first-seen
    order), ``tweet_set`` (HTML strings of the articles seen), ``tweet_dict`` (left
    empty) and ``comment_row`` (list of row dicts) so they can be inspected afterwards.
    All progress messages go to the ``out`` widget.
    """
    global comment_row, tweet_set, tweet_list, tweet_dict
    tweet_list = []
    tweet_set = set()
    tweet_dict = dict()
    with out:
        clear_output()
        initial_page = text.value.strip()
        print("Retrieving posts and comments from " + str(initial_page))
        try:
            driver.get(initial_page)
            time.sleep(1)
            scrolls = 0

            # conditions to check whether there are more tweets
            prev_len = 0
            more_tweets = True
            while scrolls < MAX_SCROLLS and more_tweets:
                driver.execute_script("window.scrollTo(0, " + str(scrolls * 500) + ");")
                time.sleep(1)
                scrolls += 1

                try:
                    _launch_throwaway_driver()
                    time.sleep(1)
                    print("Starting parsing")
                    # TODO: Clicking takes the page to the top - need to find the right element to click
                    tw_timeline = driver.find_element_by_xpath(_TIMELINE_XPATH)
                    _click_show_buttons(tw_timeline)

                    timeline_html = tw_timeline.get_attribute("innerHTML")
                    tw_elements = BeautifulSoup(timeline_html, "html.parser").find("div").find_all("article", recursive=True)
                    for twe in tw_elements:
                        # The same article is visible on several scroll steps; its HTML is
                        # the key that keeps one copy while preserving first-seen order.
                        article_html = str(twe)
                        if article_html not in tweet_set:
                            tweet_set.add(article_html)
                            tweet_list.append(twe)

                    # Every fourth step: stop once no new article has shown up
                    if scrolls % 4 == 3:
                        if len(tweet_set) > prev_len:
                            prev_len = len(tweet_set)
                        else:
                            more_tweets = False

                    print(scrolls)
                    print("---")
                except Exception:
                    print("Error in finding the element...")
                    time.sleep(1)

        except Exception:
            logging.exception("Error retrieving the page. Try again.")

        # An interaction with the browser is required to open up the tweets
        # This is the easiest way
        try:
            _launch_throwaway_driver()
        except Exception:
            # Not being able to do this must not throw away what was already collected
            logging.warning("Could not launch the auxiliary Chrome instance.", exc_info=True)

        comment_row = []
        for tw_element in tweet_list:
            comment_data = _parse_tweet(tw_element)
            if comment_data is not None:
                comment_row.append(comment_data)

        # Explicit columns keep the export working when nothing was scraped.
        # The same reply can still come from articles whose HTML differs slightly,
        # hence the additional drop_duplicates (which keeps the first occurrence).
        df_comments = pd.DataFrame(comment_row, columns=COLUMNS).drop_duplicates()

        # Last path segment of the link, without a trailing slash or "?query" part
        post_id = initial_page.rstrip("/").split("/")[-1].split("?")[0]
        df_comments.to_csv("TWITTER-POST-" + post_id + "-" + datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + ".csv", index=False, na_rep='None', columns=COLUMNS)
        print("Data scraping finished...")
# Run on_button_clicked every time the "Retrieve" button is pressed
button.on_click(on_button_clicked)

# Example status Twitter link: https://twitter.com/JZarif/status/1253698754575765515

HBox(children=(Text(value='', description='Twitter link (whole link): '), Button(description='Retrieve', style…

Output()

In [7]:
# Scratch cell: a small two-element set
a = {"onur", "ena"}

In [8]:
# Scratch check: number of elements in `a`
len(a)

2