In [1]:
import math
import os
import pathlib
import random
import re
import time
from urllib.parse import quote_plus

import nltk
import numpy as np
import pandas as pd
import requests
from fake_useragent import UserAgent
from piapy import PiaVpn
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

import modules.hein_scraping_functions
from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait

In [2]:
# Resolve the project's data directories and the folder holding the Selenium driver
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Locations of the chromedriver executable and of the browser binary (Brave) it drives
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"
chrome_binary_path = pathlib.Path("C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe")

# Initialize the browser session used for scraping and open a Google Scholar results page
scholar_start_url = "https://scholar.google.com/scholar?hl=en&as_sdt=5%2C36&sciodt=0%2C36&cites=17432944610365151854&scipsc=&q=%22Australian+Coastal+and+Marine+Law%22&oq="
driver = create_browser(chrome_binary_path, selenium_driver_full_path)
driver.get(scholar_start_url)


In [3]:
# Read the list of papers to look up ('Sheet1' of the input workbook)
paper_list_path = input_path / "google_scholar_paper_list.xlsx"
google_scholar_df = pd.read_excel(paper_list_path, sheet_name='Sheet1')

In [4]:
# Resume support: if an intermediate results file from an earlier run exists,
# only the papers that are not in it yet still need to be scraped.
df_cur = intr_path / "_google_scholar_cites_df.xlsx"
if df_cur.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Flag that previously scraped rows are being carried over
    append = 1
    df_existing_data = pd.read_excel(df_cur)
    # Anti-join on (ID, Title): tag each input row with whether it also appears in the
    # existing results, then keep only the rows that do not.
    data = pd.merge(
        google_scholar_df,
        df_existing_data[["ID", "Title"]],
        how="left",
        on=["ID", "Title"],
        indicator=True,
    )
    data = data[data["_merge"] == "left_only"].drop(columns=["_merge"])
    append_df = df_existing_data
else:
    # Nothing scraped yet: start from the full input list and an empty results frame
    append = 0
    # Copy so the clean-up below never modifies google_scholar_df itself
    data = google_scholar_df.copy()
    append_df = pd.DataFrame().reindex(columns=list(google_scholar_df.columns) + ["google scholar cite count"])

# Blank out missing values, add the (still empty) result column and drop rows without an ID
data = data.fillna('')
data["google scholar cite count"] = np.nan
data = data[data["ID"] != ""].reset_index(drop=True)
print(data.head())
print(append_df.head())

Data already exists. Names that have already been scraped will be skipped
  google1    FirstName  LastName  \
0       1      Richard   Delgado   
1       1       Thomas     Healy   
2       1      Bernard  Harcourt   
3       1        James    Gathii   
4       1  Christopher    Serkin   

                                               Title  \
0  The Current Landscape of Race: Old Targets, Ne...   
1                   Return of the Campus Speech Wars   
2  Reflecting on the Subject: A Critique of the S...   
3  Neoliberalism, Colonialism and International G...   
4  Passive Takings: The State's Affirmative Duty ...   

                               google scholar search    ID PaperType  \
0  Delgado The Current Landscape of Race: Old Tar...    59     notes   
1             Healy Return of the Campus Speech Wars  1118   reviews   
2  Harcourt Reflecting on the Subject: A Critique...   104   article   
3  Gathii Neoliberalism, Colonialism and Internat...    90   reviews   
4  Serkin Pa

In [5]:
# Fixed part of the Google Scholar query URL; the URL-encoded search text is appended to it.
# NOTE(review): the cites=17432944610365151854 parameter is carried over unchanged from the
# original code; presumably it scopes the search to works citing one specific item --
# TODO confirm that this is intended for a per-paper lookup.
_SCHOLAR_QUERY_PREFIX = "https://scholar.google.com/scholar?hl=en&as_sdt=5%2C36&sciodt=0%2C36&cites=17432944610365151854&scipsc=&q="

# Text shown on the page Google serves when it refuses automated traffic.
_BLOCKED_MARKER = "but your computer or network may be sending automated queries. To protect our users, we can't process"

# Candidate locations of the link whose text is expected to start with "Cited by";
# they are tried in this order.
_CITED_BY_XPATHS = (
    '//*[@id="gs_res_ccl_mid"]/div/div/div[3]/a[3]',
    '//*[@id="gs_res_ccl_mid"]/div/div/div[2]/a[3]',
)


def _apply_random_user_agent(driver):
    """Override the browser's User-Agent with a freshly drawn random value."""
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": UserAgent().random})


def _switch_vpn_region():
    """Reconnect the PIA VPN through a random region, then pause for three seconds."""
    vpn = PiaVpn()
    vpn.set_region(server='random')
    vpn.connect(verbose=True, timeout=20)
    time.sleep(3)


def _find_cited_by_link(driver):
    """Return the first element found at one of the candidate XPaths, or None."""
    for xpath in _CITED_BY_XPATHS:
        try:
            return driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            continue
    return None


def search_google_scholar(search_text, driver, chrome_binary_path, selenium_driver_path, max_attempts=None):
    """Search Google Scholar for ``search_text`` and read the "Cited by" count from the results page.

    The page is reloaded until the expected link is found. After every failed attempt the
    user agent is re-randomized and the VPN is switched to a random region; if Google's
    "automated queries" page is shown, the browser is additionally closed and recreated.

    Parameters
    ----------
    search_text : str
        Free-text query. It is URL-encoded before being placed in the query string, so
        characters such as ``&``, ``#`` or ``?`` in a title cannot alter the request.
    driver : selenium webdriver
        Browser session to use.
    chrome_binary_path, selenium_driver_path
        Passed to ``create_browser`` when the browser has to be recreated.
    max_attempts : int or None, optional
        Maximum number of page loads before giving up. ``None`` (the default) retries
        without limit.

    Returns
    -------
    (count, driver)
        ``count`` is the integer parsed from the link text (thousands separators removed),
        ``0`` when the link text does not start with "Cited by", or ``np.nan`` when the page
        reports "did not match any articles" or ``max_attempts`` is exhausted. ``driver`` is
        the session to keep using; it differs from the argument if the browser was recreated.

    Notes
    -----
    The randomized 5-10 second pause happens only when a link was found; the ``np.nan``
    paths return immediately.
    """
    _apply_random_user_agent(driver)
    url = _SCHOLAR_QUERY_PREFIX + quote_plus(str(search_text))

    attempts = 0
    cited_by_link = None
    while cited_by_link is None:
        if max_attempts is not None and attempts >= max_attempts:
            return np.nan, driver
        attempts += 1

        driver.get(url)
        try:
            body_text = driver.find_element_by_tag_name('body').text
        except NoSuchElementException:
            body_text = ""

        # No hits for this query: nothing to count, move on to the next paper.
        if "did not match any articles" in body_text:
            return np.nan, driver

        if _BLOCKED_MARKER in body_text:
            # Blocked: this session is unusable, start a fresh browser and retry below.
            driver.quit()
            driver = create_browser(chrome_binary_path, selenium_driver_path)
        else:
            cited_by_link = _find_cited_by_link(driver)

        if cited_by_link is None:
            print('Page has not loaded, switching user agent and VPN')
            _apply_random_user_agent(driver)
            _switch_vpn_region()

    # Accept any digit/comma run so counts of a million or more are not truncated.
    match = re.search(r"^Cited by (\d[\d,]*)", cited_by_link.text)
    result_count = int(match.group(1).replace(",", "")) if match else 0

    # Random pause of 5-10 seconds between successful queries.
    time.sleep(5*np.random.random()+5)
    return result_count, driver


In [16]:
# Scrape the cite count for every paper flagged with google1 == 1. The accumulated results
# are written to disk after each paper so that an interrupted run can be resumed.
search_column = "google scholar search"
cite_column = "google scholar cite count"
checkpoint_path = intr_path / "_google_scholar_cites_df.xlsx"

# .copy() so the result column is added to an independent frame rather than to a slice
data = data[data["google1"] == 1].copy()
data[cite_column] = np.nan

for record in data.to_dict('records'):
    # The function returns the driver as well because it may have had to recreate the browser
    record[cite_column], driver = search_google_scholar(
        record[search_column], driver, chrome_binary_path, selenium_driver_full_path
    )
    append_df = pd.concat([append_df, pd.DataFrame([record])], ignore_index=True)
    append_df.to_excel(checkpoint_path, index=False)

Page has not loaded, switching user agent and VPN
VPN connected to: "switzerland"
Page has not loaded, switching user agent and VPN
VPN connected to: "cambodia"
Page has not loaded, switching user agent and VPN
VPN connected to: "bahamas"
Page has not loaded, switching user agent and VPN
VPN connected to: "au-perth"
Page has not loaded, switching user agent and VPN
VPN connected to: "de-frankfurt"
Page has not loaded, switching user agent and VPN
VPN connected to: "us-denver"


MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=50400): Max retries exceeded with url: /session/292821c4eed26cc0fac695f1c127cc71/goog/cdp/execute (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002393A4C86D0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))