In [42]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
import pandas as pd
import os
import numpy as np
import re
import time
import nltk
import requests
import random
import math
import pathlib 
from fake_useragent import UserAgent
from piapy import PiaVpn
from fuzzywuzzy import fuzz

import modules.hein_scraping_functions
from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait

In [20]:
# Create the paths for the data directories
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Create the paths for the Chrome binary and selenium driver
chrome_binary_path = pathlib.Path("C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe")
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initalize the browsers that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

driver.get("https://scholar.google.com/scholar?hl=en&as_sdt=5%2C36&sciodt=0%2C36&cites=17432944610365151854&scipsc=&q=%22Australian+Coastal+and+Marine+Law%22&oq=")


In [21]:
# Load the dataset from the input directory
google_scholar_df = pd.read_excel(input_path / "google_scholar_paper_list.xlsx", sheet_name='Sheet1')

In [25]:
# Check to see if the file for the alternate names data already exists.
# If it does, we only want to look for the missing observations
df_cur = intr_path / "_google_scholar_cites_df.xlsx"
if df_cur.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Set the append flag to 1
    append = 1
    # Create the dataset of existing alt names.
    df_existing_data = pd.read_excel(df_cur)
    # df_existing_data['ID'] = df_existing_data['ID'].apply(lambda x: '{0:0>4}'.format(x))
    # Complete a left outer join of the existing alt names and the lateral/control data to get 
    # a list of the names that we still need to scrape alt names for.
    data = pd.merge(google_scholar_df, df_existing_data[["ID", "Title", "google1"]], how = "outer", left_on = ["ID", "Title", "google1"], right_on = ["ID", "Title", "google1"], indicator=True)
    data = data[data['_merge'] == 'left_only']
    data = data.drop(["_merge"], axis = 1)
    append_df = df_existing_data
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    data = google_scholar_df
    append_df = pd.DataFrame().reindex(columns=list(google_scholar_df.columns) + ["google scholar cite count"])

data = data[data["google1"] == 2]
data = data[data["ID"] != ""]
data.replace(np.nan, '', regex=True, inplace = True)
data["google scholar cite count"] = np.nan
data["google scholar article name"] = ""
data.reset_index(drop=True, inplace = True)
data

Data already exists. Names that have already been scraped will be skipped


Unnamed: 0,google1,FirstName,LastName,Title,google scholar search,ID,PaperType,Authors,NumCoauthors,BBCite,...,Journal,VolFirst,Year,Pages,Lateral,NumPages,BookCites,LateralYear,google scholar cite count,google scholar article name
0,2,Colleen,Murphy,"#MeToo, Time's up, and Theories of Justice","Murphy #MeToo, Time's up, and Theories of Justice",1186,article,"Wexler, Lesley (Cited 216 times); Robbennolt, ...",3,2019 U. Ill. L. Rev. 45 (2019),...,University of Illinois Law Review,2019.0,2019,45-110,0,66,,,,
1,2,Gregory,Shaffer,China's Rise: How It Took on the U.S. at the WTO,Shaffer China's Rise: How It Took on the U.S. ...,242,article,"Shaffer, Gregory (Cited 964 times); Gao, Henry...",2,2018 U. Ill. L. Rev. 115 (2018),...,University of Illinois Law Review,2018.0,2018,115-184,1,70,,2014,,
2,2,Robin,Wilson,Getting the Government out of Marriage Post Ob...,Wilson Getting the Government out of Marriage ...,278,article,"Wilson, Robin Fretwell (Cited 614 times)",1,2016 U. Ill. L. Rev. 1445 (2016),...,University of Illinois Law Review,2016.0,2016,1445-1504,1,60,,2013,,
3,2,Bernard,Harcourt,Measured Interpretation: Introducing the Metho...,Harcourt Measured Interpretation: Introducing ...,104,article,"Harcourt, Bernard E. (Cited 962 times)",1,2002 U. Ill. L. Rev. 979 (2002),...,University of Illinois Law Review,2002.0,2002,979-1018,1,40,,2014,,
4,2,J.,Oldham,ALI Principles of Family Dissolution: Some Com...,Oldham ALI Principles of Family Dissolution: S...,1193,article,"Oldham, J. Thomas (Cited 607 times)",1,1997 U. Ill. L. Rev. 801 (1997),...,University of Illinois Law Review,1997.0,1997,801-832,0,32,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350,2,James,Liebman,Experimentalist Equal Protection,Liebman Experimentalist Equal Protection,1161,article,"Garrett, Brandon L. (Cited 1964 times); Liebma...",2,22 Yale L. & Pol'y Rev. 261 (2004),...,Yale Law and Policy Review,22.0,2004,261-328,0,68,,,,
1351,2,Steve,Vladeck,The Detention Power,Vladeck The Detention Power,274,notes,"Vladeck, Stephen I. (Cited 1031 times)",1,22 Yale L. & Pol'y Rev. 153 (2004),...,Yale Law and Policy Review,22.0,2004,153-196,1,44,,2015,,
1352,2,Bruce,Price,From Downhill to Slalom: An Empirical Analysis...,Price From Downhill to Slalom: An Empirical An...,1201,article,"Price, Bruce M. (Cited 38 times); Dalton, Terr...",2,26 Yale L. & Pol'y Rev. 135 (2007-2008),...,Yale Law and Policy Review,26.0,2007,135-208,0,74,,,,
1353,2,Hillary,Greene,Undead Laws: The Use of Historically Unenforce...,Greene Undead Laws: The Use of Historically Un...,1110,notes,"Greene, Hillary (Cited 135 times)",1,16 Yale L. & Pol'y Rev. 169 (1997-1998),...,Yale Law and Policy Review,16.0,1997,169-194,0,26,,,,


In [28]:
def search_google_scholar(search_text, driver, chrome_binary_path, selenium_driver_path):    
    ua = UserAgent()
    userAgent = ua.random
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})
    
    # Look for the number of results pannel
    element = []
    while not element:
        try:
            driver.get(f"https://scholar.google.com/scholar?hl=en&as_sdt=5%2C36&sciodt=0%2C36&cites=17432944610365151854&scipsc=&q={search_text}")
            # If page contains "did not match any articles", move to the next name. If we hit the automated queries page, quit.
            page_text = driver.find_element_by_tag_name('body')
            if "did not match any articles" in page_text.text:
                return np.nan, "", driver
            elif "but your computer or network may be sending automated queries. To protect our users, we can't process" in page_text.text:
                driver.quit()
                driver = create_browser(chrome_binary_path, selenium_driver_path)
            element = driver.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div/div/div[3]/a[3]')
        except NoSuchElementException:
            try:
                element = driver.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div/div/div[2]/a[3]')
            except NoSuchElementException:
                print('Page has not loaded, switching user agent and VPN')
                # Switch User Agent to new random value
                ua = UserAgent()
                userAgent = ua.random
                driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})
                # Switch VPN to new random value
                vpn = PiaVpn()
                vpn.set_region(server='random')
                vpn.connect(verbose=True, timeout=20)
                time.sleep(3)

    results_count_text = element.text

    # This only matches integers up to 999,999, but I doubt that will be a problem.
    match = re.search(r"^Cited by (0|[1-9]\d{0,2},?\d*)", results_count_text)
    if match == None:
        result_count = 0
    else:
        result_count = match.group(1)
    
    # Get the article name
    try:
        element = driver.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div[1]/div/h3')
        article_name = element.text
    except NoSuchElementException:
        article_name = ""
        
    time.sleep(5*np.random.random() + 5) 
    return result_count, article_name, driver


In [29]:
for i in range(len(data)):
    # Read the current row of the df into a dictionary
    dict = data.to_dict('records')[i]
    # Update the cite count variable
    dict["google scholar cite count"], dict["google scholar article name"], driver = search_google_scholar(data.iloc[i, 4], driver, chrome_binary_path, selenium_driver_full_path)   
    append_df = append_df.append(dict, ignore_index = True)
    append_df.to_excel(intr_path / "_google_scholar_cites_df.xlsx", index = False)
    groupd_df = append_df[append_df["google1"] == 2]
    groupd_df.to_excel(intr_path / "_google_scholar_cites_group_2.xlsx", index = False)

Page has not loaded, switching user agent and VPN
VPN connected to: "kazakhstan"
Page has not loaded, switching user agent and VPN
VPN connected to: "us-new-york"
Page has not loaded, switching user agent and VPN
VPN connected to: "iceland"
Page has not loaded, switching user agent and VPN
VPN connected to: "monaco"
Page has not loaded, switching user agent and VPN
VPN connected to: "uk-streaming-optimized"
Page has not loaded, switching user agent and VPN
VPN connected to: "estonia"
Page has not loaded, switching user agent and VPN
VPN connected to: "luxembourg"
Page has not loaded, switching user agent and VPN
VPN connected to: "morocco"
Page has not loaded, switching user agent and VPN
VPN connected to: "jp-tokyo"
Page has not loaded, switching user agent and VPN
VPN connected to: "uk-southampton"
Page has not loaded, switching user agent and VPN
VPN connected to: "greenland"
Page has not loaded, switching user agent and VPN
VPN connected to: "romania"
Page has not loaded, switching

In [51]:
groupd_df.replace(np.nan, '', regex=True, inplace = True)
groupd_df["Name Dist Ratio"] = groupd_df.apply(lambda x: fuzz.ratio(x["Title"].lower(), x["google scholar article name"].lower()), axis = 1)
groupd_df = groupd_df[['google1', 'FirstName', 'LastName', 'Title', 'google scholar article name', 'Name Dist Ratio', 'google scholar search', 'ID', 'PaperType', 'Authors', 'NumCoauthors', 'BBCite', 'BBCiteYear', 'BBCiteYearFirst', 'ArticleCites', 'CaseCites', 'Accessed', 'Journal','VolFirst', 'Year', 'Pages', 'Lateral', 'NumPages', 'BookCites', 'LateralYear', 'google scholar cite count']]
groupd_df.to_excel(intr_path / "_google_scholar_cites_group_2.xlsx", index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  groupd_df["Name Dist Ratio"] = groupd_df.apply(lambda x: fuzz.ratio(x["Title"], x["google scholar article name"]), axis = 1)
