In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
import pandas as pd
import os
import numpy as np
import re
import time
import nltk
import requests
import random
import math
import pathlib 
from fake_useragent import UserAgent
from piapy import PiaVpn
from fuzzywuzzy import fuzz

import modules.hein_scraping_functions
from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait

In [2]:
# Create the paths for the data directories
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Create the paths for the Chrome binary and selenium driver
chrome_binary_path = pathlib.Path("C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe")
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initalize the browsers that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

driver.get("https://scholar.google.com/scholar?hl=en&as_sdt=5%2C36&sciodt=0%2C36&cites=17432944610365151854&scipsc=&q=%22Australian+Coastal+and+Marine+Law%22&oq=")


In [8]:
# Load the dataset from the input directory
google_scholar_df = pd.read_excel(input_path / "google_scholar_paper_list.xlsx", sheet_name='Sheet1')

In [9]:
# Check to see if the file for the alternate names data already exists.
# If it does, we only want to look for the missing observations
df_cur = intr_path / "_google_scholar_cites_df.xlsx"
if df_cur.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Set the append flag to 1
    append = 1
    # Create the dataset of existing alt names.
    df_existing_data = pd.read_excel(df_cur)
    # df_existing_data['ID'] = df_existing_data['ID'].apply(lambda x: '{0:0>4}'.format(x))
    # Complete a left outer join of the existing alt names and the lateral/control data to get 
    # a list of the names that we still need to scrape alt names for.
    data = pd.merge(google_scholar_df, df_existing_data[["ID", "Title", "google1"]], how = "outer", left_on = ["ID", "Title", "google1"], right_on = ["ID", "Title", "google1"], indicator=True)
    data = data[data['_merge'] == 'left_only']
    data = data.drop(["_merge"], axis = 1)
    append_df = df_existing_data
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    data = google_scholar_df
    append_df = pd.DataFrame().reindex(columns=list(google_scholar_df.columns) + ["google scholar cite count"])

data = data[data["google1"] == 4]
data = data[data["ID"] != ""]
data.replace(np.nan, '', regex=True, inplace = True)
data["google scholar cite count"] = np.nan
data["google scholar article name"] = ""
data.reset_index(drop=True, inplace = True)
data

Data already exists. Names that have already been scraped will be skipped


Unnamed: 0,google1,FirstName,LastName,Title,google scholar search,ID,PaperType,Authors,NumCoauthors,BBCite,...,Journal,VolFirst,Year,Pages,Lateral,NumPages,BookCites,LateralYear,google scholar cite count,google scholar article name
0,4,Richard,Delgado,Critical Race Theory: The Cutting Edge,Delgado Critical Race Theory: The Cutting Edge,59,,,2,,...,,,1995,,1,,514.0,2013.0,,
1,4,Bryant,Garth,Dealing In Virtue: International Commercial Ar...,Garth Dealing In Virtue: International Commerc...,89,,,2,,...,,,1998,,1,,468.0,2012.0,,
2,4,Herbert,Hovenkamp,Enterprise And American Law: 1836-1937,Hovenkamp Enterprise And American Law: 1836-1937,122,,,1,,...,,,1991,,1,,445.0,2017.0,,
3,4,Samuel,Moyn,The Last Utopia: Human Rights In History,Moyn The Last Utopia: Human Rights In History,190,,,1,,...,,,2010,,1,,432.0,2017.0,,
4,4,Mary,Dudziak,Cold War Civil Rights: Race And The Image Of A...,Dudziak Cold War Civil Rights: Race And The Im...,65,,,1,,...,,,2000,,1,,423.0,2012.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,4,Cathleen,Kaveny,Prophecy Without Contempt: Religious Rhetoric ...,Kaveny Prophecy Without Contempt: Religious Rh...,141,,,1,,...,,,2016,,1,,0.0,2014.0,,
193,4,Lolita,Inniss,The Princeton Fugitive Slave: James Collins J...,Inniss The Princeton Fugitive Slave: James Co...,129,,,1,,...,,,2019,,1,,0.0,2017.0,,
194,4,Wendell,Pritchett,Robert Clifton Weaver And The American City: T...,Pritchett Robert Clifton Weaver And The Americ...,217,,,1,,...,,,2008,,1,,0.0,2014.0,,
195,4,Gideon,Yaffe,Manifest Activity: Thomas Reid’S Theory Of Action,Yaffe Manifest Activity: Thomas Reid’S Theory ...,282,,,1,,...,,,2004,,1,,0.0,2012.0,,


In [10]:
def search_google_scholar(search_text, driver, chrome_binary_path, selenium_driver_path):    
    ua = UserAgent()
    userAgent = ua.random
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})
    
    # Look for the number of results pannel
    element = []
    while not element:
        try:
            driver.get(f"https://scholar.google.com/scholar?hl=en&as_sdt=5%2C36&sciodt=0%2C36&cites=17432944610365151854&scipsc=&q={search_text}")
            # If page contains "did not match any articles", move to the next name. If we hit the automated queries page, quit.
            page_text = driver.find_element_by_tag_name('body')
            if "did not match any articles" in page_text.text:
                return np.nan, "", driver
            elif "but your computer or network may be sending automated queries. To protect our users, we can't process" in page_text.text:
                driver.quit()
                driver = create_browser(chrome_binary_path, selenium_driver_path)
            element = driver.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div/div/div[3]/a[3]')
        except NoSuchElementException:
            try:
                element = driver.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div/div/div[2]/a[3]')
            except NoSuchElementException:
                print('Page has not loaded, switching user agent and VPN')
                # Switch User Agent to new random value
                ua = UserAgent()
                userAgent = ua.random
                driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})
                # Switch VPN to new random value
                vpn = PiaVpn()
                vpn.set_region(server='random')
                vpn.connect(verbose=True, timeout=20)
                time.sleep(3)

    results_count_text = element.text

    # This only matches integers up to 999,999, but I doubt that will be a problem.
    match = re.search(r"^Cited by (0|[1-9]\d{0,2},?\d*)", results_count_text)
    if match == None:
        result_count = 0
    else:
        result_count = match.group(1)
    
    # Get the article name
    try:
        element = driver.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div[1]/div/h3')
        article_name = element.text
    except NoSuchElementException:
        article_name = ""
        
    time.sleep(5*np.random.random() + 5) 
    return result_count, article_name, driver


In [11]:
for i in range(len(data)):
    # Read the current row of the df into a dictionary
    dict = data.to_dict('records')[i]
    # Update the cite count variable
    dict["google scholar cite count"], dict["google scholar article name"], driver = search_google_scholar(data.iloc[i, 4], driver, chrome_binary_path, selenium_driver_full_path)   
    append_df = append_df.append(dict, ignore_index = True)
    append_df.to_excel(intr_path / "_google_scholar_cites_df.xlsx", index = False)
    groupd_df = append_df[append_df["google1"] == 4]
    groupd_df.to_excel(intr_path / "_google_scholar_cites_group_4.xlsx", index = False)

Page has not loaded, switching user agent and VPN
VPN connected to: "mongolia"
Page has not loaded, switching user agent and VPN
VPN connected to: "austria"
Page has not loaded, switching user agent and VPN
VPN connected to: "cambodia"
Page has not loaded, switching user agent and VPN
VPN connected to: "bangladesh"
Page has not loaded, switching user agent and VPN
VPN connected to: "bulgaria"
Page has not loaded, switching user agent and VPN
VPN connected to: "liechtenstein"
Page has not loaded, switching user agent and VPN
VPN connected to: "macedonia"


In [12]:
groupd_df.replace(np.nan, '', regex=True, inplace = True)
groupd_df["Name Dist Ratio"] = groupd_df.apply(lambda x: fuzz.ratio(x["Title"].lower(), x["google scholar article name"].lower()), axis = 1)
groupd_df = groupd_df[['google1', 'FirstName', 'LastName', 'Title', 'google scholar article name', 'Name Dist Ratio', 'google scholar search', 'ID', 'PaperType', 'Authors', 'NumCoauthors', 'BBCite', 'BBCiteYear', 'BBCiteYearFirst', 'ArticleCites', 'CaseCites', 'Accessed', 'Journal','VolFirst', 'Year', 'Pages', 'Lateral', 'NumPages', 'BookCites', 'LateralYear', 'google scholar cite count']]
groupd_df.to_excel(intr_path / "_google_scholar_cites_group_4.xlsx", index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  groupd_df["Name Dist Ratio"] = groupd_df.apply(lambda x: fuzz.ratio(x["Title"].lower(), x["google scholar article name"].lower()), axis = 1)
