In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import re
import requests
from bs4 import BeautifulSoup
import numpy as np
from scholarly import scholarly
import pandas as pd
from tqdm import tqdm
import time
import random
from fake_useragent import UserAgent

# Building a Citation Network of Finance Journal Articles and Their Citing Papers on Google Scholar

## 1. Web scraping papers published in the Journal of Financial Economics (JFE)

In [None]:
# URL of the issue archive of the Journal of Financial Economics (JFE) on ScienceDirect
afaurl = "https://www.sciencedirect.com/journal/journal-of-financial-economics/issues"

In [None]:
# Start a Chrome WebDriver session and open the JFE issue-archive page.
chrome_options = webdriver.ChromeOptions()
#chrome_options.proxy = prox  # optional proxy hook, currently disabled
driver = webdriver.Chrome(options=chrome_options)
driver.get(afaurl)

In [None]:
# The issues of each volume are organized in separate drop-down (accordion) lists.
# Collect the button element that toggles each of those lists.
# The XPath compares the complete class string, so it matches only buttons whose class attribute is exactly this text.
li = driver.find_elements(By.XPATH, "//button[@class='u-display-flex icon-right accordion-panel-title u-padding-s-ver u-text-left']")
# NOTE(review): this slice drops the first FOUR matches, whereas the earlier note here said
# "first three links are unrelated" — confirm which count is intended.
li_res = li[4:]

In [None]:
li_res

[<selenium.webdriver.remote.webelement.WebElement (session="1e9821a4ed0dc23a95776373dad13f5f", element="F9A2F421FCA172684460DDC5D14D7083_element_25")>,
 <selenium.webdriver.remote.webelement.WebElement (session="1e9821a4ed0dc23a95776373dad13f5f", element="F9A2F421FCA172684460DDC5D14D7083_element_26")>,
 <selenium.webdriver.remote.webelement.WebElement (session="1e9821a4ed0dc23a95776373dad13f5f", element="F9A2F421FCA172684460DDC5D14D7083_element_27")>,
 <selenium.webdriver.remote.webelement.WebElement (session="1e9821a4ed0dc23a95776373dad13f5f", element="F9A2F421FCA172684460DDC5D14D7083_element_28")>,
 <selenium.webdriver.remote.webelement.WebElement (session="1e9821a4ed0dc23a95776373dad13f5f", element="F9A2F421FCA172684460DDC5D14D7083_element_29")>,
 <selenium.webdriver.remote.webelement.WebElement (session="1e9821a4ed0dc23a95776373dad13f5f", element="F9A2F421FCA172684460DDC5D14D7083_element_30")>,
 <selenium.webdriver.remote.webelement.WebElement (session="1e9821a4ed0dc23a95776373dad1

In [None]:
# Walk through every volume drop-down: expand it, record the issue links and the
# issue labels that are present on the page, then click it again before moving on.

links = []
pub_year = []

for panel_button in li_res:
    panel_button.click()

    # hrefs of the issue anchors
    issue_anchors = driver.find_elements(By.XPATH, "//div[@class = 'issue-item u-margin-s-bottom']/a")
    link = [anchor.get_attribute('href') for anchor in issue_anchors]
    links.append(link)

    # visible text of each issue entry (volume, issue number, pages, date)
    issue_items = driver.find_elements(By.XPATH, "//div[@class = 'issue-item u-margin-s-bottom']")
    pub = [item.text for item in issue_items]
    pub_year.append(pub)

    panel_button.click()

In [None]:
# Flatten the per-volume lists into one array of issue URLs and one array of issue labels.
links1 = np.concatenate(links)
pub_year1 = np.concatenate(pub_year)

In [None]:
pub_year1

array(['Volume 152 \nIn progress (February 2024)',
       'Volume 151 \nJanuary 2024', 'Volume 150, Issue 3 \nDecember 2023',
       'Volume 150, Issue 2 \nNovember 2023',
       'Volume 150, Issue 1 \nPages 1-184 (October 2023)',
       'Volume 149, Issue 3 \nPages 349-610 (September 2023)',
       'Volume 149, Issue 2 \nPages 115-348 (August 2023)',
       'Volume 149, Issue 1 \nPages 1-114 (July 2023)',
       'Volume 148, Issue 3 \nPages 175-296 (June 2023)',
       'Volume 148, Issue 2 \nPages 91-174 (May 2023)',
       'Volume 148, Issue 1 \nPages 1-90 (April 2023)',
       'Volume 147, Issue 3 \nPages 475-688 (March 2023)',
       'Volume 147, Issue 2 \nPages 271-474 (February 2023)',
       'Volume 147, Issue 1 \nPages 1-270 (January 2023)',
       'Volume 146, Issue 3 \nPages 821-1170 (December 2022)',
       'Volume 146, Issue 2 \nPages 357-820 (November 2022)',
       'Volume 146, Issue 1 \nPages 1-356 (October 2022)',
       'Volume 145, Issue 3 \nPages 665-1024 (September 

In [None]:
links1[177:]

array(['https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/92/issue/3',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/92/issue/2',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/92/issue/1',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/91/issue/3',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/91/issue/2',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/91/issue/1',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/90/issue/3',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/90/issue/2',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/90/issue/1',
       'https://www.sciencedirect.com/journal/journal-of-financial-economics/vol/89/issue/3',
       'https://www.sciencedirect.com/journal/journal-of-fin

In [None]:
# Visit every issue page and collect
#   * jfe_date: the text of the issue's status header (contains the publication date), and
#   * jfe:      the full visible text of every article entry listed in that issue
#               (one list of strings per issue; split into columns in the next step).

jfe = []
jfe_date = []

for issue_url in links1:
    driver.get(issue_url)  # Access the issue

    # Wait for the issue header to be present before reading it; indexing the result of
    # find_elements() right after get() raises IndexError when the header is not there yet.
    status_header = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, "//h3[@class = 'js-issue-status text-s']"))
    )
    date = status_header.text  # Publication-status line of the issue
    jfe_date.append(date)

    article_items = driver.find_elements(
        By.XPATH,
        "//li[@class = 'js-article-list-item article-item u-padding-xs-top u-margin-l-bottom']",
    )
    jfe.append([item.text for item in article_items])  # Raw text of each article entry

In [None]:
# Data cleaning: every scraped article entry is one multi-line string. Split each entry on
# newlines so that its pieces become positional columns, tag every row with the status text
# of the issue it came from, and stack all issues into a single frame.

df = [
    pd.DataFrame([entry.split('\n') for entry in issue_entries]).assign(Date=issue_date)
    for issue_entries, issue_date in zip(jfe, jfe_date)
]

df_prof = pd.concat(df)

In [None]:
# Resulting dataset includes title of artile, authors, artilce ID, and date of publication. 

df_prof.head(30)

Unnamed: 0,0,1,2,3,4,5,Date
0,select article Delayed crises and slow recoveries,Research articleAbstract only,Delayed crises and slow recoveries,"Xuewen Liu, Pengfei Wang, Zhongchao Yang",Article 103757,Article preview,In progress (February 2024)
1,select article Learning about the consumption ...,Research articleOpen access,Learning about the consumption risk exposure o...,"Yongjin Kim, Lars-Alexander Kuehn, Kai Li",Article 103759,View PDFArticle preview,In progress (February 2024)
2,select article Why did shareholder liability d...,Research articleOpen access,Why did shareholder liability disappear?,"David A. Bogle, Gareth Campbell, Christopher C...",Article 103761,View PDFArticle preview,In progress (February 2024)
3,select article Disagreement about public infor...,Research articleAbstract only,Disagreement about public information quality ...,"Chong Huang, Radhika Lunawat, Qiguang Wang",Article 103762,Article preview,In progress (February 2024)
4,select article Stress tests and model monoculture,Research articleAbstract only,Stress tests and model monoculture,"Keeyoung Rhee, Keshav Dogra",Article 103760,Article preview,In progress (February 2024)
5,select article Independent regulators and fina...,Research articleAbstract only,Independent regulators and financial stability...,"Marco Del Angel, Gary Richardson",Article 103773,Article preview,In progress (February 2024)
6,select article Quantifying the impact of red t...,Research articleAbstract only,Quantifying the impact of red tape on investme...,"Bruno Pellegrino, Geoffery Zheng",Article 103763,Article preview,In progress (February 2024)
0,select article Editorial Board,Full text access,Editorial Board,Article 103767,View PDF,,January 2024
1,select article The use of asset growth in empi...,Research articleAbstract only,The use of asset growth in empirical asset pri...,"Michael Cooper, Huseyin Gulen, Mihai Ion",Article 103746,Article preview,January 2024
2,select article Monetary policy transmission in...,Research articleAbstract only,Monetary policy transmission in segmented markets,"Jens Eisenschmidt, Yiming Ma, Anthony Lee Zhang",Article 103738,Article preview,January 2024


In [None]:
# Further cleaning: keep only the needed columns, normalize the date text, and drop
# non-article entries.

# Positional columns 2 and 3 hold the title and the authors; 'Date' is the issue status text.
# Take an explicit copy so the edits below act on an independent frame instead of a slice of
# df_prof (assigning into a slice triggers pandas' chained-assignment warning).
df_save = df_prof[[2, 3, 'Date']].copy()

# The status text is either "... (Month YYYY)" or a bare "Month YYYY": capture the text inside
# the parentheses when present, otherwise the first "<word> <4 digits>" occurrence.
pattern = r'\((.*?)\)|(\w+\s+\d{4})'
df_save['Date'] = df_save['Date'].str.extract(pattern).apply(lambda x: ''.join(x.dropna()), axis=1)
df_save = df_save.rename(columns={2: 'Title', 3: 'Authors'})

# Remove entries that are not research articles.
non_article_titles = ['Editorial Board', 'Index', "Publisher's Note"]
df_save1 = df_save[~df_save['Title'].isin(non_article_titles)]

df_save1.to_excel('Published_Papers_JFE.xlsx')

In [None]:
df_save1

Unnamed: 0,Title,Authors,Date
0,Delayed crises and slow recoveries,"Xuewen Liu, Pengfei Wang, Zhongchao Yang",February 2024
1,Learning about the consumption risk exposure o...,"Yongjin Kim, Lars-Alexander Kuehn, Kai Li",February 2024
2,Why did shareholder liability disappear?,"David A. Bogle, Gareth Campbell, Christopher C...",February 2024
3,Disagreement about public information quality ...,"Chong Huang, Radhika Lunawat, Qiguang Wang",February 2024
4,Stress tests and model monoculture,"Keeyoung Rhee, Keshav Dogra",February 2024
...,...,...,...
3,The cross-section of expected corporate bond r...,"William R. Gebhardt, Soeren Hvidkjaer, Bhaskar...",January 2005
4,The timing of initial public offerings,"Simon Benninga, Mark Helmantel, Oded Sarig",January 2005
5,The effect of external finance on the equilibr...,"Heitor Almeida, Daniel Wolfenzon",January 2005
6,The “make or take” decision in an electronic m...,"Robert Bloomfield, Maureen O’Hara, Gideon Saar",January 2005


In [None]:
# Save the cleaned dataset as an Excel file named "Published_Papers_JFE.xlsx"
# (the frame's index is written as the first column).

df_save1.to_excel('Published_Papers_JFE.xlsx')

## 2. Scrape the Google Scholar citing papers of JFE papers

In [None]:
# Read the dataset of JFE paper information saved in Part 1, using the first column as the index.
# NOTE: this rebinds `jfe`, which held the list of scraped article strings in Part 1, to a DataFrame.

jfe = pd.read_excel("Published_Papers_JFE.xlsx",index_col=0)

### I. Functions

#### a. Titles of previous working paper versions

In [None]:
# Part 1.1.
# A paper may have appeared on Google Scholar as a working paper before being published in the JFE,
# and different versions may carry slightly different titles. We therefore record the titles of all
# versions. This function scrapes the titles shown on the results page currently open in `driver`.

def _parse_result_titles(page_html):
    """Return the set of lower-cased result titles found in one Google Scholar results page."""
    soup = BeautifulSoup(page_html, "html.parser")
    results = soup.body.find_all("div", class_="gs_r gs_or gs_scl")
    raw_titles = [result.find_all('h3')[0].text.lower() for result in results]
    # Keep only the text after the last "] " so a bracketed prefix is dropped from the title.
    return {raw.split("] ")[-1] for raw in raw_titles}


def working_paper_names(driver):
    """Return the distinct titles listed on the results page currently open in ``driver``.

    Parameters
    ----------
    driver : Selenium WebDriver whose current window shows a Google Scholar results page.

    Returns
    -------
    set of str
        Lower-cased titles with any leading "[...] " prefix removed.

    Raises
    ------
    Whatever the second attempt raises (for example a Selenium timeout when the results
    container never appears, or a parsing error).
    """
    try:
        return _parse_result_titles(driver.page_source)
    except Exception:
        # The script can run ahead of the page load, which makes the first parse fail.
        # Wait for the results container and parse once more. Catching Exception (rather
        # than using a bare except) lets Ctrl-C / kernel interrupts stop the run.
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID,'gs_res_ccl_mid')))
        return _parse_result_titles(driver.page_source)

In [None]:
# Part 1. Get the titles of all working paper versions of the input JFE paper.

def wp_for_one_paper(versions_num,driver,title,index):
    """Collect the distinct titles shown on the "versions" page(s) of one search result.

    Parameters
    ----------
    versions_num : number of versions parsed from the result's "versions" link (converted
        with int() here), or None when the result has no such link.
    driver : Selenium WebDriver currently showing the search-results page.
    title : title of the published paper; returned unchanged when versions_num is None.
    index : position of the target result among the page's 'gs_ri' elements.

    Returns
    -------
    A set of titles (from working_paper_names) when versions_num is not None; otherwise
    `title` itself, so the return type differs between the two cases.

    Side effects: opens and closes browser tabs and finishes by switching back to the window
    handle at index 0; prints progress information.
    """
    if versions_num != None:

        # CLICK ON VERSIONS LINK AND ACCESS PAGE
        # Ctrl+Enter on the link, then switch to the window handle at index 1.
        wpv = driver.find_elements(By.CLASS_NAME,'gs_ri')[index].find_elements(By.PARTIAL_LINK_TEXT,'versions')
        wpv[0].send_keys(Keys.CONTROL + Keys.RETURN)
        driver.switch_to.window(driver.window_handles[1])

        # DISTINGUISH BETWEEN SINGLE AND MULTIPLE PAGES.
        # A paper might have only one page of historical working paper versions, in which case we don't have to loop through page numbers.

        # Multiple pages of working paper versions
        if int(versions_num)>10:
            # NUMBER OF PAGES
            # N is used as the index of the last page (pages 0..N are visited below). When
            # versions_num is an exact multiple of 10 this is one more than needed; the loop
            # then stops early at the page that has no "Next" link.
            N =int(versions_num)//10
            print("# of pages for working version papers", N)
            ######################
            # LOOP THROUGH PAGES
            # `wpv` is reused here: from now on it is the list of per-page title sets.
            wpv = []
            iterate = True  # turned off once a page without a "Next" link is reached
            for n in range(N+1):
                if iterate == True:
                    if n != N:
                        wpv.append(working_paper_names(driver))
                        # INCLUDE HUMAN BEHAVIOR WHEN MOVING ONTO THE NEXT PAGE
                        time.sleep(np.random.uniform(0, 3))
                        # click next and open in new tab
                        next_pg = driver.find_elements(By.PARTIAL_LINK_TEXT,'Next')
                        # Guard against "list index out of range" when there is no "Next" link
                        print(next_pg)  # debugging output: prints the list of matched elements
                        if len(next_pg) ==0:
                            iterate = False
                            driver.close()
                            # done and switch to paper page
                            driver.switch_to.window(driver.window_handles[0])
                        else:
                            #print(next_pg[0].text)
                            # Use the LAST match when several links contain the text "Next"
                            next_pg[-1].send_keys(Keys.CONTROL + Keys.RETURN)
                            # close current tab
                            driver.close()
                            # switch to new tab (the handle at index 1 once the current tab is closed)
                            driver.switch_to.window(driver.window_handles[1])
                    else:
                        # Last expected page: scrape it, then return to the search-results tab
                        wpv.append(working_paper_names(driver))
                        # WAIT BEFORE CLOSING
                        time.sleep(np.random.uniform(0, 3))
                        # close current tab
                        driver.close()
                        # done and switch to paper page
                        driver.switch_to.window(driver.window_handles[0])
            # Merge the per-page sets into one set of distinct titles
            WP = set.union(*(wpv))
            ######################

        # Single page of working paper versions (i.e. at most 10 versions)
        else:
            print('Only one page')
            WP = working_paper_names(driver)
            # WAIT BEFORE CLOSING
            time.sleep(np.random.uniform(0, 3))
            # close current tab
            driver.close()
            # done and switch to paper page
            driver.switch_to.window(driver.window_handles[0])

    # IF THERE IS NO WORKING PAPER VERSION, JUST USE THE PUBLISHED PAPER'S TITLE
    else:
        WP = title
    return WP

#### b. Google Scholar citing papers 

In [None]:
# Part 2.1.
# Scrape the citation information of the citing papers listed on one Google Scholar results page
def citing_papers1(driver):
    """Return the titles and byline text of the results on the page open in ``driver``.

    Parameters
    ----------
    driver : Selenium WebDriver whose current window shows a Google Scholar results page.

    Returns
    -------
    pandas.DataFrame
        One row per result. Column 0 holds the lower-cased text of the result's first <h3>;
        column 1 holds the text of the result's first 'gs_a' block, or NaN when the result
        has none.

    Raises
    ------
    A Selenium timeout if the results container does not appear within 15 seconds.
    """
    # Make sure the results container is present before taking the page source.
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.ID,'gs_res_ccl_mid')))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    results = soup.body.find_all("div", class_="gs_r gs_or gs_scl")

    bylines = []
    for result in results:
        info_blocks = result.find_all('div', class_='gs_a')
        # np.nan is the canonical lower-case spelling and works on both NumPy 1.x and 2.x.
        bylines.append(info_blocks[0].text if len(info_blocks) != 0 else np.nan)

    citing_paper_title = [result.find_all('h3')[0].text.lower() for result in results]
    return pd.DataFrame([citing_paper_title, bylines]).T

In [None]:
# Part 2.
# For each JFE paper, loop through all the pages of its citing papers
# while randomly adding human-like actions (such as typing into the search box) to reduce detection by Google's CAPTCHA bot check.
# Detection cannot be avoided completely, so the program also pauses when a CAPTCHA appears and waits for our signal after we solve it manually.

def cp_for_one_paper(citing_num,driver,title,index,JF_paper_index,low1):
    """Collect the citing papers of one search result by paging through its "Cited by" list.

    Parameters
    ----------
    citing_num : citation count parsed from the result's "Cited by" link (converted with int()).
    driver : Selenium WebDriver currently showing the search-results page.
    title : value written to the 'cited_paper' column of the returned frame.
    index : position of the target result among the page's 'gs_ri' elements.
    JF_paper_index : row position in `low1` used to rebuild the search query after a crash.
    low1 : DataFrame of papers; columns 0 and 2 are read by position in the crash-recovery path.

    Returns
    -------
    (cp_pg1, c)
        cp_pg1 is the DataFrame of scraped rows (de-duplicated in the multi-page case) with an
        added 'cited_paper' column; c is True when its row count equals int(citing_num).

    Side effects: appends c to the module-level list `confirm`; opens and closes browser tabs;
    may block on input() through cap_check; prints progress information.

    NOTE(review): the crash-recovery path rebinds the local name `driver` to a brand-new browser
    after quitting the old one, but only (cp_pg1, c) is returned, so the caller keeps its
    reference to the driver that was quit — confirm how the caller is expected to continue.
    """
    # CLICK ON "CITED BY" LINK AND ACCESS PAGE
    # Ctrl+Enter on the link, then switch to the window handle at index 1.
    cp_ = driver.find_elements(By.CLASS_NAME,'gs_ri')[index].find_elements(By.PARTIAL_LINK_TEXT,'Cited by')
    cp_[0].send_keys(Keys.CONTROL + Keys.RETURN)
    driver.switch_to.window(driver.window_handles[1])
    # DISTINGUISH BETWEEN SINGLE AND MULTIPLE PAGES
    if int(citing_num)>10:
        # NUMBER OF PAGES
        # N is the index of the last page to visit (pages 0..N), with 10 results per page.
        N =int(citing_num)//10
        if N >= 100:
            # Never visit more than 100 pages (presumably the limit of what Google Scholar serves — TODO confirm)
            N = 99
        else:
            # An exact multiple of 10 fills its last page completely, so there is one page fewer
            if int(citing_num)%10 == 0:
                N = N-1
        print("# of pages for citing papers", N)
        ######################
        # LOOP THROUGH PAGES
        cp_pg = []      # one DataFrame per scraped page
        iterate = True  # turned off once a page without a "Next" link is reached
        for n in range(N+1):
            if iterate == True:
                time.sleep(np.random.uniform(0, 2))
                # TYPE SOME RANDOM TEXT INTO THE SEARCH BOX ON EVERY 4TH PAGE (n = 0, 4, 8, ...), BUT DON'T ACTUALLY SEARCH
                if n%4 == 0:
                    el = driver.find_element(By.ID,'gs_hdr_tsi')
                    el.clear()
                    el.send_keys("Random movement"+ str(np.random.randint(1,5)))
                try:
                    if n != N:
                        time.sleep(np.random.uniform(0, 2))
                        cap_check(driver)
                        cit_data = citing_papers1(driver)
                        cp_pg.append(cit_data)
                        # INCLUDE HUMAN BEHAVIOR WHEN MOVING ONTO THE NEXT PAGE
                        # This wait raises on timeout when no "Next" link shows up, which sends
                        # control to the recovery branch below before the empty-list check runs.
                        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT,'Next')))
                        # click next and open in new tab
                        next_pg = driver.find_elements(By.LINK_TEXT,'Next')
                        if len(next_pg) ==0:
                            iterate = False
                            driver.close()
                            # done and switch to paper page
                            driver.switch_to.window(driver.window_handles[0])
                        else:
                            #print(next_pg[0].text)
                            time.sleep(np.random.uniform(0, 2))
                            next_pg[-1].send_keys(Keys.CONTROL + Keys.RETURN)
                            cap_check(driver)
                            # close current tab
                            driver.close()
                            # switch to new tab (the handle at index 1 once the current tab is closed)
                            driver.switch_to.window(driver.window_handles[1])
                    else:
                        # Last expected page: scrape it, then return to the search-results tab
                        cit_data = citing_papers1(driver)
                        cp_pg.append(cit_data)
                        # WAIT BEFORE CLOSING
                        time.sleep(np.random.uniform(0, 3))
                        # close current tab
                        driver.close()
                        # done and switch to paper page
                        driver.switch_to.window(driver.window_handles[0])
                except Exception as e:
                    # RECOVERY: discard the browser, start a new one with a random user agent,
                    # search for the paper again, page forward to page n and scrape it.
                    # Any exception raised inside this branch propagates to the caller.
                    print (e)
                    driver.quit()
                    print("Crashed at ", n)
                    # HAVE TO RESTART AT PAGE n, and JF paper t.
                    ua = UserAgent()
                    userAgent = ua.random
                    userAgent
                    chrome_options = webdriver.ChromeOptions()
                    chrome_options.add_argument(f'user-agent={userAgent}')
                    driver = webdriver.Chrome(options=chrome_options)
                    gs_url = 'https://scholar.google.com'
                    driver.get(gs_url)
                    print('restart. GS page')
                    cap_check(driver)

                    el = driver.find_element(By.ID,'gs_hdr_tsi')
                    el.clear()
                    # NOTE(review): this query joins columns 0 and 2 of low1, while run() builds its
                    # searches from columns 0 and 1 — confirm that column 2 is intended here.
                    el.send_keys(low1.iloc[JF_paper_index,0]+' '+low1.iloc[JF_paper_index,2])
                    el.send_keys(Keys.RETURN)
                    print('restart. GS page + paper title')

                    cap_check(driver)
                    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME,'gs_ri')))
                    # Assumes the paper is again at position `index` in the new result list — TODO confirm
                    cp_ = driver.find_elements(By.CLASS_NAME,'gs_ri')[index].find_elements(By.PARTIAL_LINK_TEXT,'Cited by')
                    cp_[0].send_keys(Keys.CONTROL + Keys.RETURN)
                    driver.switch_to.window(driver.window_handles[1])

                    # GO TO PAGE n
                    for s in range(n):
                        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.ID,'gs_top')))
                        # TYPE SOME RANDOM TEXT INTO THE SEARCH BOX ON EVERY 4TH STEP, BUT DON'T ACTUALLY SEARCH
                        if s%4 == 0:
                            el = driver.find_element(By.ID,'gs_hdr_tsi')
                            el.clear()
                            el.send_keys("Random movement"+ str(np.random.randint(1,5)))
                        next_pg = driver.find_elements(By.LINK_TEXT,'Next')
                        #print(next_pg[0].text)
                        time.sleep(np.random.uniform(0, 2))
                        # No empty-list guard here: a missing "Next" link raises IndexError
                        next_pg[-1].send_keys(Keys.CONTROL + Keys.RETURN)
                        cap_check(driver)
                        # close current tab
                        driver.close()
                        # switch to new tab
                        driver.switch_to.window(driver.window_handles[1])
                        cap_check(driver)
                    # Now on page n again: same scrape-and-advance steps as in the try branch
                    if n != N:
                        cit_data = citing_papers1(driver)
                        cp_pg.append(cit_data)
                        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT,'Next')))
                        # click next and open in new tab
                        next_pg = driver.find_elements(By.LINK_TEXT,'Next')
                        if len(next_pg) ==0:
                            iterate = False
                            driver.close()
                            # done and switch to paper page
                            driver.switch_to.window(driver.window_handles[0])
                        else:
                            #print(next_pg[0].text)
                            time.sleep(np.random.uniform(0, 3))
                            next_pg[-1].send_keys(Keys.CONTROL + Keys.RETURN)
                            cap_check(driver)
                            # close current tab
                            driver.close()
                            # switch to new tab
                            driver.switch_to.window(driver.window_handles[1])
                    else:
                        cit_data = citing_papers1(driver)
                        cp_pg.append(cit_data)
                        # WAIT BEFORE CLOSING
                        time.sleep(np.random.uniform(0, 3))
                        # close current tab
                        driver.close()
                        # done and switch to paper page
                        driver.switch_to.window(driver.window_handles[0])
        # Stack the per-page frames and drop rows that were scraped more than once
        cp_pg1 = pd.concat(cp_pg)
        cp_pg1 = cp_pg1.drop_duplicates()
        ######################
    else:
        # At most 10 citing papers: everything is on the page that was just opened
        print('Only one page')
        n = 1
        cp_pg1 = citing_papers1(driver)
        # WAIT BEFORE CLOSING
        time.sleep(np.random.uniform(0, 3))
        # close current tab
        driver.close()
        # done and switch to paper page
        driver.switch_to.window(driver.window_handles[0])
    # Compare the number of scraped rows with the reported citation count
    print(len(cp_pg1), int(citing_num) == len(cp_pg1))
    confirm.append(int(citing_num) == len(cp_pg1))  # `confirm` is a module-level list
    c = (int(citing_num) == len(cp_pg1))
    cp_pg1['cited_paper'] = title
    return cp_pg1,c

In [None]:
# Pause the program when Google Scholar shows a CAPTCHA and wait for the user's go-ahead.

def cap_check(driver):
    """Block on a prompt if the page open in ``driver`` contains a CAPTCHA form.

    Waits up to 120 seconds for the element with id 'gs_top', then looks in the page body for
    a <form> whose id is 'gs_captcha_f' or 'captcha-form'. If one is found, input() is called
    so the user can solve the CAPTCHA in the browser and press Enter to continue. Returns None.
    """
    WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.ID,'gs_top')))
    page = BeautifulSoup(driver.page_source, "html.parser")

    captcha_forms = []
    # Only search when the document parsed to something and actually has a <body>.
    if len(page) != 0 and page.body is not None:
        scholar_forms = page.body.find_all('form', id='gs_captcha_f')
        generic_forms = page.body.find_all('form', id='captcha-form')
        captcha_forms = scholar_forms + generic_forms

    if captcha_forms:
        input("Clear CAPTCHA: ")

#### c. Main function

In [None]:
# Main function: for every JFE paper of interest, search Google Scholar, locate the matching
# result, and collect its working paper titles and citing papers with the functions above.

def run(START,userAgent,low):
    """Scrape working-paper titles and citing papers for the rows of ``low`` from START onward.

    Parameters
    ----------
    START : positional row offset into ``low`` at which to start (or resume) scraping.
    userAgent : user-agent string passed to Chrome.
    low : DataFrame of papers; column 0 and column 1 are joined by position to form each
        search query (assumed to be title and authors — TODO confirm against the caller).

    Returns
    -------
    int
        ``manual_next``, a position relative to START: the index of the paper being processed
        when the run stopped, plus one when that paper's citing papers were all confirmed.
        It is returned both when an error interrupts the loop and when the loop completes.

    Side effects
    ------------
    Appends to the module-level lists WPN, CB, cited_by_num_list, loop_time, missing and
    retain; opens a Chrome window (never closed here); prompts via input() for CAPTCHAs and
    for papers that cannot be matched automatically; prints progress information.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument(f'user-agent={userAgent}')
    driver = webdriver.Chrome(options=chrome_options)
    gs_url = 'https://scholar.google.com'
    driver.get(gs_url)
    cap_check(driver)

    low1 = low.iloc[START:,:]
    n_papers = len(low1.iloc[:,0])

    # Search for the first paper; each later search is issued at the end of a loop iteration.
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID,'gs_hdr_tsi')))
    el = driver.find_element(By.ID,'gs_hdr_tsi')
    el.clear()
    el.send_keys(low1.iloc[0,0]+' '+low1.iloc[0,1])
    el.send_keys(Keys.RETURN)

    cap_check(driver)

    manual_next = 0
    for t in range(n_papers):
        manual_next = t
        # Result of cp_for_one_paper for THIS paper only, so that an earlier paper's result
        # can never be mistaken for the current one in the "move on" check below.
        C = None
        try:
            start_time = time.time()
            # GET JF PAPER INFO
            time.sleep(np.random.randint(1,5))

            WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.ID,'gs_top')))
            data_JF_paper = driver.page_source
            soup_JF_paper  = BeautifulSoup(data_JF_paper, "html.parser")

            ###########################
            # CHECK FOR CAPTCHA [When searching for JF PAPER]
            if len(soup_JF_paper) !=0:
                captcha_check = soup_JF_paper.body.find_all('form',id ='gs_captcha_f')+soup_JF_paper.body.find_all('form',id ='captcha-form')
            else:
                print("soup 0")
                captcha_check = []
            try:
                if len(captcha_check)!=0:
                    input("Clear CAPTCHA: ")
                    # Re-read the page after the user has solved the CAPTCHA
                    data_JF_paper = driver.page_source
                    soup_JF_paper  = BeautifulSoup(data_JF_paper, "html.parser")
                    captcha_check = soup_JF_paper.body.find_all('form',id ='gs_captcha_f')+soup_JF_paper.body.find_all('form',id ='captcha-form')
                    print("cleared capthca!",len(captcha_check))
            except Exception as e:
                # Re-reading the page failed: restart the browser and let the user confirm
                # each step manually before continuing with the current paper.
                print('No captcha', e)
                driver.quit()
                driver = webdriver.Chrome(options=chrome_options)
                gs_url = 'https://scholar.google.com'
                driver.get(gs_url)
                input("CURRENT PAGE SHOULD BE GOOGLE SCHOLARS!! : ")
                WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.ID,'gs_hdr_tsi')))
                el = driver.find_element(By.ID,'gs_hdr_tsi')
                el.clear()
                el.send_keys(low1.iloc[t,0]+' '+low1.iloc[t,1])
                el.send_keys(Keys.RETURN)
                input("Search for Paper!! : ")
                data_JF_paper = driver.page_source
                soup_JF_paper  = BeautifulSoup(data_JF_paper, "html.parser")
            ###########################

            # LOCATE THE JF PAPER AMONG THE SEARCH RESULTS
            gs_soup_bdy = soup_JF_paper.body.find_all("div",class_="gs_ri")
            paper_title = [i.find_all("h3",class_ = "gs_rt")[0].text for i in gs_soup_bdy]
            other_info = [i.find_all("div",class_ = "gs_a")[0].text for i in gs_soup_bdy]
            # A: the result title contains the paper title; B: the byline has the word "journal"
            A = [low1.iloc[t,0].lower() in i.lower() for i in paper_title]
            B = ['journal' in i for i in [[j.lower() for j in i.split(' ')] for i in other_info]]
            dff = pd.DataFrame([A,B]).T
            try:
                if len(dff[dff.all(axis=1)].index)==0:
                    # No automatic match: let the user confirm or supply the result index
                    pause = input("Confirm missing JF Paper ")
                    if pause == 'n':
                        ind_ = input("Index of paper! Int input:")
                        ind = int(ind_)
                        skip_paper = False
                    else:
                        # 'y' or any other answer: treat the paper as missing
                        print("NO JF PAPER FOUND")
                        missing.append(START+t)
                        skip_paper = True
                else:
                    ind = dff[dff.all(axis=1)].index[0]
                    skip_paper = False
            except Exception:
                # Fallback (e.g. a non-integer index was typed): take the first result whose
                # byline mentions "journal". Exception instead of a bare except keeps Ctrl-C working.
                ind = np.where(B)[0][0]
                skip_paper = False

            # Execute if JF Paper Found
            if skip_paper == False:
                print(ind)
                gs_soup_bdy = gs_soup_bdy[ind]
                gs_soup_find = gs_soup_bdy.find_all("div",class_ = 'gs_fl gs_flb')
                gs_hyperlinks_JF_paper  = [a for a in gs_soup_find[0].find_all("a", href=True)]
                # CHECK IF VERSIONS AND CITED BY EXISTS
                X = [i.text.split(' ') for i in gs_hyperlinks_JF_paper]
                # 1. Version Names
                ver = ['versions' in i for i in X]
                if np.array(ver).sum() == 0:
                    versions_num = None
                else:
                    # The number is the second word of the link text
                    versions_num = gs_hyperlinks_JF_paper[np.where(ver)[0][0]].text.split(' ')[1]
                    print("Number of working verison papers",versions_num)
                    # SCRAPE NAMES OF WORKING PAPERS
                    WPN.append(wp_for_one_paper(versions_num,driver,low1.iloc[t,0],ind))
                # 2. Cited by
                cb = ['Cited' in i for i in X]
                if np.array(cb).sum() == 0:
                    cited_by_num = None
                    cited_by_num_list.append(0)
                    print('no citing papers')
                else:
                    # The number is the last word of the link text
                    cited_by_num = gs_hyperlinks_JF_paper [np.where(cb)[0][0]].text.split(' ')[-1]
                    print("Number of Citations", cited_by_num)
                    cited_by_num_list.append(cited_by_num)
                    # SCRAPE CITING PAPER INFO
                    C = cp_for_one_paper(cited_by_num,driver,low1.iloc[t,0],ind,t,low1)
                    CB.append(C[0])
            else: # JF PAPER NOT FOUND
                retain.append(START+t)

            # RECORD TIME
            end_time = time.time()
            duration = end_time - start_time
            print(duration)
            loop_time.append(duration)

            # Advance the resume position only when this paper's citing papers were all confirmed
            if C is not None and C[1] == True:
                print('move on!')
                manual_next = manual_next + 1

            # MOVE TO NEXT PAPER
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.ID,'gs_hdr_tsi')))
            el = driver.find_element(By.ID,'gs_hdr_tsi')
            time.sleep(np.random.uniform(1,2))
            el.clear()
            time.sleep(np.random.uniform(1,2))
            # Strictly less than: row t+1 exists only while t+1 < n_papers. Using <= made the
            # final iteration index past the end and exit through the error handler.
            if t+1 < n_papers:
                el.send_keys(low1.iloc[t+1,0]+' '+low1.iloc[t+1,1])
                time.sleep(np.random.uniform(0,2))
                el.send_keys(Keys.RETURN)
                print("NEXT JF PAPER")
        except Exception as e:
            print(e)
            print('current paper number', manual_next)
            return manual_next
    # END OF JF PAPERS
    return manual_next

### II. Running our program

In [None]:
# Index of the JFE paper from which the search and scraping process starts or resumes; set manually.
# On the first run, set this to 0. Afterwards, set it to the index following that of the last paper
# for which scraping was completed.

START = 1971

In [None]:
# Display the index at which scraping will start/resume.

START

Word of caution: 

The functions above save the citation data to the lists initialized in the cell below. Re-running that cell resets the lists, so keep the following in mind before running it:
* Save the contents of the lists to other variables or to files first. Otherwise, the downloaded data will be lost.
* When saving the data to files, use a different file name each time. Otherwise, the existing file will be overwritten and the previously saved data will be erased.

In [None]:
# Fresh, empty accumulators for the scraping results.
# Every list below is a distinct object; re-running this cell discards whatever they held.

WPN, CB = [], []                        # working-paper results / citing-paper results
loop_time, cited_by_num_list = [], []   # per-paper duration / per-paper citation count
captcha_num = 0
retain, retain_df = [], []              # `retain` receives indices of papers that were not found
confirm, missing = [], []

In [None]:
# Run!
# `run` is called repeatedly; each call reports how many papers it got through, and START is
# advanced by that amount. After every call we ask whether to stop, so that results can be
# saved before continuing.

ua = UserAgent()  # built once; a new random user-agent string is drawn from it on every pass
while START < len(jfe):
    userAgent = ua.random
    t = run(START, userAgent, jfe)
    if t is None:
        # NOTE(review): the only `return` visible in `run` sits in its exception handler, so a
        # call that finishes without raising would yield None and `START + t` would fail.
        # Stop cleanly instead; confirm against the full definition of `run`.
        print('run() returned no progress count; stopping at', START)
        break
    START = START + t
    print('restart at', START)
    ask = input(str(START) + "Stop program? y to stop:")
    # Accept "y", "Y" and answers with stray whitespace.
    if ask.strip().lower() == 'y':
        break

In [None]:
# Assemble the scraped results into dataframes, then write the citing-paper table to disk.

# Stack the per-paper citing tables and drop exact duplicate rows.
cb_hh = pd.concat(CB)
cb_hh = cb_hh.drop_duplicates()

# Working-paper results, one row per entry of WPN.
wpn_hh = pd.DataFrame(data=WPN)

# Two columns: loop durations and citation counts, one row per processed paper.
run_hh = pd.DataFrame(data=[loop_time, cited_by_num_list]).transpose()

# The file name is fixed here, so an existing file of the same name is overwritten.
cb_hh.to_excel(excel_writer="Citations_JFE_13.xlsx")

If we lose track of where we are, we can check the last saved dataset to find out where to restart the downloading process.

Example: 

In [None]:
# Load a previously saved batch of citing-paper results (batch 11 in this example).
df = pd.read_excel("Citations_JFE_11.xlsx")

In [None]:
# Value in the last row and last column of the loaded sheet.
df.iloc[-1,-1]

'Regulatory pressure and fire sales in the corporate bond market'

In [None]:
# Renumber the rows of the JFE list from 0 (the previous index is kept as a column),
# then show the row whose title matches the one found above.
jfe1 = jfe.reset_index()
jfe1.loc[jfe1["Title"].eq("Regulatory pressure and fire sales in the corporate bond market")]

Unnamed: 0,index,Title,Authors,Date
1705,6,Regulatory pressure and fire sales in the corp...,"Andrew Ellul, Chotibhak Jotikasthira, Christia...",September 2011


### III. Results 

#### Combine pieces of our data into one

In [None]:
# Saved batches of citing-paper results, in order: the first file has no numeric suffix,
# the remaining ones are numbered 1 through 13.
citation_files = ["Citations_JFE.xlsx"] + [
    "Citations_JFE_{}.xlsx".format(batch) for batch in range(1, 14)
]

# Read every batch once. The individual names df0..df13 are kept so that any other cell
# referring to them keeps working.
citation_batches = [pd.read_excel(path) for path in citation_files]
(df0, df1, df2, df3, df4, df5, df6, df7, df8, df9,
 df10, df11, df12, df13) = citation_batches

# Stack all batches and drop the first column
# (presumably the index column written by `to_excel` when the batches were saved).
citing_info_1 = pd.concat(citation_batches).iloc[:, 1:]

#### Data Cleaning 

In [None]:
# Function to clean a title by removing bracketed tags and surrounding whitespace

def clean_title(title):
    """Return `title` without bracketed tags and without leading/trailing whitespace.

    Parameters
    ----------
    title : str
        Raw title text, possibly containing tags in square brackets such as "[PDF]".

    Returns
    -------
    str
        The title with every "[...]" group removed and outer whitespace stripped.
        Non-string input (e.g. None, or NaN coming from an empty spreadsheet cell)
        is returned unchanged instead of raising a TypeError.
    """
    if not isinstance(title, str):
        return title
    # Remove patterns like [xxx]; the non-greedy match keeps each bracket pair separate.
    cleaned_title = re.sub(r'\[.*?\]', '', title)
    # Strip leading and trailing whitespaces
    return cleaned_title.strip()
# Apply the cleaning function to the first column
citing_info_1['Cleaned Titles'] = citing_info_1.iloc[:, 0].apply(clean_title)

# The second column holds the combined author / journal / year string of each citing paper.
source_info = citing_info_1.iloc[:, 1]

# Authors: everything before the first hyphen; rows without a hyphen keep the whole string.
# The filled Series is assigned back explicitly: calling fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas versions warn about and which leaves
# the frame unchanged under Copy-on-Write.
citing_info_1['Authors'] = source_info.str.extract(r'^(.*?)\-')[0].fillna(source_info)

# Journal: the text between the first "- " and the next comma.
citing_info_1['Journal of Publication'] = source_info.str.extract(r'\- (.*?),', expand=False)

# Year: the first standalone four-digit number starting with 19 or 20. Taking the first four
# digits of any number would turn e.g. an SSRN identifier such as 4583435 into the "year" 4583.
citing_info_1['Year of Publication'] = source_info.str.extract(r'\b((?:19|20)\d{2})\b', expand=False)

# Keep everything from the third column onwards (drops the raw title and raw info columns).
df = citing_info_1.iloc[:,2:]
# Largest number of authors on any row: one more than the highest comma count.
max_authors = int(df['Authors'].str.count(',').max()) + 1

# One column per author, split on commas.
df_authors_expanded = df['Authors'].str.split(',', expand=True, n=max_authors)

# Name the author columns "Author 1", "Author 2", ...
author_columns = ['Author {}'.format(i+1) for i in range(max_authors)]
df_authors_expanded.columns = author_columns

# Split each author's name into first and last name at the last space.
for author_col in author_columns:
    first_name_col = '{} First Name'.format(author_col)
    last_name_col = '{} Last Name'.format(author_col)
    # Strip surrounding whitespace first: a trailing space would otherwise make the split
    # put the whole name into the first-name column and leave the last name empty.
    author_names = df_authors_expanded[author_col].str.strip()
    # reindex guarantees two columns even when no name in this column contains a space.
    name_parts = author_names.str.rsplit(' ', n=1, expand=True).reindex(columns=[0, 1])
    df_authors_expanded[first_name_col] = name_parts[0]
    df_authors_expanded[last_name_col] = name_parts[1]

# Append only the first/last-name columns; the leading `max_authors` columns hold the
# unsplit names (previously skipped with a hard-coded 7).
df_ref = pd.concat([df, df_authors_expanded.iloc[:, max_authors:]], axis=1)

In [None]:
# Resulting dataset of citing papers on Google Scholar, with each author's
# first and last name in separate columns.

df_ref

Unnamed: 0,cited_paper,Authors,Journal of Publication,Year of Publication,Cleaned Titles,Author 1 First Name,Author 1 Last Name,Author 2 First Name,Author 2 Last Name,Author 3 First Name,Author 3 Last Name,Author 4 First Name,Author 4 Last Name,Author 5 First Name,Author 5 Last Name,Author 6 First Name,Author 6 Last Name,Author 7 First Name,Author 7 Last Name
0,Delayed crises and slow recoveries,"A Krishnamurthy, W Li",,2020,dissecting mechanisms of financial crises: int...,A,Krishnamurthy,W Li,,,,,,,,,,,
1,Delayed crises and slow recoveries,W Li,USC Marshall School of Business Research Paper,2019,public liquidity and financial crises,W,Li,,,,,,,,,,,,
2,Delayed crises and slow recoveries,"Z Li, S Xu",,,inefficient credit cycles,Z,Li,S,Xu,,,,,,,,,,
3,Learning about the consumption risk exposure o...,"K Li, CY Tsou, C Xu",Journal of Monetary Economics,2023,learning and the capital age premium,K,Li,CY,Tsou,C,Xu,,,,,,,,
4,Learning about the consumption risk exposure o...,K Schneider,Available at SSRN 4583435,4583,"investment, uncertainty, and u-shaped return v...",K,Schneider,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34324,Financing patterns around the world: Are small...,F Şahut,,2021,kobi̇'lerin finansman sorunları ve çözüm öneri...,F Şahut,,,,,,,,,,,,,
34325,Financing patterns around the world: Are small...,C Viorica,Autoreferatul tezei de doctor în științe econo...,2016,finanțarea întreprinderilor mici și mijlocii d...,C,Viorica,,,,,,,,,,,,
34326,Financing patterns around the world: Are small...,MA dos Santos,Revista Inteligência Competitiva,2019,relação entre estrutura de financiamento e açõ...,MA dos,Santos,,,,,,,,,,,,
34327,Financing patterns around the world: Are small...,粟芳， 初立苹,金融经济学研究,2014,中国商业银行综合融资能力测度及影响因素分析,粟芳，,初立苹,,,,,,,,,,,,


In [None]:
# Save dataset of papers citing JFE papers on Google Scholar
# (the row index is written as well, since `index=False` is not passed).

df_ref.to_excel("JFE_GS_DATA.xlsx")

Limitation: The citation network of citing papers built by web scraping Google Scholar has an important limitation.

According to the Google Scholar help page, Google Scholar does not keep a record of more than 1,000 citing papers for each paper. Therefore, the universe of citing papers is not fully captured for papers with more than 1,000 citations.

reference: https://scholar.google.ca/intl/en/scholar/help.html#export