In [1]:
import xlrd
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# The two halves of the Google Scholar query URL. The result offset (the value
# of the `start` query parameter) is inserted between them by get_url_content.
front_url = 'https://scholar.google.com/scholar?start='
# Remaining query parameters, appended after the offset.
back_url = '&hl=en&as_sdt=2005&sciodt=0,5&as_ylo=2016&cites=17089488661544618798&scipsc='

def get_url_content(page_num, timeout=30):
    """Download one page of Google Scholar results and return its HTML.

    Parameters
    ----------
    page_num : int
        Page index. It is multiplied by 10 to obtain the ``start`` offset
        that is placed between ``front_url`` and ``back_url``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    url = ''.join(map(str, [front_url, page_num*10, back_url]))
    print('Parsing from ', url)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the
    # parser, where it would silently produce zero results.
    response.raise_for_status()
    return response.text

In [3]:
# Identifier of the search round being processed. It is kept as a zero-padded
# string because it is concatenated into folder and file names.
folder_num = '07'
# Path prefix of the folders holding the saved result pages; the round
# identifier is appended to it to form the folder name.
root_name = "source_codes/google_scholar_search_"

def get_saved_content(page_num, folder=None, root=None):
    """Read the saved HTML of one result page from disk.

    The file is located at ``<root><folder>/page_<page_num>.txt`` and is read
    as UTF-8.

    Parameters
    ----------
    page_num : int
        Page index used in the file name.
    folder : str, optional
        Search-round identifier appended to ``root``. Defaults to the
        notebook-level ``folder_num``.
    root : str, optional
        Path prefix of the round folders. Defaults to the notebook-level
        ``root_name``.

    Returns
    -------
    str
        The full text of the saved page.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    """
    # Fall back to the notebook configuration when no override is given.
    if folder is None:
        folder = folder_num
    if root is None:
        root = root_name
    path = ''.join(map(str, [root, folder, "/page_", page_num, ".txt"]))
    # The with-statement closes the file on exit; no explicit close() needed.
    with open(path, encoding="utf8") as file:
        return file.read()

In [4]:
# Derive the number of pages to parse from the files saved for this round.
root_folder = "{}{}".format(root_name, folder_num)
# Count regular files only (sub-directories are ignored), then subtract one.
# NOTE(review): the -1 presumably discounts one non-page file kept in the
# folder - confirm against the folder contents.
num_of_items = sum(
    1
    for name in os.listdir(root_folder)
    if os.path.isfile(os.path.join(root_folder, name))
) - 1
# To override the automatic count, assign num_of_items manually here.

In [5]:
def parse_page_info(get_content_method, page_num):
    """Parse one Google Scholar result page into a DataFrame.

    Parameters
    ----------
    get_content_method : callable
        Function that takes ``page_num`` and returns the page HTML as a
        string (e.g. ``get_saved_content`` or ``get_url_content``).
    page_num : int
        Page index, forwarded unchanged to ``get_content_method``.

    Returns
    -------
    pandas.DataFrame
        One row per result with the columns ``title``, ``author-source``,
        ``url_link`` and ``cited_by``. ``url_link`` is ``None`` for a result
        whose heading carries no hyperlink.

    Raises
    ------
    ValueError
        If the page yields different numbers of titles, sources and citation
        footers, so the columns cannot be aligned row by row.
    """
    content = get_content_method(page_num)
    page = BeautifulSoup(content, 'lxml')

    title_list = []
    url_list = []
    for entry in page.find_all("h3", attrs={"class": "gs_rt"}):
        link = entry.a
        if link is None:
            # Heading without an <a> tag: keep the row (with no URL) so the
            # title/url lists stay aligned with the other columns instead of
            # crashing with AttributeError.
            title_list.append(entry.get_text())
            url_list.append(None)
        else:
            title_list.append(link.text)
            url_list.append(link.get('href'))

    source_list = [entry.text for entry in page.find_all(attrs={"class": "gs_a"})]

    # Elements of class gs_fl whose text contains '[' are skipped.
    # NOTE(review): presumably these are the "[PDF]"/"[HTML]" side links rather
    # than the citation footer - confirm against a saved page.
    citation_list = [
        entry.text
        for entry in page.find_all(attrs={"class": "gs_fl"})
        if '[' not in entry.text
    ]

    columns = {
        'title': title_list,
        'author-source': source_list,
        'url_link': url_list,
        'cited_by': citation_list,
    }

    # Report exactly which column is out of step; pandas' own error message
    # does not say which list had the wrong length.
    lengths = {name: len(values) for name, values in columns.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            'Page {}: extracted columns have different lengths: {}'.format(page_num, lengths)
        )

    return pd.DataFrame(columns)

In [6]:
# Parse every saved page of this search round and stack the results.
page_frames = []

for i in range(0, num_of_items):
    print('Parsing page '+str(i)+'...')
    page_df = parse_page_info(get_saved_content, page_num=i)
    page_df['page_num'] = i
    page_frames.append(page_df)

if page_frames:
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
    # and growing a frame inside the loop copies all previous rows each time.
    summary_df = pd.concat(page_frames)
    # The round identifier is the same for every row, so set it once.
    summary_df['search_round'] = str(folder_num)
else:
    # No pages to parse: keep an empty frame, as the original loop did.
    summary_df = pd.DataFrame()

summary_df

Parsing page 0...
Parsing page 1...
Parsing page 2...
Parsing page 3...
Parsing page 4...


Unnamed: 0,title,author-source,url_link,cited_by,page_num,search_round
0,2018 n2c2 shared task on adverse drug events a...,"S Henry, K Buchan, M Filannino… - Journal of t...",https://academic.oup.com/jamia/article-abstrac...,Cited by 20 Related articles All 4 versions,0,07
1,Adverse drug events and medication relation ex...,"F Christopoulou, TT Tran, SK Sahu… - Journal o...",https://academic.oup.com/jamia/article-abstrac...,Cited by 10 Related articles All 9 versions,0,07
2,Identifying relations of medications with adve...,"X Yang, J Bian, R Fang, RI Bjarnadottir… - Jou...",https://academic.oup.com/jamia/article-abstrac...,Cited by 4 Related articles All 4 versions,0,07
3,Advancing the state of the art in automatic ex...,"Ö Uzuner, A Stubbs, L Lenert - 2020 - academic...",https://academic.oup.com/jamia/article-abstrac...,Cited by 1 Related articles All 3 versions,0,07
4,Extracting adverse drug event information with...,"T Miller, A Geva, D Dligach - Proceedings of t...",https://www.aclweb.org/anthology/W19-1903.pdf,Cited by 3 Related articles All 3 versions ...,0,07
...,...,...,...,...,...,...
17,Traitement Automatique de la Langue Biomédicale,A Névéol - 2018 - hal.archives-ouvertes.fr,https://hal.archives-ouvertes.fr/tel-02167096/,Related articles All 2 versions View as HTML,3,07
18,Prognostic immunohistochemical biomarkers of c...,"A Belkouz, TA Labeur, J Dierks, F Dijk… - Crit...",https://www.sciencedirect.com/science/article/...,Cited by 2 Related articles All 8 versions,3,07
19,Assessing electronic health record phenotypes ...,"SE Spratt, K Pereira, BB Granger… - Journal of...",https://academic.oup.com/jamia/article-abstrac...,Cited by 31 Related articles All 6 versions,3,07
0,The use of computerized clinical decision supp...,"P Bennett, NR Hardiker - Journal of the Americ...",https://academic.oup.com/jamia/article-abstrac...,Cited by 33 Related articles All 7 versions,4,07


In [7]:
# Save this round's results to its own CSV file (row index not written).
round_csv_path = 'output/google_scholar_search_' + str(folder_num) + '.csv'
summary_df.to_csv(round_csv_path, index=False)

In [8]:
# Merge this round into the all-time results, keeping only rows not seen before.
alltime_path = 'output/google_scholar_search_all.csv'

if os.path.exists(alltime_path):
    # Read search_round as text: otherwise a zero-padded id such as '07' is
    # parsed as the integer 7 and no longer matches the values written here.
    alltime_df = pd.read_csv(alltime_path, dtype={'search_round': str})
    alltime_df = pd.concat([alltime_df, summary_df])
else:
    # First run: there is no history yet, so start from this round alone.
    alltime_df = summary_df

# keep='first' retains the earliest occurrence, i.e. rows from previous rounds
# win over identical title/author-source pairs found in the current round.
alltime_df = alltime_df.drop_duplicates(subset=['title', 'author-source'], keep='first')
alltime_df.to_csv(alltime_path, index=False)

#### Archive Workings

In [None]:
# Scratch cell: inspect the gs_fl elements of a single saved page.
page_num = 5
content = get_saved_content(page_num)
page = BeautifulSoup(content, 'lxml')

# Only citation_list is filled below; the other three lists stay empty here.
title_list, url_list, source_list = [], [], []

footer_texts = [entry.text for entry in page.find_all(attrs={"class": "gs_fl"})]
for footer_text in footer_texts:
    print(footer_text)
# Same filter as parse_page_info: drop entries whose text contains '['.
citation_list = [footer_text for footer_text in footer_texts if '[' not in footer_text]

for collected in (title_list, url_list, source_list, citation_list):
    print(len(collected))

In [None]:
# Scratch: display the citation footer texts collected in the preceding cell.
citation_list

In [None]:
# Scratch: page_df is the variable left over from the summary loop, i.e. the
# frame of the last page that was parsed there.
page_df

In [None]:
# Scratch: display the raw text of the saved page loaded in the archive cell.
content