In [322]:
import json
import os
import re
from datetime import datetime
from urllib.parse import parse_qsl, quote_plus, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from serpapi import GoogleSearch, GoogleScholarSearch

In [325]:
class ScholarListener:
    """Gather Google Scholar data through SerpApi and through direct HTML scraping.

    The ``serpapi_*`` methods query SerpApi's Google Scholar engines, while
    ``scholar_search`` requests scholar.google.com result pages itself and
    extracts the result links.

    Parameters
    ----------
    query_list : list of str, optional
        Queries used by ``scholar_search``. ``None`` is treated as an empty list.
    save_to_json : bool
        When truthy, ``scholar_search`` dumps its results to a JSON file.
    save_to_csv : bool
        When truthy, ``scholar_search`` writes one CSV file per query.
    return_output_object : bool
        When truthy, ``scholar_search`` returns its result dict (otherwise ``None``).
    mauthors : str, optional
        Author search string used by ``serpapi_author_profile``.
    affiliations : str, optional
        Affiliation used by ``serpapi_author_profile`` to filter profiles.
    author_id : str, optional
        Google Scholar author id used by ``serpapi_scrape_author_result``.
    query : str, optional
        Search string used by ``serpapi_scrape_organic_results``.
    api_key : str, optional
        SerpApi key. When omitted, the ``SERPAPI_API_KEY`` environment variable
        is used, falling back to an empty string.
    """

    # Placeholder reported for any field missing from a SerpApi response.
    MISSING = "NA"

    def __init__(self,query_list=None,save_to_json=True,save_to_csv=False,return_output_object=False,mauthors=None,affiliations=None,author_id=None,query=None,api_key=None):
        self.query_list = query_list
        self.return_output_object = return_output_object
        self.save_to_json = save_to_json
        self.save_to_csv = save_to_csv
        self.mauthors = mauthors
        self.affiliations = affiliations
        self.author_id = author_id
        self.query = query
        # Keep the credential out of the notebook source: take it from the
        # caller or from the environment instead of a literal in each method.
        if api_key is None:
            api_key = os.environ.get("SERPAPI_API_KEY", "")
        self.api_key = api_key

    @staticmethod
    def _lookup(data, *path, default="NA"):
        """Follow ``path`` (keys or indices) through nested containers.

        Returns ``default`` as soon as any step is missing or not subscriptable,
        which replaces the repeated bare ``try/except`` blocks.
        """
        current = data
        for step in path:
            try:
                current = current[step]
            except (KeyError, IndexError, TypeError):
                return default
        return current

    def _serpapi_results(self, engine, **extra_params):
        """Run one SerpApi request for ``engine`` and return the response dict."""
        params = {"api_key": getattr(self, "api_key", ""), "engine": engine}
        params.update(extra_params)
        results = GoogleScholarSearch(params).get_dict() or {}
        if "error" in results:
            # Surface the API's own message; callers then see empty/"NA" data
            # with an explanation rather than an unrelated KeyError.
            print(f"SerpApi error: {results['error']}")
        return results

    def serpapi_scrape_organic_results(self):
        """Search Google Scholar for ``self.query`` through SerpApi.

        Returns
        -------
        dict
            Maps each result title to a dict with the keys ``title``,
            ``publication_info``, ``snippet``, ``cited_by`` and
            ``related_versions``. Missing fields are reported as ``"NA"``;
            results without a title are skipped. Empty when the response holds
            no organic results.
        """
        results = self._serpapi_results("google_scholar", q=self.query)
        outward = {}
        for result in results.get('organic_results') or []:
            title = self._lookup(result, 'title', default=None)
            if title is None:
                continue
            outward[title] = {
                'title': title,
                'publication_info': self._lookup(result, 'publication_info', 'summary'),
                'snippet': self._lookup(result, 'snippet'),
                'cited_by': self._lookup(result, 'inline_links', 'cited_by', 'link'),
                'related_versions': self._lookup(result, 'inline_links', 'related_pages_link'),
            }
        return outward

    def serpapi_scrape_author_result(self):
        """Fetch the SerpApi author page for ``self.author_id`` and print a report.

        Prints the raw response followed by author details, articles, citation
        metrics, public-access information and co-authors. Any field absent
        from the response is printed as ``"NA"``. Returns ``None``.
        """
        results = self._serpapi_results(
            "google_scholar_author", author_id=self.author_id, hl="en"
        )
        print(results)

        # Author info (the report shows the first two interests only).
        author = self._lookup(results, 'author', default={})
        name = self._lookup(author, 'name')
        affiliations = self._lookup(author, 'affiliations')
        email = self._lookup(author, 'email')
        interests1 = self._lookup(author, 'interests', 0, 'title')
        interests2 = self._lookup(author, 'interests', 1, 'title')

        print('Author Info:')
        print(f'{name}\n{affiliations}\n{email}\n{interests1}\n{interests2}\n')

        # Articles results
        for article in results.get('articles') or []:
            article_title = self._lookup(article, 'title')
            article_link = self._lookup(article, 'link')
            article_authors = self._lookup(article, 'authors')
            article_publication = self._lookup(article, 'publication')
            cited_by = self._lookup(article, 'cited_by', 'value')
            cited_by_link = self._lookup(article, 'cited_by', 'link')
            article_year = self._lookup(article, 'year')
            print('Articles Info:')
            print(f"Title: {article_title}\nLink: {article_link}\nAuthors: {article_authors}\nPublication: {article_publication}\nCited by: {cited_by}\nCited by link: {cited_by_link}\nPublication year: {article_year}\n")

        # Cited-by table: rows are read positionally, as in the API response.
        citations_all = self._lookup(results, 'cited_by', 'table', 0, 'citations', 'all')
        h_index_all = self._lookup(results, 'cited_by', 'table', 1, 'h_index', 'all')
        i10_index_all = self._lookup(results, 'cited_by', 'table', 2, 'i10_index', 'all')

        print('Citations Info:')
        print(f'{citations_all}\n{h_index_all}\n{i10_index_all}\n')

        public_access_link = self._lookup(results, 'public_access', 'link')
        public_access_available_articles = self._lookup(results, 'public_access', 'available')

        print('Public Access Info:')
        print(f'{public_access_link}\n{public_access_available_articles}\n')

        # Co-authors results
        for co_author in results.get('co_authors') or []:
            author_name = self._lookup(co_author, 'name')
            author_affiliations = self._lookup(co_author, 'affiliations')
            author_link = self._lookup(co_author, 'link')

            print('Co-Authour(s):')
            print(f'{author_name}\n{author_affiliations}\n{author_link}\n')

    def serpapi_author_profile(self):
        """Search author profiles for ``self.mauthors`` through SerpApi.

        Returns
        -------
        dict
            Maps profile name to the raw profile dict. When ``self.affiliations``
            is a string, only profiles whose affiliation equals it (ignoring
            case and surrounding whitespace) are kept; profiles without a
            usable affiliation are kept as well, since they cannot be ruled out.
            When ``self.affiliations`` is not a string, every profile is kept.
        """
        results = self._serpapi_results(
            "google_scholar_profiles",
            hl="en",
            mauthors=self.mauthors,
            affiliations=self.affiliations,
        )

        wanted = None
        if isinstance(self.affiliations, str):
            # Normalise both sides of the comparison, not just the profile's.
            wanted = self.affiliations.strip().lower()

        outputs = {}
        for profile in results.get("profiles") or []:
            profile_name = self._lookup(profile, "name", default=None)
            if profile_name is None:
                continue
            found = self._lookup(profile, "affiliations", default=None)
            comparable = wanted is not None and isinstance(found, str)
            if comparable and found.strip().lower() != wanted:
                continue
            outputs[profile_name] = profile
        return outputs

    def scholar_search(self):
        '''
        Scrape the first three Google Scholar result pages for every query.

        query_list = ["Sehanobish Corzo Kara", "Learning potentials of
        quantum systems using deep neural networks", "2006.13297" ]

        Each query is URL-encoded before it is placed in the request URL.
        The result-title links of all three pages are collected per query.

        Side effects: prints every link found; writes a JSON file named
        "<queries joined by _>--<Mon-DD-YYYY>" when ``save_to_json`` is truthy;
        writes "<query>.csv" per query when ``save_to_csv`` is truthy. Queries
        are used verbatim in those file names.

        Returns a dict mapping query -> list of links when
        ``return_output_object`` is truthy, otherwise None.

        Raises Exception when a page request does not return HTTP 200.
        '''
        # %b is the portable spelling of the abbreviated month (%h is not
        # available on every platform).
        scraped_time = datetime.now().strftime("%b-%d-%Y")
        headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/601.3.9 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" ,'referer':'https://www.google.com/'}
        queries = list(self.query_list or [])
        output = {}

        for query in queries:
            encoded = quote_plus(str(query))
            # First three search result pages.
            urls = [
                f'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q={encoded}&btnG=',
                f'https://scholar.google.com/scholar?start=10&q={encoded}&hl=en&as_sdt=0,31',
                f'https://scholar.google.com/scholar?start=20&q={encoded}&hl=en&as_sdt=0,31',
            ]
            output_list = []
            for url in urls:
                # A timeout keeps a stalled connection from hanging the kernel.
                response = requests.get(url, headers=headers, timeout=30)
                statuscode = response.status_code
                if statuscode != 200:
                    raise Exception(f"requests response for url query: {url} returned {statuscode} response -- check google hasn't blocked your IP")
                soup = BeautifulSoup(response.content, 'lxml')
                for heading in soup.find_all("h3", class_="gs_rt"):
                    # Headings without a link (e.g. citation-only entries) are skipped.
                    anchor = heading.find("a", href=True)
                    if anchor is None:
                        continue
                    the_url = anchor["href"]
                    output_list.append(the_url)
                    print(the_url)
            output[query] = output_list

        # Object output and saving results
        if self.save_to_json:
            name_concat_queries = "_".join(str(query) for query in queries)
            with open(f"{name_concat_queries}--{scraped_time}", "w") as file:
                json.dump(output, file)

        if self.save_to_csv:
            for query, found_links in output.items():
                pd.DataFrame(found_links).to_csv(f"{query}.csv")

        if self.return_output_object:
            return output
        return None



In [326]:
# Author-level lookup through SerpApi. The author engine only reads `author_id`;
# `mauthors` (previously misspelled `mauthor`, which the class never reads) is
# set here for the profile search run further down.
p = ScholarListener(author_id="IzsyeRAAAAAJ", mauthors="kara, onur")
p.serpapi_scrape_author_result()


https://serpapi.com/search
{'search_metadata': {'id': '6262787bc573d51ab2aace82', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/9ccb22f125107f67/6262787bc573d51ab2aace82.json', 'created_at': '2022-04-22 09:42:19 UTC', 'processed_at': '2022-04-22 09:42:19 UTC', 'google_scholar_author_url': 'https://scholar.google.com/citations?user=IzsyeRAAAAAJ&hl=en', 'raw_html_file': 'https://serpapi.com/searches/9ccb22f125107f67/6262787bc573d51ab2aace82.html', 'total_time_taken': 1.8}, 'search_parameters': {'engine': 'google_scholar_author', 'author_id': 'IzsyeRAAAAAJ', 'hl': 'en'}, 'author': {'name': 'Onur Kara', 'affiliations': 'Hindsight Technology Solutions', 'email': 'Verified email at hindsightsolutions.net', 'website': 'http://mlokphys.com.s3-website-us-east-1.amazonaws.com/', 'interests': [{'title': 'network science', 'link': 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:network_science', 'serpapi_link': 'https://serpapi.com/search.jso

In [319]:
# Run an organic Google Scholar search through SerpApi for this query; the cell
# displays the returned dict, which is keyed by result title.
p.query="Sehanobish Corzo Kara"
p.serpapi_scrape_organic_results()

https://serpapi.com/search


{'Learning potentials of quantum systems using deep neural networks': {'title': 'Learning potentials of quantum systems using deep neural networks',
  'publication_info': 'A Sehanobish, HH Corzo, O Kara… - arXiv preprint arXiv …, 2020 - arxiv.org',
  'snippet': 'Attempts to apply Neural Networks (NN) to a wide range of research problems have been ubiquitous and plentiful in recent literature. Particularly, the use of deep NNs for understanding …',
  'cited_by': 'https://scholar.google.com/scholar?cites=11804469877329258063&as_sdt=5,36&sciodt=0,36&hl=en',
  'related_versions': 'https://scholar.google.com/scholar?q=related:T2YCiM_m0aMJ:scholar.google.com/&scioq=Sehanobish+Corzo+Kara&hl=en&as_sdt=0,36'},
 'Fine-tuning Vision Transformers for the Prediction of State Variables in Ising Models': {'title': 'Fine-tuning Vision Transformers for the Prediction of State Variables in Ising Models',
  'publication_info': 'O Kara, A Sehanobish, HH Corzo - arXiv preprint arXiv:2109.13925, 2021 - arxi

In [292]:
# Search author profiles by name and keep those whose affiliation matches.
p.mauthors="kara, onur"
p.affiliations="Hindsight Technology Solutions"
p.serpapi_author_profile()

https://serpapi.com/search


{'Onur Kara': {'name': 'Onur Kara',
  'link': 'https://scholar.google.com/citations?hl=en&user=IzsyeRAAAAAJ',
  'serpapi_link': 'https://serpapi.com/search.json?author_id=IzsyeRAAAAAJ&engine=google_scholar_author&hl=en',
  'author_id': 'IzsyeRAAAAAJ',
  'affiliations': 'Hindsight Technology Solutions',
  'email': 'Verified email at hindsightsolutions.net',
  'cited_by': 8,
  'interests': [{'title': 'network science',
    'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Anetwork_science',
    'link': 'https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:network_science'},
   {'title': 'machine learning',
    'serpapi_link': 'https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Amachine_learning',
    'link': 'https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:machine_learning'},
   {'title': 'statistical physics',
    'serpapi_link': 'https://ser

In [320]:
# Repeat the organic search, this time keeping the result in a variable.
p.query="Sehanobish Corzo Kara"
all_info = p.serpapi_scrape_organic_results()

https://serpapi.com/search


In [321]:
# Display the stored organic search results.
all_info


{'Learning potentials of quantum systems using deep neural networks': {'title': 'Learning potentials of quantum systems using deep neural networks',
  'publication_info': 'A Sehanobish, HH Corzo, O Kara… - arXiv preprint arXiv …, 2020 - arxiv.org',
  'snippet': 'Attempts to apply Neural Networks (NN) to a wide range of research problems have been ubiquitous and plentiful in recent literature. Particularly, the use of deep NNs for understanding …',
  'cited_by': 'https://scholar.google.com/scholar?cites=11804469877329258063&as_sdt=5,36&sciodt=0,36&hl=en',
  'related_versions': 'https://scholar.google.com/scholar?q=related:T2YCiM_m0aMJ:scholar.google.com/&scioq=Sehanobish+Corzo+Kara&hl=en&as_sdt=0,36'},
 'Fine-tuning Vision Transformers for the Prediction of State Variables in Ising Models': {'title': 'Fine-tuning Vision Transformers for the Prediction of State Variables in Ising Models',
  'publication_info': 'O Kara, A Sehanobish, HH Corzo - arXiv preprint arXiv:2109.13925, 2021 - arxi

In [293]:
# Queries for the HTML-scraping path (scholar_search iterates over this list).
p.query_list = ["Sehanobish Corzo Kara", "Learning potentials of quantum systems using deep neural networks", "2006.13297"]

In [294]:
# Confirm the query list was stored on the instance.
p.query_list

['Sehanobish Corzo Kara',
 'Learning potentials of quantum systems using deep neural networks',
 '2006.13297']

In [295]:
# Make scholar_search also write one CSV file per query.
p.save_to_csv = True

In [296]:
# Scrape the first three result pages for every query. This raises when Google
# answers with a non-200 status; the recorded run below hit HTTP 429.
p.scholar_search()

Exception: requests response for url query: https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=Sehanobish Corzo Kara&btnG= returned 429 response -- check google hasn't blocked your IP

In [297]:
# Make scholar_search return the collected links instead of None.
p.return_output_object= True

In [298]:
# Retry the scrape, capturing the returned dict; the recorded run failed again with HTTP 429.
results = p.scholar_search()

Exception: requests response for url query: https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=Sehanobish Corzo Kara&btnG= returned 429 response -- check google hasn't blocked your IP

https://serpapi.com/search


10