In [1]:
# Install the third-party packages this notebook uses (-q runs pip in quiet mode).
# NOTE(review): `serpapi` and `google-search-results` appear to both ship a
# top-level `serpapi` package, and the import cell pulls `GoogleSearch` from
# `serpapi.google_search` — confirm whether the first install is needed.
!pip install serpapi
!pip install -q pandas
!pip install -q openpyxl
!pip install -q google-search-results

Collecting serpapi
  Downloading serpapi-0.1.5-py2.py3-none-any.whl (10 kB)
Installing collected packages: serpapi
Successfully installed serpapi-0.1.5
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for google-search-results (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import requests
import re
import time
from serpapi.google_search import GoogleSearch

In [3]:
# Copy the SerpApi key from Colab's user-data store into the process
# environment, which is where fetch_google_scholar_results() reads it from.
import os
from google.colab import userdata
# NOTE(review): relies on a Colab secret named SERPAPI_API_KEY — confirm it
# exists and that this notebook has been granted access to it.
os.environ["SERPAPI_API_KEY"] = userdata.get('SERPAPI_API_KEY')

In [4]:
def fetch_google_scholar_results(query, num_results, page_size=20, delay=2):
    """Fetch up to ``num_results`` Google Scholar results for ``query`` via SerpApi.

    Requests the ``google_scholar`` engine one page at a time and stops as soon
    as enough results are collected, a page comes back shorter than requested,
    or a response carries no ``organic_results``.

    Args:
        query: Search string sent as the ``q`` parameter.
        num_results: Maximum number of results to return. Zero or a negative
            value returns an empty list without calling the API.
        page_size: Results requested per call (``num`` parameter). Clamped to
            1..20, the range SerpApi documents for the Google Scholar engine.
        delay: Seconds to wait between two consecutive requests.

    Returns:
        A list of at most ``num_results`` raw result dicts, in the order the
        API returned them.

    Raises:
        KeyError: If the ``SERPAPI_API_KEY`` environment variable is not set.
    """
    if num_results <= 0:
        return []

    page_size = max(1, min(page_size, 20))

    papers = []
    params = {
        "engine": "google_scholar",
        "q": query,
        "api_key": os.environ["SERPAPI_API_KEY"],
        "start": 0,
        "num": page_size  # Number of results per page
    }

    while True:
        results = GoogleSearch(params).get_dict()
        page = results.get('organic_results')

        if page is None:
            # Surface the API's own error text when it provides one, so a bad
            # key or exhausted quota is distinguishable from "no more hits".
            error = results.get('error')
            if error:
                print(f"API call failed: {error}")
            else:
                print("No more results found or API call failed.")
            break

        papers.extend(page)

        # Stop when enough results are collected or the page came back short
        # (fewer results than requested means there is nothing further).
        if len(papers) >= num_results or len(page) < page_size:
            break

        params["start"] += page_size  # Move to the next page
        # Pause only when another request actually follows.
        time.sleep(delay)

    return papers[:num_results]

In [6]:
def process_results(results):
    """Turn raw search-result dicts into flat rows ready for tabular export.

    Each row carries the keys ``Title``, ``Authors``, ``Citations``, ``URL``,
    ``Abstract`` and ``DOI``. Missing text fields become ``'N/A'``, a missing
    citation total becomes ``0``, and author names are joined with ``', '``.

    Args:
        results: Iterable of result dicts as returned by the search API.

    Returns:
        A list with one row dict per input result, in input order.
    """
    def build_row(entry):
        links = entry.get('inline_links', {})
        author_entries = entry.get('publication_info', {}).get('authors', [])
        cited_total = links.get('cited_by', {}).get('total', 'N/A')

        return {
            'Title': entry.get('title', 'N/A'),
            'Authors': ', '.join(person.get('name', 'N/A') for person in author_entries),
            # 'N/A' is the "no citation data" sentinel; anything else is numeric.
            'Citations': 0 if cited_total == 'N/A' else int(cited_total),
            'URL': entry.get('link', 'N/A'),
            'Abstract': entry.get('snippet', 'N/A'),
            'DOI': links.get('doi', 'N/A'),
        }

    return [build_row(entry) for entry in results]

In [13]:
def generate_file_name(query):
    """Derive the output spreadsheet name from the search query.

    Punctuation is dropped, the remainder is cut to its first 20 characters
    and trimmed, and whitespace runs are replaced by underscores.

    Args:
        query: The raw search query string.

    Returns:
        A name of the form ``Google_Scholar_Search_<slug>.xlsx``.
    """
    # Keep only word characters and whitespace, cap the length at 20
    # characters, then trim whitespace left at either end by the cut.
    slug = re.sub(r'[^\w\s]', '', query)[:20].strip()
    # Every remaining whitespace run collapses into a single underscore.
    slug = re.sub(r'\s+', '_', slug)
    return f"Google_Scholar_Search_{slug}.xlsx"

In [10]:
def save_to_excel(papers, filename):
    """Write the papers to an Excel file, most-cited first.

    Args:
        papers: List of row dicts; each must contain a ``'Citations'`` value.
        filename: Path of the Excel file to write.

    Returns:
        None. Prints a confirmation, or a notice when there is nothing to save
        (in which case no file is written).
    """
    if papers:
        def citation_count(paper):
            return paper['Citations']

        # sorted() is stable, so papers with equal counts keep their input order.
        ranked = sorted(papers, key=citation_count, reverse=True)

        pd.DataFrame(ranked).to_excel(filename, index=False)

        print(f"Saved {len(ranked)} papers to {filename}")
    else:
        print("No papers to save.")

In [14]:
def main():
    """Run the interactive search-and-export workflow.

    Prompts for a search query and a result count, fetches that many Google
    Scholar results, flattens them into rows and writes them to an Excel file
    named after the query. Invalid input (empty query, non-numeric or
    non-positive count) is reported and the function returns without calling
    the API.
    """
    query = input("Enter your search query: ").strip()
    if not query:
        print("Search query must not be empty.")
        return

    raw_count = input("Enter the number of results you want to fetch: ")
    try:
        num_results = int(raw_count)
    except ValueError:
        # Report bad input instead of surfacing a raw traceback.
        print(f"Invalid number of results: {raw_count!r}")
        return
    if num_results <= 0:
        print("Number of results must be a positive integer.")
        return

    print("Fetching results...")

    results = fetch_google_scholar_results(query, num_results)
    print(f"Fetched {len(results)} results.")

    if not results:
        print("No results found or API call failed.")
        return

    print("Processing results...")

    papers = process_results(results)
    print("Saving results to Excel...")

    file_name = generate_file_name(query)
    save_to_excel(papers, file_name)
    print("Done!")

# Start the interactive workflow when this code runs as the main module.
if __name__ == "__main__":
    main()

Enter your search query: ("pull request" OR "pull requests") AND "GitHub repository" AND (impact OR "code quality" OR collaboration)
Enter the number of results you want to fetch: 50
Fetching results...
Fetched 50 results.
Processing results...
Saving results to Excel...
Saved 50 papers to Google_Scholar_Search_pull_request_OR_pull.xlsx
Done!
