In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

In [None]:
# Browser-like User-Agent sent with every request made in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}

# First results page of the query; used here as a connectivity / rate-limit probe.
url = "https://scholar.google.com/scholar?start=0&q=tirzepatide&hl=en&as_sdt=0,5"

max_retries = 5
retries = 0

# Stays None unless a 200 response is received, so a failed probe is detectable
# instead of silently leaving a parsed error page behind.
doc = None

while retries < max_retries:
    # A timeout keeps a stalled connection from hanging the kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 200:
        doc = BeautifulSoup(response.text, 'html.parser')
        break
    elif response.status_code == 429:
        retries += 1
        if retries >= max_retries:
            # Do not sleep or announce a retry that will never happen.
            print(f"Received 429 error {max_retries} times. Giving up.")
            break
        print(f"Received 429 error. Retrying in {2 ** retries} seconds...")
        time.sleep(2 ** retries)  # Exponential backoff
    else:
        print(f"Unexpected error: {response.status_code}")
        break

# The original cell issued a second, unconditional request here and overwrote
# `doc`, discarding the outcome of the retry loop; that duplicate request has
# been removed.
if doc is None:
    print("No successful response received; `doc` is None.")

In [None]:
def get_paperinfo(paper_url):
    """Fetch ``paper_url`` and return the page parsed with BeautifulSoup.

    The request is sent with the notebook-level ``headers`` dict.

    Args:
        paper_url: URL of the page to download.

    Returns:
        A ``BeautifulSoup`` document built from the response text using the
        ``html.parser`` backend.

    Raises:
        Exception: if the response status code is anything other than 200
            (the status code is printed first).
    """
    page = requests.get(paper_url, headers=headers)
    status = page.status_code

    # Happy path first: hand back the parsed document straight away.
    if status == 200:
        return BeautifulSoup(page.text, 'html.parser')

    print("Status Code Error: ", status)
    raise Exception("Failed to get the webpage, dude!")

def get_tags(doc):
    """Collect the four groups of elements the scraper reads from a page.

    Args:
        doc: a parsed page exposing ``select`` and ``find_all``
            (a BeautifulSoup document in this notebook).

    Returns:
        A 4-tuple ``(paper_tag, cite_tag, link_tag, author_tag)``:
        elements carrying a ``data-lid`` attribute, the ``<a>`` immediately
        following an element titled "Cite", ``<h3 class="gs_rt">`` headings,
        and ``<div class="gs_a">`` blocks, in that order.
    """
    return (
        doc.select("[data-lid]"),
        doc.select("[title=Cite] + a"),
        doc.find_all("h3", {"class": "gs_rt"}),
        doc.find_all("div", {"class": "gs_a"}),
    )

def get_papertitle(paper_tag):
    """Return the text of the first ``<h3>`` inside each element of ``paper_tag``.

    Args:
        paper_tag: iterable of elements exposing ``select``.

    Returns:
        A list of title strings, one per input element, in input order.
    """
    return [entry.select('h3')[0].get_text() for entry in paper_tag]

def get_link(link_tag):
    """Return the ``href`` of the first ``<a>`` inside each heading.

    A heading without any anchor (presumably citation-only results — verify
    against live pages) yields ``None`` instead of raising ``IndexError``, so
    the returned list always has one entry per input heading and stays
    aligned with the other per-result lists.

    Args:
        link_tag: iterable of heading elements exposing ``select``.

    Returns:
        A list with one entry per heading: the link URL, or ``None`` when the
        heading has no anchor or the anchor has no ``href``.
    """
    links = []

    for heading in link_tag:
        anchors = heading.select('a')
        # Keep a placeholder for link-less headings rather than aborting the scrape.
        links.append(anchors[0].get('href') if anchors else None)

    return links

def get_author_year_publi_info(authors_tag):
    """Extract year, publication token and first author from each byline element.

    Every input element produces exactly one entry in each returned list, so
    the results stay index-aligned with the title and link lists built from
    the same page. Values that cannot be extracted are reported as ``None``
    rather than dropping the record.

    Args:
        authors_tag: iterable of elements exposing a ``text`` attribute.

    Returns:
        A 3-tuple ``(years, publication, authors)`` of equal-length lists:
        * ``years`` - the first standalone four-digit number starting with
          19 or 20, as ``int``, or ``None`` when there is none;
        * ``publication`` - the last whitespace-separated token of the text,
          or ``None`` for empty text;
        * ``authors`` - the first two tokens joined by a space with commas
          removed from the second, the single token when only one exists, or
          ``None`` for empty text.
    """
    years = []
    publication = []
    authors = []

    for tag in authors_tag:
        text = tag.text
        tokens = text.split()

        # Require a plausible four-digit year so volume/issue numbers that
        # appear earlier in the text are not mistaken for the year.
        year_match = re.search(r'\b(?:19|20)\d{2}\b', text)
        years.append(int(year_match.group()) if year_match else None)

        # Assumes the last token is the publication/source name, as the
        # original code did - TODO confirm against real bylines.
        publication.append(tokens[-1] if tokens else None)

        # Assumes the first two tokens are the first author's initials and surname.
        if len(tokens) >= 2:
            author = tokens[0] + ' ' + tokens[1].replace(',', '')
        elif tokens:
            author = tokens[0]
        else:
            author = None
        authors.append(author)

    return years, publication, authors

In [None]:
# Column-oriented accumulator for the scraped results: one independent,
# initially empty list per output column. It is extended in place by
# add_in_paper_repo, so re-run this cell to start from an empty repository.
paper_repos_dict = {
    column: []
    for column in ('Paper Title', 'Year', 'Author', 'Publication', 'Url of paper')
}

# Append one batch of scraped fields to the shared repository.
def add_in_paper_repo(papername, year, author, publi, link):
    """Extend ``paper_repos_dict`` with one batch and return it as a DataFrame.

    The length of every argument is printed first. The global
    ``paper_repos_dict`` is mutated in place, so repeated calls accumulate
    rows.

    Args:
        papername: list of paper titles.
        year: list of years.
        author: list of author names.
        publi: list of publication names.
        link: list of paper URLs.

    Returns:
        A new ``pandas.DataFrame`` built from everything accumulated in
        ``paper_repos_dict`` so far.

    Raises:
        ValueError: if any list differs in length from ``papername``; nothing
            is appended in that case.
    """
    print(f"papername length: {len(papername)}")
    print(f"year length: {len(year)}")
    print(f"author length: {len(author)}")
    print(f"publi length: {len(publi)}")
    print(f"link length: {len(link)}")

    expected = len(papername)
    if any(len(lst) != expected for lst in (year, author, publi, link)):
        raise ValueError("All lists must have the same length")

    # Pair each destination column with its incoming values.
    batch = (
        ('Paper Title', papername),
        ('Year', year),
        ('Author', author),
        ('Publication', publi),
        ('Url of paper', link),
    )
    for column, values in batch:
        paper_repos_dict[column].extend(values)

    return pd.DataFrame(paper_repos_dict)

In [None]:
# Paging settings. The step of 10 is kept from the original loop (presumably
# the number of results per page - confirm); MAX_RESULTS bounds the `start`
# offsets requested.
RESULTS_PER_PAGE = 10
MAX_RESULTS = 250
# Pause between consecutive page requests to reduce the chance of HTTP 429
# (rate limiting), which get_paperinfo turns into an exception.
REQUEST_DELAY_SECONDS = 2

# Start from an empty repository so that re-running this cell does not append
# the same pages a second time.
for column_values in paper_repos_dict.values():
    column_values.clear()

# Defined up front so `final` exists even if the loop body never runs.
final = pd.DataFrame(paper_repos_dict)

for i in range(0, MAX_RESULTS, RESULTS_PER_PAGE):

    # throttle every request after the first
    if i > 0:
        time.sleep(REQUEST_DELAY_SECONDS)

    # build the url for this page of results
    url = "https://scholar.google.com/scholar?start={}&q=tirzepatide&hl=en&as_sdt=0,5".format(i)

    # download and parse the page
    doc = get_paperinfo(url)

    # collect the tags of interest
    paper_tag, cite_tag, link_tag, author_tag = get_tags(doc)

    # paper titles on this page
    papername = get_papertitle(paper_tag)

    # year, publication and author of each paper
    year, publication, author = get_author_year_publi_info(author_tag)

    # url of each paper
    link = get_link(link_tag)

    # append to the repository; `final` always reflects everything gathered so far
    final = add_in_paper_repo(papername, year, author, publication, link)

In [None]:
# Keep at most the first 300 rows, write them to scrape.csv without the index
# column, and display the first five rows as the cell output.
csv = final.iloc[:300]
csv.to_csv(path_or_buf='scrape.csv', index=False)
csv.head(n=5)