In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# export_to_csv's default output path lives under this mount point, so this
# cell has to run before the export step.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urlencode, urlparse, parse_qs, urlunparse
import requests
import csv
import time

In [28]:
# Listing page of questions carrying the "virtual-reality" tag.
URL = "https://stackoverflow.com/questions/tagged/virtual-reality"

# Site root; relative question hrefs are appended to it to form absolute links.
ANSWER_URL = "https://stackoverflow.com"

# Last listing page (inclusive) visited by scrape() by default.
PAGE_LIMIT = 119

# Request headers sent with every HTTP call; the User-Agent mimics a desktop browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

In [29]:
def build_url(base_url=URL, tab='active', page=1):
    """Return the listing URL for one page of a question tab.

    Args:
        base_url: URL the query string is appended to.
        tab: Value of the ``tab`` query parameter.
        page: Value of the ``page`` query parameter.

    Returns:
        ``base_url`` followed by the URL-encoded ``tab`` and ``page``
        parameters, e.g. ``<base_url>?tab=active&page=1``.
    """
    # urlencode escapes characters (spaces, '&', '#', ...) that would
    # otherwise corrupt the query string when interpolated verbatim.
    query = urlencode({"tab": tab, "page": page})
    # Extend an existing query string instead of emitting a second '?'.
    separator = "&" if "?" in base_url else "?"
    return f"{base_url}{separator}{query}"

In [30]:
def findAnswer(link, answers_count):
  """Fetch a question page and return the text of one of its answers.

  Args:
    link: Absolute URL of the question page.
    answers_count: Number of answers the question has; any value ``int()``
      accepts (the caller passes the count scraped as a string).

  Returns:
    The answer body text, or ``None`` when no matching answer body exists
    on the page. With fewer than two answers the first ``answercell`` on the
    page is used. Otherwise every ``post-layout`` block is inspected and the
    last one whose vote cell contains no ``d-none`` div wins.

  Raises:
    requests.exceptions.RequestException: The request failed, timed out, or
      returned an error status.
    ValueError: ``answers_count`` cannot be converted to ``int``.
  """
  # A timeout keeps one stalled connection from hanging the whole scrape.
  response = requests.get(link, headers=HEADERS, timeout=30)
  response.raise_for_status()

  soup = BeautifulSoup(response.text, "html.parser")
  # Stays None when nothing matches, instead of leaving the name unbound.
  answer_text = None

  if int(answers_count) < 2:
    answer_cell = soup.find("div", class_="answercell")
    answer_body = None
    if answer_cell is not None:
      answer_body = answer_cell.find("div", class_="s-prose js-post-body")
    if answer_body is not None:
      answer_text = answer_body.get_text()
  else:
    for post in soup.find_all("div", class_="post-layout"):
      vote_cell = post.find("div", class_="votecell post-layout--left")
      if vote_cell is None:
        # Post without a vote cell: nothing to inspect, skip it.
        continue
      # NOTE(review): a vote cell with no "d-none" div is presumably the
      # accepted answer (visible tick) - confirm against the live markup.
      if vote_cell.find("div", class_="d-none") is not None:
        continue
      answer_body = post.find("div", class_="s-prose js-post-body")
      if answer_body is not None:
        answer_text = answer_body.get_text()
  return answer_text

In [31]:
def finDesc(link):
  """Fetch a question page and return the text of the question body.

  Args:
    link: Absolute URL of the question page.

  Returns:
    Text of the ``s-prose js-post-body`` div inside the first
    ``postcell post-layout--right`` div, or ``None`` when either element is
    missing from the page.

  Raises:
    requests.exceptions.RequestException: The request failed, timed out, or
      returned an error status.
  """
  # A timeout keeps one stalled connection from hanging the whole scrape.
  response = requests.get(link, headers=HEADERS, timeout=30)
  response.raise_for_status()

  soup = BeautifulSoup(response.text, "html.parser")
  question_cell = soup.find("div", class_="postcell post-layout--right")
  if question_cell is None:
    return None
  question_body = question_cell.find("div", class_="s-prose js-post-body")
  if question_body is None:
    return None
  return question_body.get_text()

In [32]:
def scrape_page(page=1, retries=500):
    """Scrape one listing page and return its questions that show a checkmark.

    For every question summary carrying the checkmark icon, the question page
    is fetched (once by ``finDesc`` and once by ``findAnswer``) to collect the
    description and answer text.

    Args:
        page: Listing page number, passed to ``build_url``.
        retries: Maximum number of attempts for the page.

    Returns:
        A list of dicts with the keys ``'Question'``, ``'Desciption'``,
        ``'Answer'`` and ``'Answer count'`` - the same names that
        ``export_to_csv`` uses as CSV columns.

    Raises:
        requests.exceptions.HTTPError: An HTTP error other than 429 occurred.
        Exception: Every attempt failed.
    """
    max_backoff = 60      # seconds; caps 2 ** i, which is unbounded over `retries` attempts
    request_timeout = 30  # seconds; a stalled connection becomes a retryable error

    url = build_url(page=page)
    print(url)
    # The context manager releases the session's connections on every exit path.
    with requests.Session() as session:
        for i in range(retries):
            try:
                response = session.get(url, headers=HEADERS, timeout=request_timeout)
                response.raise_for_status()

                page_questions = []
                soup = BeautifulSoup(response.text, "html.parser")

                for summary in soup.find_all("div", class_="s-post-summary"):
                    # Keep only summaries that display the checkmark icon.
                    if not summary.find_all("svg", class_="svg-icon iconCheckmarkSm"):
                        continue

                    question_element = summary.find("a", class_="s-link")
                    if question_element is None or not question_element.get("href"):
                        # No link to follow; skip rather than request "<root>None".
                        continue

                    # The second stats number is read as the answer count;
                    # fall back to "0" when the summary has fewer than two.
                    stats = summary.find_all("span", class_="s-post-summary--stats-item-number")
                    answers_count = stats[1].text.strip() if len(stats) > 1 else "0"

                    link = f"{ANSWER_URL}{question_element['href']}"
                    desciption = finDesc(link)
                    answer_text = findAnswer(link, answers_count)
                    # Store everything that was fetched; the keys are spelled
                    # exactly like the fieldnames export_to_csv declares.
                    page_questions.append({
                        'Question': question_element.get_text(strip=True),
                        'Desciption': desciption,
                        'Answer': answer_text,
                        'Answer count': answers_count,
                    })
                return page_questions
            except requests.exceptions.HTTPError as err:
                if err.response.status_code == 429:
                    # Retry-After may also be an HTTP date; wait 1 second
                    # when it is not a plain integer.
                    try:
                        retry_after = int(err.response.headers.get("Retry-After", 1))
                    except ValueError:
                        retry_after = 1
                    print(f"Rate limited. Retrying after {retry_after} seconds.")
                    time.sleep(retry_after)
                else:
                    raise
            except requests.exceptions.RequestException as e:
                delay = min(2 ** i, max_backoff)  # capped exponential backoff
                print(f"Request failed ({e}). Retrying after {delay} seconds.")
                time.sleep(delay)
    raise Exception("Failed to fetch the page after several retries")

In [40]:
def scrape(page_limit=PAGE_LIMIT, start_page=100):
    """
    Scrape listing pages ``start_page`` through ``page_limit`` (inclusive).

    Args:
        page_limit: Last page number to scrape.
        start_page: First page number to scrape. It defaults to 100, the
            value that used to be hard-coded in the loop; pass 1 to scrape
            from the beginning.

    Returns:
        One flat list with the question dicts returned by ``scrape_page``
        for every page, in page order. Empty when ``start_page`` is greater
        than ``page_limit``.
    """
    questions = []
    for page_number in range(start_page, page_limit + 1):
        print(f"Scraping page {page_number}...")
        page_questions = scrape_page(page_number)
        if not page_questions:
            print(f"No questions found on page {page_number}.")
        questions.extend(page_questions)
    return questions

In [34]:
def export_to_csv(data, filename="/content/drive/MyDrive/LLM Projects/Data Extraction of stack overflow/VR.csv"):
    """Write the scraped records to a UTF-8 CSV file.

    Args:
        data: Iterable of dicts, one per row. Keys missing from a dict are
            written as empty cells; keys outside the column list make
            ``csv.DictWriter`` raise ``ValueError``.
        filename: Destination path; an existing file is overwritten.
    """
    columns = ["Question", "Desciption", "Answer", "Answer count"]
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(data)

In [41]:
# Entry point of the notebook: scrape with the default arguments, show the
# result, then persist it. `scraped_data` stays available as a notebook-level
# variable after the cell finishes.
if __name__ == "__main__":
    scraped_data = scrape()
    # Pretty-print the list of scraped question dicts for a quick visual check
    pprint(scraped_data)
    # Write the scraped data to export_to_csv's default CSV path
    export_to_csv(scraped_data)

[]
