<a href="https://colab.research.google.com/github/mell00/gradcafe-stats/blob/main/gradstats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# Column order of the DataFrame returned by scrape_gradcafe().  Passing it
# explicitly keeps the schema stable even when no record was scraped.
_RECORD_COLUMNS = [
    "School",
    "Date_Posted",
    "Decision_Type",
    "Decision_Date",
    "GRE_Total",
    "GRE_V",
    "GRE_AW",
    "GPA",
    "Tags",
    "Comment",
]

# Patterns are applied to lower-cased tag text.  The total-score pattern
# requires the number to follow "gre" directly, so sub-score tags such as
# "GRE V 156" or "GRE AW 4.50" are not mistaken for the total.
_SCORE_PATTERNS = (
    ("GRE_Total", re.compile(r'\bgre\s+(\d+(?:\.\d+)?)\b')),
    ("GRE_V", re.compile(r'\bgre\s+v\s+(\d+(?:\.\d+)?)\b')),
    ("GRE_AW", re.compile(r'\bgre\s+aw\s+(\d+(?:\.\d+)?)\b')),
    ("GPA", re.compile(r'\bgpa\s+(\d+(?:\.\d+)?)\b')),
)


def _split_decision(decision_text):
    """Split e.g. "Accepted on 24 Dec" into ("Accepted", "24 Dec").

    Text without " on " is returned unchanged with an empty date.
    """
    decision_type, separator, decision_date = decision_text.partition(" on ")
    if not separator:
        return decision_text, ""
    return decision_type.strip(), decision_date.strip()


def _parse_score_tags(tags, log=None):
    """Extract GRE total / GRE V / GRE AW / GPA values from tag strings.

    Returns a dict keyed by the score column names; a score that is not
    found stays "".  When several tags match the same score, the last wins.
    `log`, if given, is called with one debug message per event.
    """
    scores = {column: "" for column, _ in _SCORE_PATTERNS}
    for tag_item in tags:
        if log is not None:
            log(f"DEBUG: Checking tag_item={tag_item!r}")
        text_lower = tag_item.lower()
        for column, pattern in _SCORE_PATTERNS:
            match = pattern.search(text_lower)
            if match:
                scores[column] = match.group(1)
                if log is not None:
                    log(f"DEBUG:  => Found {column}={scores[column]}")
    return scores


def _parse_result_rows(rows, log):
    """Convert the <tr> elements of one results table into record dicts.

    A result is a main row (at least 4 <td>), optionally followed by a
    "tw-border-none" tag row and then a "tw-border-none" comment row.
    """
    records = []
    i = 0
    while i < len(rows):
        log(f"\nDEBUG: Processing main row i={i} ...")

        main_row = rows[i]
        main_classes = main_row.get("class", [])
        log(f"DEBUG: main_row classes={main_classes}")

        tds = main_row.select("td")
        if len(tds) < 4:
            print("[!] Fewer than 4 <td> in main_row; skipping.")
            i += 1
            continue

        # Column 0 => school
        school_div = tds[0].select_one("div.tw-font-medium.tw-text-gray-900.tw-text-sm")
        school = school_div.get_text(strip=True) if school_div else ""

        # Column 2 => "Added On" date
        date_posted = tds[2].get_text(strip=True)

        # Column 3 => "Decision" text (like "Accepted on 24 Dec")
        decision_div = tds[3].select_one("div")
        decision_text = decision_div.get_text(strip=True) if decision_div else ""
        decision_type, decision_date = _split_decision(decision_text)

        log(f"DEBUG: School={school!r}, date_posted={date_posted!r}, "
            f"decision={decision_type!r}, decision_date={decision_date!r}")

        # Look ahead for the tag row and the optional comment row, and
        # advance i past every row that belongs to this result.
        tag_row = None
        comment_row = None
        if i + 1 < len(rows) and "tw-border-none" in rows[i + 1].get("class", []):
            tag_row = rows[i + 1]
            log(f"DEBUG: Found tag_row at i+1 => row index={i+1}")

            if i + 2 < len(rows) and "tw-border-none" in rows[i + 2].get("class", []):
                comment_row = rows[i + 2]
                log(f"DEBUG: Found comment_row at i+2 => row index={i+2}")
                i += 3
            else:
                i += 2
        else:
            log("DEBUG: No tw-border-none row next => increment i by 1.")
            i += 1

        # Gather tags from the tag row; each tag is a div.tw-inline-flex
        # inside the row's first cell.
        tags_text = []
        if tag_row:
            tag_tds = tag_row.select("td")
            if tag_tds:
                tag_divs = tag_tds[0].select("div.tw-inline-flex")
                log(f"DEBUG: Found {len(tag_divs)} div.tw-inline-flex in tag_row")
                for div in tag_divs:
                    text_val = div.get_text(strip=True)
                    log(f"DEBUG: => Tag: {text_val!r}")
                    tags_text.append(text_val)
        else:
            log("DEBUG: Skipping GRE parse => no tag_row")

        scores = _parse_score_tags(tags_text, log)

        # Comment text, if a comment row follows the tag row.
        comment_text = ""
        if comment_row:
            comment_tds = comment_row.select("td")
            if comment_tds:
                c_div = comment_tds[0].select_one("p.tw-text-gray-500.tw-text-sm.tw-my-0")
                if c_div:
                    comment_text = c_div.get_text(strip=True)
                    log(f"DEBUG: Found comment={comment_text!r}")

        records.append({
            "School":        school,
            "Date_Posted":   date_posted,
            "Decision_Type": decision_type,
            "Decision_Date": decision_date,
            "GRE_Total":     scores["GRE_Total"],
            "GRE_V":         scores["GRE_V"],
            "GRE_AW":        scores["GRE_AW"],
            "GPA":           scores["GPA"],
            "Tags":          tags_text,
            "Comment":       comment_text,
        })

    return records


def scrape_gradcafe(base_url="https://www.thegradcafe.com/survey/index.php",
                    max_pages=2, *, verbose=True):
    """
    Scrape GradCafe survey result pages into a pandas DataFrame.

    Pages ``{base_url}?p=1`` .. ``?p=max_pages`` are fetched in order.  For
    each result the school, posting date and decision are read from the main
    table row; GRE / GRE V / GRE AW / GPA scores are parsed from the tags in
    the following 'tag row' (class="tw-border-none"), and the comment from
    the optional row after that.

    Args:
        base_url: URL of the survey index page (without the ``p`` parameter).
        max_pages: Number of pages to fetch, starting at page 1.
        verbose: When True (the default) print progress banners and DEBUG
            lines.  "[!]" warnings are printed regardless.

    Returns:
        DataFrame with one row per result and the columns School,
        Date_Posted, Decision_Type, Decision_Date, GRE_Total, GRE_V, GRE_AW,
        GPA, Tags (list of tag strings) and Comment.  Values that were not
        found are empty strings.  The columns are present even when no
        result was scraped.

    A page that cannot be fetched (network error or non-200 status) or that
    has no results table is skipped; pages scraped so far are kept.
    """
    def _quiet(*_args, **_kwargs):
        """Swallow debug output when verbose is off."""

    log = print if verbose else _quiet
    all_data = []

    for page_num in range(1, max_pages + 1):
        # Polite delay between consecutive requests.  Done up front so it
        # also applies after a page that was skipped.
        if page_num > 1:
            time.sleep(1.0)

        page_url = f"{base_url}?p={page_num}"
        log("\n====================================")
        log(f"Scraping page {page_num}: {page_url}")
        log("====================================\n")

        # A single failed request must not discard the pages already scraped.
        try:
            response = requests.get(page_url, timeout=10)
        except requests.RequestException as exc:
            print(f"[!] Skipping page {page_num}, request failed: {exc}")
            continue
        if response.status_code != 200:
            print(f"[!] Skipping page {page_num}, status {response.status_code}")
            continue

        # Optional debug: see if "GRE" is even in the HTML
        if "GRE" in response.text:
            log("DEBUG: 'GRE' substring found in HTML for this page.")
        else:
            log("DEBUG: No 'GRE' substring found in HTML for this page.")

        soup = BeautifulSoup(response.text, "html.parser")

        # The main results table
        results_table = soup.select_one("table.tw-min-w-full.tw-divide-y.tw-divide-gray-300")
        if not results_table:
            print("[!] Could not locate results table. Possibly no results.")
            continue

        rows = results_table.select("tbody tr")
        log(f"DEBUG: Found {len(rows)} <tr> elements on page {page_num}.")

        all_data.extend(_parse_result_rows(rows, log))

    return pd.DataFrame(all_data, columns=_RECORD_COLUMNS)

if __name__ == "__main__":
    # Demo run: fetch the first three result pages.
    df_results = scrape_gradcafe(max_pages=3)

    # Banner, then a preview of at most the first 60 records and the total.
    print(
        "\n============================================",
        "Finished scraping.  DataFrame preview:\n",
        sep="\n",
    )
    print(df_results.iloc[:60])
    print(f"Total rows: {df_results.shape[0]}")

    # Write every record to a CSV file in the current working directory,
    # without the DataFrame index column.
    df_results.to_csv("gradcafe_with_GRE_GPA.csv", index=False)
    print("\nSaved gradcafe_with_GRE_GPA.csv.")



Scraping page 1: https://www.thegradcafe.com/survey/index.php?p=1

DEBUG: No 'GRE' substring found in HTML for this page.
DEBUG: Found 53 <tr> elements on page 1.

DEBUG: Processing main row i=0 ...
DEBUG: main_row classes=[]
DEBUG: School='Penn State University', date_posted='December 29, 2024', decision='Interview', decision_date='29 Dec'
DEBUG: Found tag_row at i+1 => row index=1
DEBUG: Found 4 div.tw-inline-flex in tag_row
DEBUG: => Tag: 'Interview on 29 Dec'
DEBUG: => Tag: 'Fall 2025'
DEBUG: => Tag: 'International'
DEBUG: => Tag: 'GPA 3.89'
DEBUG: Checking tag_item='Interview on 29 Dec'
DEBUG: Checking tag_item='Fall 2025'
DEBUG: Checking tag_item='International'
DEBUG: Checking tag_item='GPA 3.89'
DEBUG:  => Found GPA=3.89

DEBUG: Processing main row i=2 ...
DEBUG: main_row classes=[]
DEBUG: School='University of Delaware', date_posted='December 29, 2024', decision='Accepted', decision_date='23 Dec'
DEBUG: Found tag_row at i+1 => row index=3
DEBUG: Found comment_row at i+2 => ro