In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import string

# Mount Google Drive inside the Colab runtime so files under /content/drive
# are readable and writable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

# Destination of the exported schedule table; the path lives inside the
# mounted Drive folder, so the mount above has to succeed first.
output_csv = "/content/drive/My Drive/crs_schedule_A_to_Z.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def fetch_table_data(letter, base_url, headers=None, timeout=30):
    """
    Download one schedule page and return the rows of its ``tbl_schedule`` table.

    Parameters
    ----------
    letter : str
        Value substituted into ``base_url``; also stored in every returned row
        under the key ``"Letter"``.
    base_url : str
        URL template with a single ``{}`` placeholder for ``letter``.
    headers : list of str, optional
        Column names found by an earlier call. When ``None`` they are read
        from the header row of the table on this page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    tuple
        ``(rows, headers)``. ``rows`` is a list of dicts, one per table row
        that has at least one ``<td>`` cell, or ``None`` when the page or the
        table could not be obtained. A row whose cell count differs from the
        header count is kept under the single key ``"Raw Data"``. ``headers``
        is the (possibly newly discovered) list of column names.
    """
    url = base_url.format(letter)
    print(f"Fetching data for letter: {letter} from URL: {url}")

    # Network failures are reported the same way as HTTP errors so that one
    # unreachable page does not abort a multi-letter scrape.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for {letter}: {exc}")
        return None, headers

    if response.status_code != 200:
        print(f"Failed to retrieve data for {letter}: HTTP {response.status_code}")
        return None, headers

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", id="tbl_schedule")
    if table is None:
        print(f"No table found for letter {letter}.")
        return None, headers

    if headers is None:
        # Prefer the <thead> row; fall back to the first <tr> of the table when
        # the page has no <thead> instead of failing on a None lookup.
        thead = table.find("thead")
        header_row = thead.find("tr") if thead is not None else table.find("tr")
        header_cells = header_row.find_all("th") if header_row is not None else []
        if not header_cells:
            print(f"No header row found for letter {letter}.")
            return None, headers
        headers = [th.get_text(separator=" ", strip=True) for th in header_cells]
        print("Table headers:", headers)

    rows = []
    for tbody in table.find_all("tbody"):
        for row in tbody.find_all("tr"):
            cell_texts = [
                cell.get_text(separator=" ", strip=True)
                for cell in row.find_all("td")
            ]
            if not cell_texts:
                # Rows without any <td> carry no data; skip them rather than
                # emitting an empty "Raw Data" record.
                continue
            if len(cell_texts) == len(headers):
                row_data = dict(zip(headers, cell_texts))
            else:
                # Cell count does not line up with the headers: keep the raw
                # cell texts so nothing is silently lost.
                row_data = {"Raw Data": cell_texts}
            row_data["Letter"] = letter
            rows.append(row_data)

    return rows, headers

def split_schedule_instructors_remarks(entry):
    """
    Split one 'Schedule Instructor(s) Remarks' value into three parts.

    - Schedule: the text up to AND including the first schedule-end marker
      ("Rm", "Room", "lec TBA", a standalone run of 3+ digits, or a semicolon
      form accepted by the pattern below).
    - Instructor(s): the first run of all-caps name matches after the schedule,
      joined with ", ". The run ends at a match containing the word "TBA" or
      as soon as any other word separates two matches, so all-caps tokens
      inside the remarks are not mistaken for instructors.
    - Remarks: the first "Hybrid" / "Most Online" / "AFA Type..." phrase found
      after the instructors, otherwise all text after the instructors
      (None when nothing is left).

    Parameters
    ----------
    entry : str or missing value
        Raw cell text.

    Returns
    -------
    pandas.Series
        ``[schedule, instructors, remarks]``. All three are None for a missing
        entry; when no schedule-end marker is found the whole stripped entry is
        returned as the schedule and the other two are None.
    """
    if pd.isnull(entry):
        return pd.Series([None, None, None])  # Handle missing data gracefully

    # NOTE(review): because of the trailing \b, the ';' and 'TBA;' alternatives
    # only match when a word character follows the semicolon directly (no
    # space). Confirm against real data whether '; ' should also end the schedule.
    schedule_end_match = re.search(r'\b(?:Rm|Room|lec TBA|\d{3,}|TBA;|;)\b', entry)
    if not schedule_end_match:
        # If no marker is found, treat the whole entry as Schedule
        return pd.Series([entry.strip(), None, None])

    schedule = entry[:schedule_end_match.end()].strip()   # marker stays in Schedule
    remaining = entry[schedule_end_match.end():].strip()  # everything after the marker

    # Collect consecutive all-caps name matches, remembering where the last
    # accepted one ends so Remarks can be sliced by position instead of by
    # searching for the re-joined names (which need not occur verbatim).
    instructor_names = []
    instructors_end = 0
    for name_match in re.finditer(r'\b(?:[A-Z]+(?:, [A-Z]+)?(?: [A-Z]+)*)\b', remaining):
        candidate = name_match.group(0)
        if re.search(r'\bTBA\b', candidate):  # placeholder, not a person
            break
        gap = remaining[instructors_end:name_match.start()]
        if instructor_names and re.search(r'\w', gap):
            # Some other word sits between this match and the previous name:
            # the run of instructor names is over.
            break
        instructor_names.append(candidate)
        instructors_end = name_match.end()

    instructors = ', '.join(instructor_names) if instructor_names else None

    # Text following the last accepted instructor (all of `remaining` when none).
    remaining_after_instructors = remaining[instructors_end:].strip()

    # Prefer a remark introduced by one of the known keywords
    remarks_match = re.search(r'\b(Hybrid|Most Online|AFA Type.*)\b', remaining_after_instructors)
    if remarks_match:
        remarks = remarks_match.group(0).strip()
    else:
        remarks = remaining_after_instructors if remaining_after_instructors else None

    return pd.Series([schedule, instructors, remarks])

base_url = "https://crs.upd.edu.ph/schedule/120241/{}"

# Letters whose schedule pages are scraped. Only 'A' is fetched here; switch to
# list(string.ascii_uppercase) to scrape every letter.
letters = ['A']

all_rows = []
headers = None  # discovered on the first successful page, reused afterwards

for letter in letters:
    rows, headers = fetch_table_data(letter, base_url, headers)
    if rows:
        all_rows.extend(rows)

# Fail with a clear message instead of an opaque KeyError on the first column
# lookup when nothing could be scraped.
if not all_rows:
    raise RuntimeError(
        "No schedule rows were fetched; check the URL, the term code and the network connection."
    )

df = pd.DataFrame(all_rows)

slots_column = "Available Slots / Total Slots"

# Collapse runs of whitespace left over from the HTML cell
df[slots_column] = df[slots_column].str.replace(r'\s+', ' ', regex=True).str.strip()

# Split "available / total" into two columns. n=1 plus reindex guarantees
# exactly two result columns even when a value has no separator or more than
# one, so the two-column assignment cannot fail on a column-count mismatch.
df[['Available Slots', 'Total Slots']] = (
    df[slots_column].str.split(' / ', n=1, expand=True).reindex(columns=[0, 1])
)

df['Section'] = df['Class'].str.split().str[-1]  # last word of Class
# Everything up to and including the first number in Class, e.g. "AI 201"
df['Course'] = df['Class'].str.extract(r'^([\D]*\d+)', expand=False)

# Break the combined column into its three parts
df[['Schedule', 'Instructor(s)', 'Remarks']] = df['Schedule Instructor(s) Remarks'].apply(split_schedule_instructors_remarks)

# Drop the combined source column and the bookkeeping "Letter" column.
# "Raw Data" (rows whose cell count did not match the headers) is kept.
df = df.drop(columns=['Schedule Instructor(s) Remarks', 'Letter'], errors='ignore')

print("\nUpdated DataFrame (first 5 rows):")
print(df.head())

Fetching data for letter: A from URL: https://crs.upd.edu.ph/schedule/120241/A
Table headers: ['Class Code', 'Class', 'Credits', 'Schedule Instructor(s) Remarks', 'Enlisting Unit : Block Block Remarks', 'Available Slots / Total Slots', 'Demand', 'Restrictions']

Updated DataFrame (first 5 rows):
  Class Code        Class Credits Enlisting Unit : Block Block Remarks  \
0      57852  AI 201 HZZQ     3.0                                  AIP   
1      57851  AI 201 TZZQ     3.0                                  AIP   
2      57854  AI 211 MZZQ     3.0                                  AIP   
3      57853   AI 211 WFZ     3.0                                  AIP   
4      57864  AI 221 TZZQ     3.0                                  AIP   

  Available Slots / Total Slots Demand Restrictions Raw Data Available Slots  \
0                        0 / 21      0     For: AIP      NaN               0   
1                        2 / 20      0     For: AIP      NaN               2   
2               OV

In [None]:
# Write the assembled schedule table to the CSV path on the mounted Drive;
# index=False keeps the positional row index out of the file.
df.to_csv(path_or_buf=output_csv, index=False)
print(f"\nData saved to '{output_csv}'.")

Mounted at /content/drive

Data saved to '/content/drive/My Drive/crs_schedule_A_to_Z.csv'.
