In [1]:
import pandas as pd

def extract_table(table):
    """Convert one HTML course table into a pandas DataFrame.

    Every ``<tr>`` found in ``table`` is inspected and kept only when it
    holds at least 13 ``<td>`` cells. No other filtering is applied: a
    header row that is itself built from ``<td>`` cells comes back as an
    ordinary data row and has to be removed by the caller.

    Parameters
    ----------
    table
        Object exposing ``find_all('tr')`` whose rows expose
        ``find_all('td')`` and whose cells expose ``get_text(strip=True)``
        (for example a BeautifulSoup ``Tag``).

    Returns
    -------
    pandas.DataFrame
        One row per accepted ``<tr>``, with the string columns CRN,
        Course Number, Title, Credits, Day/Time/Room, Instructor, FY, Max,
        Current and Wait. Empty (but with those columns) when no row
        qualifies.
    """
    column_names = ['CRN', 'Course Number', 'Title', 'Credits', 'Day/Time/Room', 'Instructor', 'FY', 'Max', 'Current', 'Wait']
    # Position of each wanted cell inside a row, aligned one-to-one with
    # column_names. Cells 0, 3 and 5 are never read.
    cell_positions = (1, 2, 4, 6, 7, 8, 9, 10, 11, 12)
    # Rows with fewer cells than this are ignored entirely.
    min_cells = 13

    records = []
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < min_cells:
            continue
        records.append([tds[pos].get_text(strip=True) for pos in cell_positions])

    return pd.DataFrame(records, columns=column_names)

In [3]:
import requests
from bs4 import BeautifulSoup

# URL of the course guide page to scrape
url = "https://portal.simons-rock.edu/cg/Fall2024CourseGuide.php"

# requests applies no timeout unless one is passed, so an unresponsive server
# would block this cell forever; give up after this many seconds instead.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch the page. Connection errors and timeouts propagate as requests
# exceptions so the failure shows up in the cell output.
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
# Running count of tables written so far; also used as the CSV file name
tb_idex = 0
# Only parse the body when the server answered 200 OK
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Every <table> on the page except the first one is processed.
    # NOTE(review): presumably the first table is not course data - confirm.
    tables = soup.find_all('table')[1:]
    for table in tables:
        df = extract_table(table)
        # One file per table (0.csv, 1.csv, ...) in the current working
        # directory; the DataFrame index is written as the first column.
        df.to_csv(str(tb_idex)+'.csv')
        tb_idex +=1
else:
    print("Failed to retrieve the page content.")


In [3]:
import os

# File this cell writes at the end. It must never be read back as an input:
# otherwise every re-run would merge the previous result into the new one and
# duplicate its rows.
output_name = 'dataset1.csv'
current_directory = os.getcwd()


def _csv_order(file_name):
    """Sort key: numerically named files (0.csv, 1.csv, ...) first, in numeric order."""
    stem = os.path.splitext(file_name)[0]
    if stem.isdecimal():
        return (0, int(stem), file_name)
    return (1, 0, file_name)


# os.listdir returns entries in arbitrary order, so sort them to make the
# merged row order reproducible between runs.
csv_files = sorted(
    (file for file in os.listdir(current_directory)
     if file.endswith('.csv') and file != output_name),
    key=_csv_order,
)
if not csv_files:
    raise FileNotFoundError(
        "No input .csv files found in " + current_directory
    )

# Read every input (first column is the saved index) and concatenate once,
# rather than growing the frame inside the loop.
frames = [
    pd.read_csv(os.path.join(current_directory, csv_file), index_col=0)
    for csv_file in csv_files
]
merged_df = pd.concat(frames, ignore_index=True)

# Drop every row in which any cell equals one of the column titles, i.e. the
# header rows that were captured as data.
header_values = ['CRN', 'Course Number', 'Title', 'Credits', 'Day/Time/Room', 'Instructor', 'FY', 'Max', 'Current', 'Wait']
df_cleaned = merged_df[~merged_df.isin(header_values).any(axis=1)]
df_cleaned.to_csv(output_name)