# All events in a single file

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Base URL
BASE_URL = "https://www.essexstudent.com"

def scrape_event_cards(timeout=30):
    """Fetch the "/whatson/" page and extract one record per event card.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the run indefinitely.

    Returns:
        A list of dicts, one per ``div.event_item`` card, with the keys
        'page_title', 'title', 'date', 'location', 'description' and
        'ticket_details'. Cards that contain a time element additionally
        carry an empty 'time' key. Returns an empty list when the HTTP
        request fails.
    """
    try:
        # Only the network call can raise RequestException, so keep the try
        # body to exactly that. A timeout surfaces as requests.Timeout, which
        # is a RequestException and is therefore reported below as well.
        response = requests.get(BASE_URL + "/whatson/", timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching main page: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    def text_of(node):
        """Return the stripped text of a tag, or '' when the tag is missing."""
        return node.get_text(strip=True) if node else ''

    # Find all event cards
    event_cards = soup.find_all('div', class_='event_item')
    print(f"Found {len(event_cards)} event cards")

    # The page title is identical for every card, so look it up once.
    page_title = text_of(soup.find('title'))

    events = []
    for card in event_cards:
        event_details = {
            'page_title': page_title,
            'title': '',
            'date': '',
            'location': '',
            'description': '',
            'ticket_details': 'N/A'  # Ticket details not available in cards
        }

        # All per-event fields live inside the card's <dl> element.
        details = card.find('dl')
        if details:
            event_details['title'] = text_of(
                details.find('a', class_='msl_event_name'))

            # Date and time arrive as one combined string; it is stored whole
            # under 'date'. The empty 'time' key is kept so the returned
            # records have the same shape as before.
            time_tag = details.find('dd', class_='msl_event_time')
            if time_tag:
                event_details['date'] = text_of(time_tag)
                event_details['time'] = ''

            event_details['location'] = text_of(
                details.find('dd', class_='msl_event_location'))
            event_details['description'] = text_of(
                details.find('dd', class_='msl_event_description'))

        events.append(event_details)

    return events

def save_to_csv(events):
    """Write the scraped events to ``events.csv`` in the working directory.

    Args:
        events: Iterable of event dicts carrying the keys 'page_title',
            'title', 'date', 'location', 'description' and 'ticket_details'.
            Any other keys are ignored.

    The file is overwritten on every call and always starts with a header row.
    """
    # CSV column header -> key in the scraped event dict, in output order.
    column_sources = {
        'Page Title': 'page_title',
        'Title': 'title',
        'Date and Time': 'date',
        'Location': 'location',
        'Description': 'description',
        'Ticket Details': 'ticket_details',
    }

    with open('events.csv', 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=list(column_sources))
        writer.writeheader()
        for record in events:
            writer.writerow(
                {header: record[key] for header, key in column_sources.items()}
            )

def main():
    """Scrape the events page and, if anything came back, write events.csv."""
    scraped = scrape_event_cards()

    # An empty list means either no cards on the page or a failed request;
    # in both cases there is nothing to write.
    if not scraped:
        print("No events found or error occurred")
        return

    save_to_csv(scraped)
    print(f"Saved {len(scraped)} events to events.csv")

# Run the scrape only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Found 97 event cards
Saved 97 events to events.csv


# Clean Data
- Merge recurring events
  - Single titles
  - Multiple dates, multiple venues

- Separate date from time
  - Standardise dates


- Create a composite description column

In [None]:
import pandas as pd
import re
from datetime import datetime

# Read the CSV file (events.csv is the file written by the scraping cell's
# save_to_csv).
# NOTE(review): with pandas' default NA handling, empty cells and the literal
# 'N/A' ticket placeholder are presumably loaded as NaN -- confirm before
# relying on those columns as strings.
df = pd.read_csv('events.csv')

# Step 1: Merge recurring events by Title
def merge_recurring_events(df):
    """Collapse rows that share a Title into a single row per event.

    Args:
        df: DataFrame with the columns 'Title', 'Date and Time', 'Location',
            'Description', 'Ticket Details' and 'Page Title'.

    Returns:
        A new DataFrame with one row per distinct Title where
        * 'Date and Time' is every non-missing occurrence joined by '; ',
        * 'Location' is the sorted, de-duplicated venues joined by ', ',
        * 'Description', 'Ticket Details' and 'Page Title' take the first
          value pandas' 'first' aggregation yields for the group.
    """
    def join_present(values, separator, unique=False):
        """Join the non-missing cells of a group as text."""
        # Missing cells come back from the CSV as NaN (a float); str.join
        # rejects non-strings, so drop them before joining.
        present = values.dropna().astype(str)
        if unique:
            present = sorted(set(present))
        return separator.join(present)

    # Group by Title and aggregate necessary fields
    merged_df = df.groupby('Title').agg({
        'Date and Time': lambda x: join_present(x, '; '),
        'Location': lambda x: join_present(x, ', ', unique=True),
        'Description': 'first',  # Take only the first description
        'Ticket Details': 'first',  # Take only the first ticket details
        'Page Title': 'first'
    }).reset_index()
    return merged_df

# Rebind df to the merged frame: one row per distinct Title from here on.
df = merge_recurring_events(df)

# Step 2: Separate date from time and aggregate them
def parse_date_time(date_time_str):
    """Split a combined date/time string into parallel date and time lists.

    The input may hold several occurrences separated by ';'. Each occurrence
    is either a single "<day> <month> <time>" (e.g. "13th May 7pm") or a
    range written with ' - ' (e.g. "20th May noon - 22nd May midnight"), of
    which the first two parts are read.

    Args:
        date_time_str: The combined text. Anything that is not a string
            (for example NaN from an empty CSV cell) yields two empty lists.

    Returns:
        A ``(dates, times)`` tuple of equal-length lists. ``dates`` holds the
        leading "<day>[suffix] <word>" of every part that starts that way;
        ``times`` holds whatever text followed it (possibly ''). Parts that do
        not start with a day number followed by a space and a word are skipped.
    """
    if not isinstance(date_time_str, str):
        return [], []

    # "<1-2 digit day><optional ordinal> <word>" followed by the remainder.
    date_then_rest = re.compile(r'(\d{1,2}(?:st|nd|rd|th)? \w+)(.*)')

    dates = []
    times = []
    # Split by semicolon first to handle multiple date-time entries
    for entry in date_time_str.split(';'):
        # A single-day entry contains no ' - ', so the split leaves it as one
        # part; a range contributes its start and end. Anything after the
        # second part is ignored.
        for part in entry.strip().split(' - ')[:2]:
            match = date_then_rest.match(part.strip())
            if match:
                date_text, time_text = match.groups()
                dates.append(date_text.strip())
                times.append(time_text.strip())

    return dates, times

# Apply date-time parsing and aggregate
def aggregate_dates_times(df):
    """Add 'Dates' and 'Times' columns derived from 'Date and Time'.

    Every 'Date and Time' cell is run through ``parse_date_time``; the
    non-empty dates and the non-empty times are each joined with ', '.
    The frame is modified in place and also returned.
    """
    parsed = df['Date and Time'].apply(parse_date_time)

    # parse_date_time returns (dates, times): index 0 feeds 'Dates', 1 'Times'.
    for column, position in (('Dates', 0), ('Times', 1)):
        df[column] = parsed.apply(
            lambda pair, i=position: ', '.join(filter(None, pair[i]))
        )
    return df

# Adds the 'Dates' and 'Times' columns (df is also mutated in place).
df = aggregate_dates_times(df)

# Step 3: Standardize dates
def standardize_date(date_str, year=2025):
    """Normalise a day-and-month string to "<day><suffix> <Month> <year>".

    Args:
        date_str: Day and month name, with or without an ordinal suffix,
            e.g. "20th May", "2 June" or "3rd Jun".
        year: Year to attach, since the input carries none. Defaults to 2025.

    Returns:
        The normalised text with an unpadded day and the full month name
        (e.g. "2nd June 2025"), or None when the input is empty, is not a
        string, or cannot be parsed as a day and month.
    """
    if not date_str:
        return None

    try:
        # Remove ordinal indicators (st, nd, rd, th)
        bare = re.sub(r'(\d{1,2})(st|nd|rd|th)', r'\1', date_str)
    except TypeError:
        # Non-string input such as NaN.
        return None

    # Accept the full month name first, then the abbreviated form.
    parsed_date = None
    for month_code in ('%B', '%b'):
        try:
            parsed_date = datetime.strptime(f'{bare} {year}', f'%d {month_code} %Y')
            break
        except ValueError:
            continue
    if parsed_date is None:
        return None

    day = parsed_date.day
    # Determine ordinal suffix; 11th-13th are the exceptions to 1st/2nd/3rd.
    if 10 <= day % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')

    # Format the day from the integer: strftime's %d would zero-pad it and
    # produce "02nd June 2025" instead of "2nd June 2025".
    return f"{day}{suffix} {parsed_date.strftime('%B %Y')}"

# Apply standardization to each date in the Dates column.
# standardize_date returns None for text it cannot parse; str.join rejects
# None, so fall back to the raw text instead of crashing or losing the entry.
df['Dates'] = df['Dates'].apply(
    lambda x: ', '.join([standardize_date(d) or d for d in x.split(', ') if d])
)

# Step 4: Create composite description column
def create_composite_description(row):
    """Build "Title | Description | Ticket Details" for one row.

    The title is always included; the description and the ticket details are
    appended only when they are neither missing (NaN/None) nor empty.
    """
    parts = [row['Title']]
    for column in ('Description', 'Ticket Details'):
        value = row[column]
        if pd.isna(value) or not value:
            continue
        parts.append(value)
    return ' | '.join(parts)

# Build the combined text column; axis=1 hands each row to the helper.
df['Composite Description'] = df.apply(create_composite_description, axis=1)

# Drop unnecessary columns: 'Date and Time' has already been split into
# 'Dates'/'Times', and 'Ticket Details' is folded into the composite column.
df = df.drop(columns=['Date and Time', 'Ticket Details'])

# Reorder columns for clarity.
# NOTE: this selection also leaves out 'Description', which from here on
# survives only inside 'Composite Description'.
df = df[['Title', 'Composite Description', 'Location', 'Dates', 'Times', 'Page Title']]

# Save the cleaned data to a new CSV file (index=False omits the row index)
df.to_csv('cleaned_events.csv', index=False)

# Display the first few rows of the cleaned dataframe
print(df.head())

                                  Title  \
0                        7th SU Council   
1            A Rebel's Guide to Gramsci   
2                         All Out Essex   
3              Alwan Society Take Over!   
4  Anti-Social & Introverts Social Club   

                               Composite Description Location  \
0  7th SU Council | Elected representatives from ...            
1  A Rebel's Guide to Gramsci | The Italian Marxi...            
2  All Out Essex | An end-of-year talent showcase...            
3  Alwan Society Take Over! | Create your event, ...            
4  Anti-Social & Introverts Social Club | A low-k...            

                           Dates           Times  Page Title  
0   20th May 2025, 22nd May 2025  6:30pm, 1:15pm         NaN  
1                  13th May 2025             7pm         NaN  
2                  14th May 2025          6:30pm         NaN  
3                  24th May 2025             9pm         NaN  
4  15th May 2025, 10th June 2025   

# Events by month

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import re
import os

# Base URL
BASE_URL = "https://www.essexstudent.com"

def parse_month(date_string):
    """Derive a "<Month>_<year>" label (e.g. "January_2025") from date text.

    Strict date layouts are tried first. If none fits, the first month name
    or abbreviation found anywhere in the text is used together with the
    first four-digit number as the year (the current year when there is
    none). Returns None when no month can be found or when an unexpected
    error occurs; the error is printed in the latter case.
    """
    # Layouts such as "January 15, 2025", "15 Jan 2025", "Jan 15, 2025".
    known_layouts = (
        "%B %d, %Y", "%d %B %Y", "%b %d, %Y", "%d %b %Y",
        "%B %d %Y", "%d %B, %Y", "%b %d %Y", "%d %b, %Y"
    )
    # Full names are listed before abbreviations so that e.g. "march" is
    # taken whole rather than as "mar".
    month_pattern = (
        r'january|february|march|april|may|june|july|august|september|october|november|december|'
        r'jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec'
    )
    abbreviations = {
        'jan': 'January', 'feb': 'February', 'mar': 'March', 'apr': 'April',
        'may': 'May', 'jun': 'June', 'jul': 'July', 'aug': 'August',
        'sep': 'September', 'oct': 'October', 'nov': 'November', 'dec': 'December'
    }

    try:
        # Collapse runs of whitespace so the strict layouts can match.
        date_string = re.sub(r'\s+', ' ', date_string.strip())

        for layout in known_layouts:
            try:
                return datetime.strptime(date_string, layout).strftime("%B_%Y")
            except ValueError:
                pass

        # Fallback: pick out a month token from free-form text.
        found = re.search(month_pattern, date_string.lower())
        if found is None:
            return None

        token = found.group()
        month_name = abbreviations.get(token, token.capitalize())

        year_found = re.search(r'\d{4}', date_string)
        if year_found:
            year = year_found.group()
        else:
            year = datetime.now().strftime("%Y")
        return f"{month_name}_{year}"
    except Exception as e:
        print(f"Error parsing date '{date_string}': {e}")
        return None

def scrape_event_cards(timeout=30):
    """Fetch the "/whatson/" page and extract one record per event card.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the run indefinitely.

    Returns:
        A list of dicts, one per ``div.event_item`` card, with the keys
        'page_title', 'title', 'date', 'location', 'description',
        'ticket_details' and 'month_year'. 'month_year' is the label produced
        by ``parse_month`` for the card's date text, 'Unknown' when that text
        cannot be parsed, and '' when the card has no time element. Returns
        an empty list when the HTTP request fails.
    """
    try:
        # Only the network call can raise RequestException, so keep the try
        # body to exactly that. A timeout surfaces as requests.Timeout, which
        # is a RequestException and is therefore reported below as well.
        response = requests.get(BASE_URL + "/whatson/", timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching main page: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    def text_of(node):
        """Return the stripped text of a tag, or '' when the tag is missing."""
        return node.get_text(strip=True) if node else ''

    # Find all event cards
    event_cards = soup.find_all('div', class_='event_item')
    print(f"Found {len(event_cards)} event cards")

    # The page title is identical for every card, so look it up once.
    page_title = text_of(soup.find('title'))

    events = []
    for card in event_cards:
        event_details = {
            'page_title': page_title,
            'title': '',
            'date': '',
            'location': '',
            'description': '',
            'ticket_details': 'N/A',
            'month_year': ''  # To store parsed month and year
        }

        # All per-event fields live inside the card's <dl> element.
        details = card.find('dl')
        if details:
            event_details['title'] = text_of(
                details.find('a', class_='msl_event_name'))

            # The combined date/time text is stored whole and also used to
            # derive the month bucket for the per-month CSV files.
            time_tag = details.find('dd', class_='msl_event_time')
            if time_tag:
                time_text = text_of(time_tag)
                event_details['date'] = time_text
                event_details['month_year'] = parse_month(time_text) or 'Unknown'

            event_details['location'] = text_of(
                details.find('dd', class_='msl_event_location'))
            event_details['description'] = text_of(
                details.find('dd', class_='msl_event_description'))

        events.append(event_details)

    return events

def save_to_csv(events):
    """Write the events to one CSV file per month under ``events_by_month/``.

    Args:
        events: Iterable of event dicts carrying the keys 'page_title',
            'title', 'date', 'location', 'description', 'ticket_details' and
            'month_year'.

    Each distinct 'month_year' value becomes
    ``events_by_month/events_<month_year>.csv``; events whose 'month_year' is
    empty or absent are written to ``events_Unknown.csv``. The directory is
    created if needed, existing files are overwritten, and one summary line
    per file is printed.
    """
    # Group events by month_year
    events_by_month = {}
    for event in events:
        # Cards without a time element carry '' here; bucket them with the
        # unparseable ones instead of producing a file named "events_.csv".
        month_year = event.get('month_year') or 'Unknown'
        events_by_month.setdefault(month_year, []).append(event)

    # Ensure output directory exists
    os.makedirs('events_by_month', exist_ok=True)

    # Save each month's events to a separate CSV
    fieldnames = ['Page Title', 'Title', 'Date and Time', 'Location', 'Description', 'Ticket Details']
    for month_year, month_events in events_by_month.items():
        filename = f"events_by_month/events_{month_year}.csv"
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(
                {
                    'Page Title': event['page_title'],
                    'Title': event['title'],
                    'Date and Time': event['date'],
                    'Location': event['location'],
                    'Description': event['description'],
                    'Ticket Details': event['ticket_details'],
                }
                for event in month_events
            )
        print(f"Saved {len(month_events)} events to {filename}")

def main():
    """Scrape the events page and split the results into per-month CSV files."""
    scraped = scrape_event_cards()

    # An empty list means either no cards on the page or a failed request;
    # in both cases there is nothing to write.
    if not scraped:
        print("No events found or error occurred")
        return

    save_to_csv(scraped)
    print(f"Processed {len(scraped)} events into monthly CSV files")

# Run the scrape only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Found 118 event cards
Saved 1 events to events_by_month/events_April_2025.csv
Saved 61 events to events_by_month/events_May_2025.csv
Saved 42 events to events_by_month/events_June_2025.csv
Saved 8 events to events_by_month/events_July_2025.csv
Saved 4 events to events_by_month/events_August_2025.csv
Saved 2 events to events_by_month/events_September_2025.csv
Processed 118 events into monthly CSV files
