In [4]:
import os
import re
import pandas as pd
import csv
import numpy as np
import requests
from bs4 import BeautifulSoup

In [5]:
# Function to extract title from HTML
def extract_title(soup):
    return soup.title.string if soup.title else ''

# Function to extract meta description from HTML
def extract_meta_description(soup):
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    return meta_tag.get('content', '') if meta_tag else ''

# Function to extract text from specific tags in HTML
def extract_tag_content(soup, tag):
    return ' '.join([element.get_text(" ", strip=True) for element in soup.find_all(tag)])

# Function to extract date and location information from HTML
def extract_date_location(soup):
    date = ''
    location = ''
    
    # Example: Extracting date from a specific tag with a class attribute
    date_tag = soup.find('div', class_='field-hs-event-date')
    if date_tag:
        date_label = date_tag.find('div', class_='field-label')
        if date_label:
            date_label.decompose()  # Remove the label
        date = date_tag.get_text(" ", strip=True)
    
    # Example: Extracting location from a specific tag with a class attribute
    location_tag = soup.find('div', class_='field-hs-event-location')
    if location_tag:
        location_label = location_tag.find('div', class_='field-label')
        if location_label:
            location_label.decompose()  # Remove the label
        location = location_tag.get_text(" ", strip=True)
    
    return date, location

# Function to extract all useful text from HTML and return as separate content
def extract_text_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        
        title = extract_title(soup)
        meta_description = extract_meta_description(soup)
        h1_content = extract_tag_content(soup, 'h1')
        h2_content = extract_tag_content(soup, 'h2')
        h3_content = extract_tag_content(soup, 'h3')
        h4_content = extract_tag_content(soup, 'h4')
        h5_content = extract_tag_content(soup, 'h5')
        h6_content = extract_tag_content(soup, 'h6')
        
        # Remove unwanted sections by specific class or id
        unwanted_sections = soup.select(
            '#connect, #block-humsci-colorful-hs-contactus, #block-humsci-colorful-hsaffiliationfooter, \
             .field-hs-event-contact-email, .field-hs-event-contact-phone, .field-hs-event-link'
        )
        for unwanted in unwanted_sections:
            unwanted.decompose()
        
        # Extracting paragraph content without specific sections like "About Us"
        paragraphs = soup.find_all('p')
        unwanted_keywords = ["About Us", "Contact", "Make a Gift", "Subscribe to the CESTA Newsletter",
                             "Our Team", "Affiliated Faculty and Researchers", "Affiliated Graduate Students",
                             "YouTube Channel", "Facebook", "Twitter", "Instagram", "Campus Map", "Muwekma Ohlone Tribe"]
        main_paragraphs = []
        for paragraph in paragraphs:
            paragraph_text = paragraph.get_text(" ", strip=True)
            if not any(keyword in paragraph_text for keyword in unwanted_keywords):
                main_paragraphs.append(paragraph_text)
        p_content = ' '.join(main_paragraphs)
        
        date, location = extract_date_location(soup)
        
        return title, meta_description, h1_content, h2_content, h3_content, h4_content, h5_content, h6_content, p_content, date, location

# Path to the HTML file you want to test
file_path = "/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/events/event_html/1716323825-87.html"

# Extract the text
title, meta_description, h1_content, h2_content, h3_content, h4_content, h5_content, h6_content, p_content, date, location = extract_text_from_html(file_path)

# Print the extracted content
print(f"Title: {title}")
print(f"Meta Description: {meta_description}")
print(f"H1 Content: {h1_content}")
print(f"H2 Content: {h2_content}")
print(f"H3 Content: {h3_content}")
print(f"H4 Content: {h4_content}")
print(f"H5 Content: {h5_content}")
print(f"H6 Content: {h6_content}")
print(f"Paragraph Content: {p_content}")
print(f"Date: {date}")
print(f"Location: {location}")


Title: Computation and Culture in the Era of Digital Media: a transdisciplinary roundtable | Center for Spatial and Textual Analysis
Meta Description: How do recent developments in Natural Language Processing (NLP), particularly in the area of large language models, challenge existing disciplinary specializations, practices, and boundaries? How does it change how we understand culture, literature and communication? How might future advances in NLP change our relationship to language, narrative, meaning, and one another? A transdisciplinary roundtable will consider these questions and more. The roundtable will feature:
H1 Content: Computation and Culture in the Era of Digital Media: a transdisciplinary roundtable
H2 Content: Events About People Connect With Us Contact Us
H3 Content: 
H4 Content: 
H5 Content: 
H6 Content: 
Paragraph Content: How do recent developments in Natural Language Processing (NLP), particularly in the area of large language models, challenge existing disciplinary 

In [6]:
# Function to process all HTML files in a folder and save to CSV
def process_html_files(folder_path, output_csv):
    data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.html'):
            print(file_name)
            file_path = os.path.join(folder_path, file_name)
            title, meta_description, h1_content, h2_content, h3_content, h4_content, h5_content, h6_content, p_content, date, location = extract_text_from_html(file_path)
            data.append({
                'File Name': file_name,
                'Title': title,
                'Meta Description': meta_description,
                'H1 Content': h1_content,
                'H2 Content': h2_content,
                'H3 Content': h3_content,
                'H4 Content': h4_content,
                'H5 Content': h5_content,
                'H6 Content': h6_content,
                'Paragraph Content': p_content,
                'Date': date,
                'Location': location
            })
    
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['File Name', 'Title', 'Meta Description', 'H1 Content', 'H2 Content', 'H3 Content', 'H4 Content', 'H5 Content', 'H6 Content', 'Paragraph Content', 'Date', 'Location']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

# Path to the folder containing HTML files
folder_path = "/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/events/event_html"
# Path to the output CSV file
output_csv = "/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/events/event_data.csv"

# Process the HTML files and save to CSV
process_html_files(folder_path, output_csv)

print("Data extraction complete. Check the CSV file for results.")

1716323825-158.html
1716323825-271.html
1716323825-334.html
1716323825-226.html
1716323825-363.html
1716323825-119.html
1716323825-230.html
1716323825-375.html
1716323825-267.html
1716323825-288.html
1716323825-322.html
1716323825-135.html
1716323825-359.html
1716323825-60.html
1716323825-162.html
1716323825-37.html
1716323825-174.html
1716323825-318.html
1716323825-21.html
1716323825-123.html
1716323825-99.html
1716323825-76.html
1716323825-292.html
1716323825-338.html
1716323825-154.html
1716323825-380.html
1716323825-56.html
1716323825-103.html
1716323825-40.html
1716323825-396.html
1716323825-379.html
1716323825-400.html
1716323825-115.html
1716323825-17.html
1716323825-284.html
1716323825-142.html
1716323825-83.html
1716323825-355.html
1716323825-210.html
1716323825-139.html
1716323825-302.html
1716323825-247.html
1716323825-181.html
1716323825-314.html
1716323825-251.html
1716323825-178.html
1716323825-197.html
1716323825-343.html
1716323825-95.html
1716323825-206.html
1716323825