In [2]:
import os
import re
import pandas as pd
import csv
import numpy as np
import requests
from bs4 import BeautifulSoup

In [12]:
# Function to extract title from HTML
def extract_title(soup):
    return soup.title.string if soup.title else ''

# Function to extract meta description from HTML
def extract_meta_description(soup):
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    return meta_tag.get('content', '') if meta_tag else ''

# Function to extract text from specific tags in HTML
def extract_tag_content(soup, tag):
    return ' '.join([element.get_text(" ", strip=True) for element in soup.find_all(tag)])

# Function to extract date and location information from HTML
def extract_date_location(soup):
    date = ''
    location = ''
    
    # Example: Extracting date from a specific tag with a class attribute
    date_tag = soup.find('div', class_='field-hs-event-date')
    if date_tag:
        date_label = date_tag.find('div', class_='field-label')
        if date_label:
            date_label.decompose()  # Remove the label
        date = date_tag.get_text(" ", strip=True)
    
    # Example: Extracting location from a specific tag with a class attribute
    location_tag = soup.find('div', class_='field-hs-event-location')
    if location_tag:
        location_label = location_tag.find('div', class_='field-label')
        if location_label:
            location_label.decompose()  # Remove the label
        location = location_tag.get_text(" ", strip=True)
    
    return date, location

# Function to extract all useful text from HTML and return as separate content
def extract_text_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        
        title = extract_title(soup)
        meta_description = extract_meta_description(soup)
        h1_content = extract_tag_content(soup, 'h1')
        h2_content = extract_tag_content(soup, 'h2')
        h3_content = extract_tag_content(soup, 'h3')
        h4_content = extract_tag_content(soup, 'h4')
        h5_content = extract_tag_content(soup, 'h5')
        h6_content = extract_tag_content(soup, 'h6')
        
        # Remove unwanted sections by specific class or id
        unwanted_sections = soup.select(
            '#connect, #block-humsci-colorful-hs-contactus, #block-humsci-colorful-hsaffiliationfooter, \
             .field-hs-event-contact-email, .field-hs-event-contact-phone, .field-hs-event-link'
        )
        for unwanted in unwanted_sections:
            unwanted.decompose()
        
        # Extracting paragraph content without specific sections like "About Us"
        paragraphs = soup.find_all('p')
        unwanted_keywords = ["About Us", "Contact", "Make a Gift", "Subscribe to the CESTA Newsletter",
                             "Our Team", "Affiliated Faculty and Researchers", "Affiliated Graduate Students",
                             "YouTube Channel", "Facebook", "Twitter", "Instagram", "Campus Map", "Muwekma Ohlone Tribe"]
        main_paragraphs = []
        for paragraph in paragraphs:
            paragraph_text = paragraph.get_text(" ", strip=True)
            if not any(keyword in paragraph_text for keyword in unwanted_keywords):
                main_paragraphs.append(paragraph_text)
        p_content = ' '.join(main_paragraphs)
        
        date, location = extract_date_location(soup)
        
        return title, meta_description, h1_content, h2_content, h3_content, h4_content, h5_content, h6_content, p_content, date, location

# Path to the HTML file you want to test
file_path = "/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/raw_data/1706600474-15.html"

# Extract the text
title, meta_description, h1_content, h2_content, h3_content, h4_content, h5_content, h6_content, p_content, date, location = extract_text_from_html(file_path)

# Print the extracted content
print(f"Title: {title}")
print(f"Meta Description: {meta_description}")
print(f"H1 Content: {h1_content}")
print(f"H2 Content: {h2_content}")
print(f"H3 Content: {h3_content}")
print(f"H4 Content: {h4_content}")
print(f"H5 Content: {h5_content}")
print(f"H6 Content: {h6_content}")
print(f"Paragraph Content: {p_content}")
print(f"Date: {date}")
print(f"Location: {location}")


Title: CESTA Digital Humanities Research Showcase 2023 | Center for Spatial and Textual Analysis
Meta Description: Every year, CESTA's Digital Humanities Fellows Program supports graduate students developing humanities research projects that apply computational tools and methods. Attend this day-long showcase to hear directly from this year's cohort about the projects they pursued through the program. Many will present together with the undergraduate students with whom they collaborated. 
H1 Content: CESTA Digital Humanities Research Showcase 2023
H2 Content: Events About People Connect With Us Contact Us
H3 Content: 
H4 Content: 
H5 Content: 
H6 Content: 
Paragraph Content: Every year, CESTA's Digital Humanities Fellows Program supports graduate students developing humanities research projects that apply computational tools and methods. Attend this day-long showcase to hear directly from this year's cohort about the projects they pursued through the program. Many will present together

In [None]:
html_pages = "/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/raw_data"