In [None]:
import math

from statsmodels.stats.power import TTestIndPower

# Parameters for the power analysis of a two-sample (independent) t-test
alpha = 0.05  # Significance level
power = 0.8   # Desired power (usually 0.8 or 80%)
effect_size = 0.5  # Medium effect size (can be adjusted based on your expectations)

# Initialize power analysis object
analysis = TTestIndPower()

# Solve for the number of observations in group 1 (the one argument left unset)
sample_size = analysis.solve_power(effect_size=effect_size, alpha=alpha, power=power, alternative='two-sided')

# Round UP: the solver returns a fractional group size, and truncating it with
# int() would report a sample size that falls short of the requested power.
print(f"Required Sample Size per group: {math.ceil(sample_size)}")



# 1. Scrape data - Music Culture & Sports

In [None]:

import json
import re
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup


def _page_to_text(soup: "BeautifulSoup") -> str:
    """Flatten a parsed page into the labelled plain-text layout used by `scrape_data`.

    Sections are emitted in a fixed order: paragraph text first, then one
    labelled section per <table>, per non-empty <div>, per <ul>/<ol>, and
    finally one line per heading (h1-h6).
    """
    def text_of(tag) -> str:
        # A space separator keeps text from adjacent child elements apart;
        # get_text(strip=True) alone glues them ("Tickets2025 Ticket Info").
        return tag.get_text(separator=' ', strip=True)

    # Collect pieces and join once instead of growing a string with +=.
    parts = ['\n'.join(p.get_text() for p in soup.find_all('p'))]

    # Tables: cells joined by tabs, rows joined by newlines.
    for number, table in enumerate(soup.find_all('table'), start=1):
        rows = [
            '\t'.join(text_of(cell) for cell in row.find_all(['td', 'th']))
            for row in table.find_all('tr')
        ]
        parts.append('\nTable {}:\n{}'.format(number, '\n'.join(rows)))

    # Divs: numbering counts every div, including the empty ones that are skipped.
    # NOTE: find_all('div') also returns nested divs, so text inside nested
    # containers is repeated once per enclosing div.
    for number, div in enumerate(soup.find_all('div'), start=1):
        div_text = text_of(div)
        if div_text:
            parts.append(f"\nDiv {number}:\n{div_text}")

    # Ordered and unordered lists: one item per line.
    for number, list_tag in enumerate(soup.find_all(['ul', 'ol']), start=1):
        items = [text_of(li) for li in list_tag.find_all('li')]
        parts.append('\nList {}:\n{}'.format(number, '\n'.join(items)))

    # Headings: the pattern is anchored so only tags named exactly h1..h6 match.
    for heading in soup.find_all(re.compile(r'^h[1-6]$')):
        parts.append(f"\n{heading.name.upper()} : {text_of(heading)}")

    return ''.join(parts)


def scrape_data(
    urls: List[str],
    timeout: float = 10.0,
    headers: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Download each URL and return its extracted text, keyed by URL.

    Args:
        urls: Pages to fetch, processed in order.
        timeout: Seconds to wait for each request before giving up, so a
            stalled host cannot hang the whole run.
        headers: Extra HTTP headers; entries here override the default
            browser-like User-Agent.

    Returns:
        Mapping of URL -> extracted text. A URL whose download or parsing
        fails is reported with a printed message and left out of the mapping.
    """
    # Some hosts answer 403 to this scraper; a browser-like User-Agent may
    # help but is not guaranteed to — TODO confirm per site.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
    }
    if headers:
        request_headers.update(headers)

    scraped_data = {}

    for url in urls:
        try:
            response = requests.get(url, headers=request_headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            scraped_data[url] = _page_to_text(soup)
        except Exception as e:
            # Deliberately best-effort: one bad page must not abort the batch.
            print(f"Error scraping {url}: {e}")

    return scraped_data


def clean_data(text: str) -> str:
    """Normalise scraped text to a single line of ASCII with single spaces.

    Steps, in order:
      1. Collapse every run of whitespace (newlines, tabs, Unicode spaces such
         as NBSP) into one ASCII space.
      2. Delete all non-ASCII characters.
      3. Collapse the space runs that step 2 can leave behind (for example
         "a — b" would otherwise become "a  b"), then trim both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing printable remains.
    """
    # Whitespace first, so Unicode spaces become a real space instead of being
    # deleted by the non-ASCII filter and fusing neighbouring words.
    text = re.sub(r'\s+', ' ', text)
    # NOTE(review): non-ASCII characters are dropped, not transliterated
    # (accented letters and curly quotes disappear entirely).
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Removing a character that sat between two spaces leaves a double space.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()


def save_to_json(data: Dict[str, str], file_path: str):
    """Write `data` to `file_path` as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are written
    as-is rather than as \\uXXXX escapes. An existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        serialised = json.dumps(data, ensure_ascii=False, indent=4)
        out_file.write(serialised)

# Sports

In [None]:
# Sports pages to scrape. Order matters: later cells refer to entries by index.
# TODO (if needed): add the Penguins schedule and look up the Steelers' 2025 schedule.
sport_urls = [
    # VisitPittsburgh sports guide
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/',
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-steelers/',
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-pirates/',
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-penguins/',
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/college-sports/',
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/other-sports-teams/',
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/terrible-towel/',
    'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/pittsburgh-riverhounds/',
    'https://www.visitpittsburgh.com/events-festivals/annual-events/',
    # Official team sites
    'https://www.mlb.com/pirates',
    'https://www.steelers.com/',
    'https://www.nhl.com/penguins/',
    'https://www.mlb.com/pirates/schedule/2025/fullseason',
]

In [None]:
# Fetch every sports page. URLs that fail are reported by scrape_data and are
# simply absent from the returned {url: text} mapping.
raw_data = scrape_data(sport_urls)

In [None]:
# 2. Clean data

In [None]:
# Clean each scraped page while keeping it keyed by its URL (a dict rather than
# a bare list of texts, so every entry stays traceable to its source page).
cleaned_sports = {url: clean_data(text) for url, text in raw_data.items()}

In [None]:
# Show which URL sits at index 9 of sport_urls.
sport_urls[9]

'https://www.mlb.com/pirates'

In [None]:
# Manual check of the cleaned text. Problematic entries: indices 9 and 12 of sport_urls.
cleaned_sports[sport_urls[9]]

# TODO: fix the scrape function to solve entry 9.
# Entry 12 is filled in by manually pasting the schedule text.

"Episode 1: Liover Peguero Episode 2: Andrew McCutchen  Div 1: Tickets2025 Ticket Info & Schedule2025 Spring TrainingSingle Game TicketsMini PlansSeason TicketsSpecial Ticket EventsGroup TicketsFan Value DealsPittsburgh Baseball Club LevelPremium SeatingHospitality AreasMy Pirates TicketsDigital Ticketing3D Seating MapPNC Park ToursConcerts & EventsMLB Ballpark AppBuy & Sell Tickets on SeatGeekPremiumSchedule2025 Spring Training Schedule2025 Regular Season SchedulePromotions ScheduleSortable SchedulePrintable ScheduleDownloadable ScheduleBroadcast ScheduleMLB EventsScoresStatsTeam StatsTop Prospect StatsAffiliate StatsBaseball SavantRoster40-Man RosterNon-Roster InviteesDepth ChartStarting LineupsCoachesTransactionsInjury UpdatesDraft ResultsFront OfficeBroadcastersPlayer SearchMinor League AffiliatesTop 30 ProspectsVideoRun It BackPirates CharitiesPirates Game RecapMost PopularPhotosNewsLatest HeadlinesOfficial ReleasesProbable PitchersGame NotesPirates PipelineRSS News FeedMLB NewsPN

In [None]:
# Manually pasted 2025 Pirates schedule, replacing the scraped text for entry 12.
# cleaned_sports is keyed by URL, so the override must use sport_urls[12] as the
# key; the bare integer 12 would add a stray entry and leave the bad scrape in place.
cleaned_sports[sport_urls[12]] = (
    'Mar 27 Thu Marlins home opener @ Marlins 4:10 pm EDT, Mar 28 Fri @ Marlins 7:10 pm EDT, Mar 29 Sat @ Marlins 4:10 pm EDT, Mar 30 Sun @ Marlins 1:40 pm EDT, Mar 31 Mon @ Rays 7:05 pm EDT, Apr 1 Tue @ Rays 7:05 pm EDT, Apr 2 Wed @ Rays 1:10 pm EDT, Apr 4 Fri Pirates home opener vs. Yankees 4:12 pm EDT, Apr 5 Sat vs. Yankees 4:05 pm EDT, Apr 6 Sun vs. Yankees 1:35 pm EDT, Apr 7 Mon vs. Cardinals 6:40 pm EDT, Apr 8 Tue vs. Cardinals 6:40 pm EDT, Apr 9 Wed vs. Cardinals 12:35 pm EDT, Apr 11 Fri @ Reds 6:40 pm EDT, Apr 12 Sat @ Reds 6:40 pm EDT, Apr 13 Sun @ Reds 1:40 pm EDT, Apr 14 Mon vs. Nationals 6:40 pm EDT, Apr 15 Tue Jackie Robinson Day vs. Nationals 6:40 pm EDT, Apr 16 Wed vs. Nationals 6:40 pm EDT, Apr 17 Thu vs. Nationals 12:35 pm EDT Sugardale Dollar Dog Game, Apr 18 Fri vs. Guardians 6:40 pm EDT Zambelli Fireworks Night #1, Apr 19 Sat vs. Guardians 4:05 pm EDT Bobblehead Night #1 - Paul Skenes, Apr 20 Sun vs. Guardians 1:35 pm EDT, Apr 22 Tue @ Angels 9:38 pm EDT, Apr 23 Wed @ Angels 9:38 pm EDT, Apr 24 Thu @ Angels 9:29 pm EDT, Apr 25 Fri @ Dodgers 10:10 pm EDT, Apr 26 Sat @ Dodgers 9:10 pm EDT, Apr 27 Sun @ Dodgers 4:10 pm EDT, Apr 29 Tue vs. Cubs 6:40 pm EDT, Apr 30 Wed vs. Cubs 6:40 pm EDT, May 1 Thu vs. Cubs 12:35 pm EDT Sugardale Dollar Dog Game, May 2 Fri vs. Padres 6:40 pm EDT Watch on: Apple TV+, May 3 Sat vs. Padres 4:05 pm EDT Free T-Shirt Giveaway, May 4 Sun vs. Padres 1:35 pm EDT Youth Baseball & Softball Day, May 5 Mon @ Cardinals 7:45 pm EDT, May 6 Tue @ Cardinals 7:45 pm EDT, May 7 Wed @ Cardinals 1:15 pm EDT, May 9 Fri vs. Braves 6:40 pm EDT Negro League Legacy Night, May 10 Sat vs. Braves 4:05 pm EDT WWE Night, May 11 Sun vs. Braves 1:35 pm EDT Kids Jared Triolo Gold Glove Figurine, May 12 Mon @ Mets 7:10 pm EDT, May 13 Tue @ Mets 7:10 pm EDT, May 14 Wed @ Mets 7:10 pm EDT, May 16 Fri @ Phillies 6:45 pm EDT, May 17 Sat @ Phillies 6:05 pm EDT, May 18 Sun @ Phillies 1:35 pm EDT, May 19 Mon vs. '
    'Reds 6:40 pm EDT Watch on: FS1, May 20 Tue vs. Reds 6:40 pm EDT, May 21 Wed vs. Reds 12:35 pm EDT, May 22 Thu vs. Brewers 6:40 pm EDT Sugardale Dollar Dog Game, May 23 Fri vs. Brewers 6:40 pm EDT Zambelli Fireworks Night #2, May 24 Sat vs. Brewers 4:05 pm EDT Pirates Short Sleeve Hoodie, May 25 Sun vs. Brewers 1:35 pm EDT Kids Andrew McCutchen Headband, May 26 Mon @ D-backs 8:10 pm EDT, May 27 Tue @ D-backs 9:40 pm EDT, May 28 Wed @ D-backs 3:40 pm EDT, May 30 Fri @ Padres 9:40 pm EDT, May 31 Sat @ Padres 9:40 pm EDT, Jun 1 Sun @ Padres 5:10 pm EDT, Jun 3 Tue vs. Astros 6:40 pm EDT, Jun 4 Wed vs. Astros 6:40 pm EDT, Jun 5 Thu vs. Astros 6:40 pm EDT Sugardale Dollar Dog Game, Jun 6 Fri vs. Phillies 6:40 pm EDT Miller Lite Pregame Happy Hour, Jun 7 Sat vs. Phillies 4:05 pm EDT Pregame Block Party, Jun 8 Sun vs. Phillies 1:35 pm EDT Eat\'n Park Smiley Cookie Giveaway, Jun 9 Mon vs. Marlins 6:40 pm EDT Watch on: FS1, Jun 10 Tue vs. Marlins 6:40 pm EDT Youth Baseball & Softball Day, Jun 11 Wed vs. Marlins 12:35 pm EDT, Jun 12 Thu @ Cubs 8:05 pm EDT, Jun 13 Fri @ Cubs 2:20 pm EDT, Jun 14 Sat @ Cubs 2:20 pm EDT, Jun 15 Sun @ Cubs 2:20 pm EDT, Jun 17 Tue @ Tigers 6:40 pm EDT, Jun 18 Wed @ Tigers 6:40 pm EDT, Jun 19 Thu @ Tigers 1:10 pm EDT, Jun 20 Fri vs. Rangers 6:40 pm EDT Zambelli Fireworks Night #3, Jun 21 Sat vs. Rangers 4:05 pm EDT Watch on: FS1 Pirates Cap, Jun 22 Sun vs. Rangers 1:35 pm EDT Superman Day, Jun 23 Mon @ Brewers 7:40 pm EDT, Jun 24 Tue @ Brewers 7:40 pm EDT, Jun 25 Wed @ Brewers 2:10 pm EDT, Jun 27 Fri vs. Mets 6:40 pm EDT Postgame Zambelli Drone Show #1, Jun 28 Sat vs. Mets 4:05 pm EDT Free T-Shirt Giveaway, Jun 29 Sun vs. Mets 1:35 pm EDT Kids Replica Mitch Keller Jersey, Jun 30 Mon vs. Cardinals 6:40 pm EDT, Jul 1 Tue vs. Cardinals 6:40 pm EDT, Jul 2 Wed vs. '
    'Cardinals 12:35 pm EDT, Jul 4 Fri @ Mariners 4:10 pm EDT, Jul 5 Sat @ Mariners 10:10 pm EDT Watch on: FS1, Jul 6 Sun @ Mariners 4:10 pm EDT, Jul 7 Mon @ Royals 7:40 pm EDT, Jul 8 Tue @ Royals 7:40 pm EDT, Jul 9 Wed @ Royals 7:40 pm EDT, Jul 11 Fri @ Twins 8:10 pm EDT, Jul 12 Sat @ Twins 2:10 pm EDT, Jul 13 Sun @ Twins 2:10 pm EDT, Jul 18 Fri vs. White Sox 6:40 pm EDT Yinzerpalooza Weekend, Jul 19 Sat vs. White Sox 6:40 pm EDT Yinzerpalooza Weekend, Jul 20 Sun vs. White Sox 1:35 pm EDT Yinzerpalooza Weekend, Jul 21 Mon vs. Tigers 6:40 pm EDT, Jul 22 Tue vs. Tigers 6:40 pm EDT, Jul 23 Wed vs. Tigers 12:35 pm EDT, Jul 25 Fri vs. D-backs 6:40 pm EDT Free T-Shirt Giveaway, Jul 26 Sat vs. D-backs 6:40 pm EDT Postgame Zambelli Drone Show #2, Jul 27 Sun vs. D-backs 1:35 pm EDT Kids Topps Baseball Card Set, Jul 28 Mon @ Giants 9:45 pm EDT, Jul 29 Tue @ Giants 9:45 pm EDT, Jul 30 Wed @ Giants 3:45 pm EDT, Aug 1 Fri @ Rockies 8:10 pm EDT, Aug 2 Sat @ Rockies 3:10 pm EDT, Aug 3 Sun @ Rockies 3:10 pm EDT, Aug 4 Mon vs. Giants 6:40 pm EDT, Aug 5 Tue vs. Giants 6:40 pm EDT, Aug 6 Wed vs. Giants 12:35 pm EDT, Aug 7 Thu vs. Reds 6:40 pm EDT Sugardale Dollar Dog Game, Aug 8 Fri vs. Reds 6:40 pm EDT Bucco Luau Weekend, Aug 9 Sat vs. Reds 6:40 pm EDT Bucco Luau Weekend, Aug 10 Sun vs. Reds 1:35 pm EDT Bucco Luau Weekend, Aug 11 Mon @ Brewers 7:40 pm EDT, Aug 12 Tue @ Brewers 7:40 pm EDT, Aug 13 Wed @ Brewers 2:10 pm EDT, Aug 15 Fri @ Cubs 2:20 pm EDT, Aug 16 Sat @ Cubs 2:20 pm EDT Watch on: FS1, Aug 17 Sun @ Cubs 2:20 pm EDT, Aug 18 Mon vs. Blue Jays 6:40 pm EDT, Aug 19 Tue vs. Blue Jays 6:40 pm EDT, Aug 20 Wed vs. Blue Jays 12:35 pm EDT, Aug 22 Fri vs. Rockies 6:40 pm EDT Zambelli Fireworks Night #5, Aug 23 Sat vs. Rockies 6:40 pm EDT Free T-Shirt Giveaway, Aug 24 Sun vs. '
    'Rockies 12:05 pm EDT Watch on: Roku Sesame Street Day, Aug 25 Mon @ Cardinals 7:45 pm EDT, Aug 26 Tue @ Cardinals 7:45 pm EDT, Aug 27 Wed @ Cardinals 7:45 pm EDT Watch on: FS1, Aug 28 Thu @ Cardinals 2:15 pm EDT, Aug 29 Fri @ Red Sox 7:10 pm EDT, Aug 30 Sat @ Red Sox 4:10 pm EDT, Aug 31 Sun @ Red Sox 1:35 pm EDT, Sep 2 Tue vs. Dodgers 6:40 pm EDT, Sep 3 Wed vs. Dodgers 6:40 pm EDT, Sep 4 Thu vs. Dodgers 6:40 pm EDT Sugardale Dollar Dog Game, Sep 5 Fri vs. Brewers 6:40 pm EDT Free T-Shirt Giveaway, Sep 6 Sat vs. Brewers 6:40 pm EDT Bobblehead Night #3, Sep 7 Sun vs. Brewers 1:35 pm EDT, Sep 9 Tue @ Orioles 6:35 pm EDT, Sep 10 Wed @ Orioles 6:35 pm EDT, Sep 11 Thu @ Orioles 1:05 pm EDT, Sep 12 Fri @ Nationals 6:45 pm EDT, Sep 13 Sat @ Nationals 4:05 pm EDT, Sep 14 Sun @ Nationals 1:35 pm EDT, Sep 15 Mon vs. Cubs 6:40 pm EDT Clemente Day, Sep 16 Tue vs. Cubs 6:40 pm EDT, Sep 17 Wed vs. Cubs 12:35 pm EDT, Sep 19 Fri vs. Athletics 6:40 pm EDT Zambelli Fireworks Night #6, Sep 20 Sat vs. Athletics 6:40 pm EDT Fan Appreciation Weekend, Sep 21 Sun vs. Athletics 1:35 pm EDT Fan Appreciation Weekend, Sep 23 Tue @ Reds 6:40 pm EDT, Sep 24 Wed @ Reds 6:40 pm EDT, Sep 25 Thu @ Reds 12:40 pm EDT, Sep 26 Fri @ Braves 7:15 pm EDT, Sep 27 Sat @ Braves 7:15 pm EDT, Sep 28 Sun @ Braves 3:15 pm EDT'
)

In [None]:
# Persist the cleaned sports text (keyed by source URL) as JSON in the working directory.
save_to_json(cleaned_sports, "cleaned_sport.json")

# Music

In [None]:
# Music / performing-arts pages to scrape, grouped by organisation.
music_urls = [
    # Pittsburgh Symphony Orchestra: home page, calendar pages 1-6, musicians, visitor info
    'https://www.pittsburghsymphony.org/',
    'https://www.pittsburghsymphony.org/calendar',
    'https://www.pittsburghsymphony.org/calendar?page=2',
    'https://www.pittsburghsymphony.org/calendar?page=3',
    'https://www.pittsburghsymphony.org/calendar?page=4',
    'https://www.pittsburghsymphony.org/calendar?page=5',
    'https://www.pittsburghsymphony.org/calendar?page=6',
    'https://www.pittsburghsymphony.org/pso_home/web/musicians',
    'https://www.pittsburghsymphony.org/pso_home/web/visit-landing/directions-parking-lodging',
    # Pittsburgh Opera: history, monthly calendars (start/end are epoch milliseconds), tickets
    # NOTE(review): the four calendar ranges appear to cover Mar, Apr, May and
    # Jul 2025, so June looks skipped. Confirm whether that is intended.
    'https://pittsburghopera.org/about/mission-history',
    'https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1740805200000&end=1743393600000',
    'https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1743480000000&end=1746053940000',
    'https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1746072000000&end=1748732340000',
    'https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1751342400000&end=1754002740000',
    'https://pittsburghopera.org/tickets',
    # Pittsburgh Cultural Trust: about pages, calendar pages 1-6, visual arts
    'https://trustarts.org/pct_home/about',
    'https://trustarts.org/pct_home/about/cultural-district',
    'https://trustarts.org/calendar?order_by=production&genre=All%20Genres&organization_id=1&start_date=2025/03/20&end_date=2025/12/31',
    'https://trustarts.org/calendar?end_date=2025%2F12%2F31&genre=All+Genres&order_by=production&organization_id=1&page=2&start_date=2025%2F03%2F20',
    'https://trustarts.org/calendar?end_date=2025%2F12%2F31&genre=All+Genres&order_by=production&organization_id=1&page=3&start_date=2025%2F03%2F20',
    'https://trustarts.org/calendar?end_date=2025%2F12%2F31&genre=All+Genres&order_by=production&organization_id=1&page=4&start_date=2025%2F03%2F20',
    'https://trustarts.org/calendar?end_date=2025%2F12%2F31&genre=All+Genres&order_by=production&organization_id=1&page=5&start_date=2025%2F03%2F20',
    'https://trustarts.org/calendar?end_date=2025%2F12%2F31&genre=All+Genres&order_by=production&organization_id=1&page=6&start_date=2025%2F03%2F20',
    'https://trustarts.org/pct_home/visual-arts#upcoming',
]

In [None]:
# Fetch every music page. URLs that fail are reported by scrape_data and are
# absent from the returned {url: text} mapping.
raw_music = scrape_data(music_urls)

Error scraping https://www.pittsburghsymphony.org/: 403 Client Error: Forbidden for url: https://www.pittsburghsymphony.org/
Error scraping https://www.pittsburghsymphony.org/calendar: 403 Client Error: Forbidden for url: https://www.pittsburghsymphony.org/calendar
Error scraping https://www.pittsburghsymphony.org/calendar?page=2: 403 Client Error: Forbidden for url: https://www.pittsburghsymphony.org/calendar?page=2
Error scraping https://www.pittsburghsymphony.org/calendar?page=3: 403 Client Error: Forbidden for url: https://www.pittsburghsymphony.org/calendar?page=3
Error scraping https://www.pittsburghsymphony.org/calendar?page=4: 403 Client Error: Forbidden for url: https://www.pittsburghsymphony.org/calendar?page=4
Error scraping https://www.pittsburghsymphony.org/calendar?page=5: 403 Client Error: Forbidden for url: https://www.pittsburghsymphony.org/calendar?page=5
Error scraping https://www.pittsburghsymphony.org/calendar?page=6: 403 Client Error: Forbidden for url: https://ww

In [None]:
raw_music['https://trustarts.org/pct_home/visual-arts#upcoming'] = """
The Galleries at the Pittsburgh Cultural Trust present exhibitions of contemporary art by regional, national, and international artists. The Trust currently operates five distinct exhibition spaces in downtown Pittsburgh’s 14-block Cultural District including Wood Street Galleries, a historic exhibition space dedicated to new and emerging media.

The Galleries at the Pittsburgh Cultural Trust are FREE and open to the public.

CURRENT EXHIBITIONS   UPCOMING EXHIBITIONS   OUR GALLERIES   JOIN OUR EMAIL LIST
Gallery Hours
Wednesday - Sunday: 11am - 5pm
Closed 1 - 1:30pm

Current Exhibitions
Monument Eternal: Le’Andra LeSeur
Sat, Feb 1 - Sat, May 17, 2025
Wood Street Galleries
Pittsburgh Cultural Trust
Film  Free

The End That Never Was: Carnegie Mellon University 1st and 2nd Year MFA Exhibition
Sat, Feb 15 - Sun, Apr 27, 2025
SPACE Gallery
Pittsburgh Cultural Trust
Free  Visual Arts

Melike Konur: Women I've Been
Sun, Mar 9 - Sun, Jul 20, 2025
820 Gallery
Pittsburgh Cultural Trust
Visual Arts
Public Art
Since 1984, the Pittsburgh Cultural Trust has commissioned ambitious and transformative temporary and permanent public art projects throughout the downtown Pittsburgh area that engage with artists from broad disciplines.

A Sudden Gust of Wind
Fri, Apr 26 - Mon, Mar 31, 2025
Pittsburgh Cultural Trust

Thaddeus Mosley
Thu, Aug 29 - Sun, Aug 31, 2025
Pittsburgh Cultural Trust
Free  Visual Arts

Upcoming Exhibitions
Juried Visual Art Exhibition 2025
Fri, May 16 - Sun, Aug 3, 2025
SPACE Gallery
Pittsburgh Cultural Trust
Free  Visual Arts

Celebrating 45 Years of CAPA: Alumni and Faculty Exhibition
Fri, Aug 29 - Sun, Jan 25, 2026
SPACE Gallery
Pittsburgh Cultural Trust
Free  Visual Arts
"""

In [None]:
raw_music['https://trustarts.org/calendar/1'] = """
1. Event Name: Vermiglio
   Date: Fri, Mar 7 - Thu, Mar 20, 2025
   Venue: Harris Theater
   Organization: Pittsburgh Cultural Trust
   Type: Film

2. Event Name: Universal Language
   Date: Fri, Mar 14 - Thu, Mar 20, 2025
   Venue: Harris Theater
   Organization: Pittsburgh Cultural Trust
   Type: Film

3. Event Name: Byham Theater Tour
   Date: Wed, Sep 4 - Mon, Apr 28, 2025
   Venue: Byham Theater
   Organization: Pittsburgh Cultural Trust
   Type: Film

4. Event Name: Spotlight on the ‘Burgh: Whose Trick is it Anyway?
   Date: Wed, Mar 19 - Sat, Mar 29, 2025
   Venue: Liberty Magic
   Organization: Pittsburgh Cultural Trust Presents
   Type: Family

5. Event Name: Shamrock Tenors
   Date: Thu, Mar 20, 2025
   Venue: Greer Cabaret Theater
   Organization: Dentons Cohen & Grigsby TRUST PRESENTS Series
   Type: Live Music Concert

6. Event Name: On Becoming a Guinea Fowl
   Date: Fri, Mar 21 - Thu, Apr 3, 2025
   Venue: Harris Theater
   Organization: Pittsburgh Cultural Trust
   Type: Film

7. Event Name: Jeremy Piven Live
   Date: Fri, Mar 21, 2025
   Venue: Byham Theater
   Organization: Outback Presents
   Type: Comedy Talks & Poetry

8. Event Name: Jeremy Piven VIP Meet & Greet
   Date: Fri, Mar 21, 2025
   Venue: Byham Theater
   Organization: Outback Presents
   Type: Comedy

9. Event Name: fireWALL Dance: On the Run
   Date: Sat, Mar 22, 2025
   Venue: Greer Cabaret Theater
   Organization: fireWALL Dance Theater
   Type: Dance

10. Event Name: Mr. Messado's School of Magic for the Young and Young at Heart
    Date: Sun, Sep 1 - Sun, Mar 23, 2025
    Venue: Liberty Magic
    Organization: Pittsburgh Cultural Trust
    Type: Family

11. Event Name: Desi Banks: The Elevation Tour
    Date: Sun, Mar 23, 2025
    Venue: Byham Theater
    Organization: Outback Presents
    Type: Comedy

12. Event Name: Jim Henson’s Fraggle Rock: Back to the Rock LIVE!
    Date: Thu, Mar 27, 2025
    Venue: Byham Theater
    Organization: Dentons Cohen & Grigsby TRUST PRESENTS Series
    Type: Family

13. Event Name: The Yellowjackets
    Date: Thu, Mar 27, 2025
    Venue: Greer Cabaret Theater
    Organization: Pittsburgh Cultural Trust
    Type: Live Music

14. Event Name: International Art House Classics: Picnic at Hanging Rock (1975)
    Date: Thu, Mar 27 - Wed, Apr 2, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

15. Event Name: Eephus
    Date: Fri, Mar 28 - Thu, Apr 3, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

16. Event Name: Danae Hays: The First Time Tour
    Date: Fri, Mar 28, 2025
    Venue: Byham Theater
    Organization: Icon Concerts
    Type: Comedy

17. Event Name: Danae Hays VIP Package
    Date: Fri, Mar 28, 2025
    Venue: Byham Theater
    Organization: Icon Concerts
    Type: Comedy

18. Event Name: Storm Large: Inside Voice
    Date: Sat, Mar 29, 2025
    Venue: Greer Cabaret Theater
    Organization: Pittsburgh Cultural Trust
    Type: Concert

19. Event Name: Steel City Horror Show
    Date: Sun, Mar 30, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

20. Event Name: Peter Pan
    Date: Tue, Apr 1 - Sun, Apr 6, 2025
    Venue: Benedum Center
    Organization: PNC Broadway in Pittsburgh
    Type: Broadway & Musical Theater

21. Event Name: Creative Conversations —Peter Pan
    Date: Wed, Apr 2, 2025
    Venue: Trust Arts Education Center
    Organization: Pittsburgh Cultural Trust
    Type: Broadway & Musical Theater Free

22. Event Name: Malin Nilsson in Magic: Unplugged
    Date: Wed, Apr 2 - Sun, May 4, 2025
    Venue: Liberty Magic
    Organization: Pittsburgh Cultural Trust
    Type: Magic

23. Event Name: Oceans Are the Real Continents
    Date: Fri, Apr 4 - Wed, Apr 9, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

24. Event Name: Art Spiegelman: Disaster Is My Muse
    Date: Sat, Apr 5 - Wed, Apr 9, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

25. Event Name: 2025-2026 Trust Season Announcement
    Date: Mon, Apr 7, 2025
    Venue: Benedum Center
    Organization: Pittsburgh Cultural Trust
    Type: Live Music Concert

26. Event Name: The Thorn
    Date: Tue, Apr 8 - Wed, Apr 9, 2025
    Venue: Benedum Center
    Organization: Outback Presents
    Type: Drama

27. Event Name: Jerry Seinfeld Live
    Date: Thu, Apr 10, 2025
    Venue: Benedum Center
    Organization: JS Touring LLC
    Type: Comedy

28. Event Name: Showcase of Brazilian Film: Pictures of Ghosts (2023)
    Date: Fri, Apr 11, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

29. Event Name: Brit Floyd
    Date: Fri, Apr 11 - Sat, Apr 12, 2025
    Venue: Benedum Center
    Organization: WDVE Presents
    Type: Live Music Concert

30. Event Name: Showcase of Brazilian Film: Elis & Tom (2022)
    Date: Sat, Apr 12, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

31. Event Name: Stefon Harris & Blackout
    Date: Sat, Apr 12, 2025
    Venue: Greer Cabaret Theater
    Organization: Pittsburgh Cultural Trust
    Type: Live Music

32. Event Name: Showcase of Brazilian Film: Black God, White Devil (1964)
    Date: Sat, Apr 12, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

33. Event Name: Showcase of Brazilian Film: Executive Order (2021)
    Date: Sun, Apr 13, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

34. Event Name: Showcase of Brazilian Film: No More History Without Us (2025)
    Date: Sun, Apr 13, 2025
    Venue: Harris Theater
    Organization: Pittsburgh Cultural Trust
    Type: Film

35. Event Name: Some Like It Hot
    Date: Tue, Apr 15 - Sun, Apr 20, 2025
    Venue: Benedum Center
    Organization: PNC Broadway in Pittsburgh
    Type: Broadway & Musical Theater

36. Event Name: Creative Conversations — Some Like it Hot
    Date: Wed, Apr 16, 2025
    Venue: Trust Arts Education Center
    Organization: Pittsburgh Cultural Trust
    Type: Broadway & Musical Theater Free
"""

In [None]:
raw_music['https://trustarts.org/calendar/2'] = """Twyla Tharp Dance
Sat, Apr 19, 2025
Byham Theater
Pittsburgh Dance Council
Ballet & Dance

Bob Dylan: Rough and Rowdy Ways Tour
Mon, Apr 21, 2025
Benedum Center
AEG Presents
Live Music  Concert

Don't Let the Pigeon Drive the Bus! The Musical!
Sat, Apr 26, 2025
Byham Theater
Dentons Cohen & Grigsby Children's Theater Series
Family

Shen Yun
Sat, Apr 26 - Sun, Apr 27, 2025
Benedum Center
The Greater Philadelphia Falun Dafa Association
Live Music  Ballet & Dance

Benedum Theater Tour
Tue, Feb 25 - Mon, Apr 28, 2025
Benedum Center
Pittsburgh Cultural Trust

Stavros Halkias: The Dreamboat Tour
Thu, May 1, 2025
Benedum Center
Drusky Entertainment
Comedy

Come From Away
Fri, May 2 - Sun, May 4, 2025
Benedum Center
PNC Broadway in Pittsburgh
Broadway & Musical Theater

Kaiju Big Battel: Breakfast of Champions
Sat, May 3, 2025
Byham Theater
Dentons Cohen & Grisgby TRUST PRESENTS Series

Bruce Hornsby and yMusic present BrhyM
Sun, May 4, 2025
Byham Theater
Dentons Cohen & Grigsby TRUST PRESENTS Series
Live Music  Concert

Patina Miller
Mon, May 5, 2025
Greer Cabaret Theater
Pittsburgh Cultural Trust Cabaret Series
Concert

Joshua Jay in Making Magic
Wed, May 7 - Sun, Jun 1, 2025
Liberty Magic
Pittsburgh Cultural Trust

Malandain Ballet Biarritz
Wed, May 7, 2025
Byham Theater
Pittsburgh Dance Council
Ballet & Dance

Trevor Wallace: The Alpha Beta Male
Fri, May 9, 2025
Byham Theater
Live Nation
Comedy

Trevor Wallace Post Show Meet & Greet Add-on
Fri, May 9, 2025
Byham Theater
Live Nation

ARTEMIS
Sat, May 10, 2025
Greer Cabaret Theater
Pittsburgh Cultural Trust
Live Music

Herb Alpert & The Tijuana Brass & Other Delights
Mon, May 12, 2025
Byham Theater
Martin Media Presents
Live Music  Concert

360 ALLSTARS
Wed, May 14, 2025
Byham Theater
Dentons Cohen & Grigsby Children's Theater Series
Family

123 Andrés
Sat, May 17, 2025
Byham Theater
Dentons Cohen & Grigsby Children's Theater Series
Family

CANCELLED - The Lord of the Rings & The Hobbit In Concert
Sun, May 25, 2025
Benedum Center
Star Entertainment
Concert  Film

CANCELLED - The Magical Music of Harry Potter Live In Concert
Sun, May 25, 2025
Benedum Center
Star Entertainment
Concert  Film

CANCELLED - The Music of Avatar: The Last Airbender in Concert
Sun, May 25, 2025
Benedum Center
Star Entertainment
Concert  Film

Trisha Paytas: The Eras of Trish Tour
Thu, Jun 5, 2025
Benedum Center
Outback Presents
Comedy

Sarah Millican: Late Bloomer
Wed, Jun 11, 2025
Byham Theater
Live Nation Presents
Comedy

DVE Comedy Festival
Sat, Jun 21, 2025
Byham Theater
The DVE Morning Show
Comedy

Jimmy Carr: Laughs Funny
Sun, Jun 22, 2025
Byham Theater
Live Nation
Comedy

Curious Creators: STEAM Studio Camp
Mon, Jul 7, 2025
Trust Arts Education Center
Trust Arts Education
Workshops & Classes

The Screwtape Letters
Sun, Jul 13, 2025
Byham Theater
Fellowship for Performing Arts
Comedy

Bridges & Brushstrokes: STEAM Studio Camp
Mon, Jul 14, 2025
Trust Arts Education Center
Trust Arts Education
Workshops & Classes

Tech Takeover: Backstage Bootcamp for Aspiring Crew Kids
Mon, Jul 14, 2025
Trust Arts Education Center
Trust Arts Education
Workshops & Classes

Josh Johnson: The Flowers Tour
Fri, Aug 15 - Sat, Aug 16, 2025
Byham Theater
Drusky Entertainment
Comedy

Killers of Kill Tony
Sat, Oct 4, 2025
Benedum Center
Outback Presents
Comedy

The Concert: A Tribute to ABBA
Wed, Oct 8, 2025
Byham Theater
Pittsburgh Cultural Trust
Live Music  Concert

Steve Martin & Martin Short
Fri, Nov 21 - Sat, Nov 22, 2025
Benedum Center
LME and Steve Litman Presents
Comedy
"""

In [None]:
raw_music['https://trustarts.org/pct_home/about/cultural-district'] = """The Cultural District, a project of the Pittsburgh Cultural Trust, is the epicenter for Downtown Pittsburgh’s dynamic arts and entertainment scene. Packed into a 14-square-block area — stretching from the Convention Center to Stanwix Street — are world-class theaters, engaging art galleries, inspiring public parks and art installations, renowned restaurants, and diverse retail stores.

These spaces are home to a year-round schedule full of fresh and exciting events, installations, and programming. Join millions of other visitors in discovering a multitude of choices for live entertainment, contemporary music, modern dance, visual art, and thought-provoking theater, as well as classical music, opera, ballet, popular musical theater, film, and more.

No matter the occasion — be it a family outing, romantic date night, or friend-filled excursion — the Cultural District offers something for everyone to enjoy. Overwhelmed by all the choices? Don’t worry, we can help you plan the perfect Cultural District visit.

The Pittsburgh Cultural Trust isn’t the only member company of the Cultural District. Other partners include the Pittsburgh Ballet Theatre, the Pittsburgh CLO, the Pittsburgh Opera, Pittsburgh Public Theater, the Pittsburgh Symphony, and the August Wilson African American Cultural Center.

The world of arts and culture doesn’t end there, as the city’s High School for Creative and Performing Arts also calls the District home. Other organizations that regularly use Cultural District theaters and spaces include Arcarde Comedy Theater, Bricolage,  and Pittsburgh Musical Theater."""

In [None]:
raw_music['https://trustarts.org/pct_home/about'] = """
Performing Arts. Visual Arts. Festivals. Arts Education. Urban Development.
Since 1984, the Pittsburgh Cultural Trust, a non-profit arts organization, has worked to make the Steel City a place where the arts can flourish. Our efforts have focused on the cultural and economic development of the Cultural District, a 14-square-block area of downtown Pittsburgh. What was once a downtrodden red light district now thrives as a vibrant center for culture, art, food, and community. Pittsburgh’s Cultural District stands as a nationwide model for how the arts can play a pivotal role in urban revitalization.

Each year, millions of people visit the Cultural District to expand their horizons in our theaters, galleries, and public art environments. Patrons enjoy thousands of world-class performing arts events and visual arts exhibitions. Pittsburgh residents of all ages connect and learn with the Trust’s comprehensive education and community engagement opportunities. Local arts organizations collaborate to build a stronger cultural community through the power of partnerships.
The Cultural District acts as the anchor for all of that work. The Trust’s superior venues and gallery spaces allow resident companies, community organizations, artists, and promoters to reach audiences large and small. In total, the Pittsburgh Cultural Trust manages more than one million square feet of real estate in the District. Step outside those buildings to find numerous public art installations that beautify Pittsburgh's largest arts neighborhood.

Lauded as “the single greatest creative force in Pittsburgh because of its spirit of reinvention” by the Pittsburgh Post Gazette, the Trust strives every day to enrich the city of Pittsburgh’s vibrancy, diversity, and prosperity.
"""

In [None]:
raw_music['https://www.pittsburghsymphony.org/pso_home/web/visit-landing/directions-parking-lodging'] = """directions
Directions
The Port Authority website includes a lot of helpful information on using public transit to get around Pittsburgh, including rider alerts, fees, schedules and maps, park and ride lots, a trip planner and more.

Get Public Transit Information →

Heinz Hall is an easy walk from both the Gateway Center and Wood Street stations of the T light rail system. Several bus lines also stop within easy walking distance of Heinz Hall.

Get Directions to Heinz Hall →

Parking
Heinz Hall is located within walking distance of more than 6,500 parking spaces in the Cultural District!  Season ticket holders have the first opportunity to purchase pre-paid guaranteed parking in the 6th and Penn garage across the street from Heinz Hall for just $18 per concert.

ParkPGH offers real time parking updates on available parking spots in downtown Pittsburgh parking garages.

Visit ParkPGH →
Get Parking information →

Lodging
fairmont
Fairmont Pittsburgh
Official Hotel

Website: www.fairmont.com/pittsburgh
510 Market Street, Pittsburgh
412.773.8800

Located at the heart of Pittsburgh's business, cultural and dining hub, Fairmont Pittsburgh offers superb and distinctive guest services and accommodations in a luxury setting.

The Pittsburgh Symphony Orchestra is proud to partner with Fairmont Pittsburgh as it provides luxury accommodations for PSO guest artists and conductors, including Music Director Manfred Honeck.

 elite
Elite Coach Transportation
Official Transportation Service

Please consider using Elite Coach Transportation for any coach bus, car, limousine, or van services. (800) 488-7775."""

In [None]:
raw_music['https://www.pittsburghsymphony.org/pso_home/web/musicians'] = """Musicians
First Violin
David McCarroll | Concertmaster | Rachel Mellon Walton Chair
Justine Campagna | Associate Concertmaster | Beverlynn & Steven Elliott Chair
Dylan Naroff | Assistant Concertmaster
Marta Krechkovsky | Assistant Concertmaster | Michael F. Butler Memorial Chair
Kelsey Blumenthal
Ellen Chen-Livingston | Selma Wiener Berkman Memorial Chair
Irene Cheng | Dr. & Mrs. William E. Rinehart Chair
Sarah Clendenning | Lois R. Brozenick Memorial Chair
Alison Peters Fujito
Ilkhom Mukhiddinov
Jennifer Orchard
Susanne Park | Dr. Alan & Marsha Bramowitz Chair
Kristina Yoder
Shannon Fitzhenry | 24-25 Season Musician

Second Violin
Jeremy Black | Principal | G. Christian Lantzsch & Duquesne Light Company Chair
Louis Lev | Associate Principal |The Morrison Family Chair
Dennis O'Boyle | Assistant Principal
Laura Motchalov | William & Sarah Galbraith Chair
Andrew Fuller
Lorien Benet Hart | Arlyn Gilboa Chair
Yeokyung Kim
Boxianzi Vivian Ling
Claudia Mahave | Alice Victoria Gelormino Chair
Cecee Pantikian
Regi Papa
Carolyn Semes
Yingchen Zhang

Viola
Tatjana Mead Chamis | Acting Principal | Jon & Carol Walton Associate Principal Viola Chair
Joen Vasquez | Acting Associate Principal
Marylène Gingras-Roy* | Acting Assistant Principal
Laura Fuller
Sean Juhl
Erina Laraby-Goldwasser
Aaron Mossburg
Stephanie Tretick
Andrew Wickesberg | Mr. and Mrs. Martin G. McGuinn Chair
Rimbo Wong | 24-25 Season Musician
Si Yu | 24-25 Season Musician


Cello
Anne Martindale Williams | Principal | Pittsburgh Symphony Association Chair
Dale Jeong | Associate Principal | Donald I. & Janet Moritz and Equitable Resources, Inc. Chair
Adam Liu | Assistant Principal | George & Eileen Dorman Chair
Mikhail Istomin | Susan Candace Hunt Chair
Bronwyn Banerdt | Sissons & Snapp Family Chair
Michael DeBruyn | Jane & Rae Burton Chair
Alexandra Lee | William Block Memorial Chair
Yun-Ya Lo
Charlie Powers | HaleyFesq Cello Chair
Karissa Shivone

Bass
Nicholas Myers | Principal
Brandon McLean | Associate Principal
Joseph Campagna
Jeffrey Grubbs | Michael & Carol Bleier Chair
Peter Guild
Micah Howard | Stephen & Kimberly Keen Chair
John Moore
Aaron White
Drew Collins | Paul J. Ross Fellow

Harp
Gretchen Van Hoesen | Principal | Virginia Campbell Chair

Flute
Lorna McGhee* | Principal | Jackman Pfouts Flute Chair
Jennifer Steele | Hilda M. Willis Foundation Chair
Yevgeny Faniuk | 24-25 Season Musician | Acting Associate Principal

Piccolo
Rhian Kenny | Principal | Frank & Loti Gaffney Chair

Oboe
Cynthia Koledo DeAlmeida | Principal | Dr. William Larimer Mellon, Jr. Chair
Max Blair | Associate Principal
Samuel Nemec

English Horn
Ian Woodworth | 24-25 Season Musician

Clarinet
Michael Rusinek | Principal | Mr. & Mrs. Aaron Silberman Chair
Victoria Luperi | Associate Principal
Ron Samuels | Sidney Stark, Jr. Memorial Chair

Eb Clarinet
Victoria Luperi | Principal

Bass Clarinet
Jack Howell | Principal | Mr. and Mrs. Willard J. Tillotson, Jr. Chair

Bassoon
David Sogg | Acting Principal
Philip A. Pandolfi
Carlos Clark | Paul J. Ross Fellow

Contrabassoon
James Rodgers | Principal

Horn
William Caballero | Principal | Anonymous Donor Chair
Stephen Kostyniak | Associate Principal | The Hotopp Family Chair
Zachary Smith | Assistant Principal
Michelle Hembree
Mark Houghton
Robert Lauver
Landon Young | Paul J. Ross Fellow

Trumpet
Micah Wilkinson | Principal | Martha Brooks Robinson Principal Trumpet Chair
Conrad Jones | Associate Principal
Neal Berntsen
Chad Winkler | Susan S. Greer Memorial Chair
Joshua Carr | Paul J. Ross Fellow

Trombone
Peter Sullivan | Principal | Tom & Jamee Todd Chair
Douglas F. Rosenthal | Associate Principal
James Nova | Ann McGuinn Trombone Chair

Bass Trombone
Jeffrey Dee | Principal | William & Jacqueline Herbein Principal Bass Trombone Chair

Tuba
Craig Knox | Principal | Dr. Mary Ann Craig Chair

Timpani
James Benoit | Principal | Barbara Weldon Chair
Christopher Allen | Associate Principal

Percussion
Jeremy Branson | Acting Principal
Christopher Allen
Shawn Galvin | 24-25 Season Musician | Acting Associate Principal

Keyboard
Rodrigo Ojeda | Mr. & Mrs. Benjamin F. Jones III Guest Keyboard Chair

Librarians
Lisa Gedris | Principal | Jean & Sigo Falk Chair
Sheryl Hadeka | Assistant Librarian | Anonymous Fund of The Pittsburgh Foundation Chair

* On Leave"""

In [None]:
raw_music['https://www.pittsburghsymphony.org/calendar'] = """
1. Event Name: Lift Every Voice
   Date: Sat, Mar 15, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

2. Event Name: Lang Lang with the PSO
   Date: Wed, Mar 19, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Concert

3. Event Name: Kanneh-Mason Performs Shostakovich
   Date: Fri, Mar 21 - Sun, Mar 23, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

4. Event Name: PSO360: Soul of the Cello
   Date: Sat, Mar 22, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

5. Event Name: Speakers Series: Chris Wallace
   Date: Wed, Mar 26, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Talks & Poetry

6. Event Name: Beethoven’s Pastoral
   Date: Fri, Mar 28 - Sun, Mar 30, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

7. Event Name: Student Side-by-Side
   Date: Wed, Apr 2, 2025
   Venue: See Event Description
   Organization: Pittsburgh Symphony Orchestra
   Type: Discovery & Drinks: Music & Cinema 2

8. Event Name: Total Eclipse of the Chart: Music of the 80s
   Date: Thu, Apr 3, 2025
   Venue: See Event Description
   Organization: Pittsburgh Symphony Orchestra
   Type: Discovery & Drinks: Music & Cinema 2

9. Event Name: Fiddlesticks: Imagine That!
   Date: Fri, Apr 4 - Sun, Apr 6, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Live Music Concert

10. Event Name: Boy Band Symphony
    Date: Sat, Apr 5, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Live Music Concert Family

11. Event Name: Speakers Series: Andrew Lloyd Webber
    Date: Mon, Apr 7, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Live Music Concert

12. Event Name: Speakers Series: Andrew Lloyd Webber
    Date: Wed, Apr 9, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Talks & Poetry
"""


In [None]:
raw_music['https://www.pittsburghsymphony.org/calendar?end_date=2017%2F12%2F12&order_by=production&page=2'] =  """
1. Event Name: PSO Disrupt: Lovestruck
   Date: Thu, Apr 10, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

2. Event Name: Gerstein Plays Tchaikovsky
   Date: Fri, Apr 11 - Sun, Apr 13, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

3. Event Name: Maestro's Wine Dinner
   Date: Wed, Apr 23, 2025
   Venue: See Event Description
   Organization: Pittsburgh Symphony Orchestra
   Type: Beethoven and Brahms

4. Event Name: Beethoven and Brahms
   Date: Fri, Apr 25 - Sun, Apr 27, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

5. Event Name: Music of Star Wars
   Date: Sat, May 3, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Live Music Concert

6. Event Name: Clouds in my Coffee: Music of Joni Mitchell, Carole King & Carly Simon
   Date: Fri, May 9 - Sun, May 11, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Live Music Concert

7. Event Name: Bronfman Plays Beethoven
   Date: Fri, May 16 - Sun, May 18, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

8. Event Name: Brahms’ Fourth Symphony
   Date: Fri, May 30 - Sun, Jun 1, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

9. Event Name: Yo-Yo Ma
   Date: Wed, Jun 4, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

10. Event Name: Beethoven and Mahler
    Date: Fri, Jun 6 - Sun, Jun 8, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

11. Event Name: PSO360: Alice Sara Ott, piano
    Date: Sat, Jun 7, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

12. Event Name: Rana Plays Mendelssohn
    Date: Fri, Jun 13 - Sun, Jun 15, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music
"""


In [None]:
raw_music['https://www.pittsburghsymphony.org/calendar?end_date=2025%2F12%2F31&filter%5Bcurrent_page%5D=production&filter%5Bmax%5D=2026-09-12+14%3A22%3A02+-0400&filter%5Bmin%5D=2025-03-12T14%3A22%3A02-04%3A00&genre=All+Genres&organization_id=2&page=2&start_date=2025%2F03%2F20'] = """
1. Event Name: Rana Plays Mendelssohn
   Date: Fri, Jun 13 - Sun, Jun 15, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Classical Live Music

2. Event Name: Dolly Parton's Threads: My Songs in Symphony
   Date: Wed, Jun 18, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Live Music Concert

3. Event Name: Adam Ray is Dr. Phil
   Date: Thu, Jun 19, 2025
   Venue: Heinz Hall
   Organization: Drusky Entertainment
   Type: Comedy

4. Event Name: Kings of Soul
   Date: Fri, Jun 20 - Sun, Jun 22, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Live Music Concert

5. Event Name: The Lord of the Rings: The Fellowship of the Ring
   Date: Fri, Jun 27 - Sun, Jun 29, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Live Music Film

6. Event Name: Diana Krall
   Date: Wed, Jul 2, 2025
   Venue: Heinz Hall
   Organization: National Artists / Heinz Hall Presents
   Type: Live Music Concert

7. Event Name: Vince Gill
   Date: Sat, Jul 12, 2025
   Venue: Heinz Hall
   Organization: Outback Presents
   Type: Live Music Concert

8. Event Name: Marvel Infinity Saga
   Date: Sat, Aug 2 - Sun, Aug 3, 2025
   Venue: Heinz Hall
   Organization: Pittsburgh Symphony Orchestra
   Type: Live Music Concert

9. Event Name: Louis C.K.
   Date: Sat, Aug 23, 2025
   Venue: Heinz Hall
   Organization: Upfront Inc. Presents
   Type: Comedy

10. Event Name: Opening Night Gala
    Date: Sat, Sep 20, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Live Music Concert

11. Event Name: Dvořák's Eighth
    Date: Fri, Sep 26 - Sun, Sep 28, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

12. Event Name: Symphonie Fantastique
    Date: Fri, Oct 10 - Sun, Oct 12, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

13. Event Name: Sibelius' Finlandia
    Date: Fri, Oct 17 - Sun, Oct 19, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

14. Event Name: Disrupt 1
    Date: Sat, Oct 18, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

15. Event Name: Grimaud Plays Gershwin
    Date: Fri, Nov 7 - Sun, Nov 9, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

16. Event Name: Ferrández Plays Saint-Saëns
    Date: Fri, Nov 14 - Sun, Nov 16, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

17. Event Name: PSO360: Pablo Ferrández
    Date: Sat, Nov 15, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

18. Event Name: Himari Plays Bruch
    Date: Fri, Nov 28 - Sun, Nov 30, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

19. Event Name: Carnegie Hall Preview
    Date: Tue, Dec 2, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

20. Event Name: Shostakovich's Fifth
    Date: Fri, Dec 5 - Sun, Dec 7, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

21. Event Name: Messiah
    Date: Tue, Dec 9, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

22. Event Name: Sensory Friendly Concert: Holiday Pops
    Date: Sat, Dec 13, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music

23. Event Name: Holiday Brass Spectacular
    Date: Tue, Dec 16, 2025
    Venue: Heinz Hall
    Organization: Pittsburgh Symphony Orchestra
    Type: Classical Live Music
"""


In [None]:
# Literal text for the Pittsburgh Opera March 2025 calendar page, stored under the
# page URL so it is cleaned and saved together with the other raw_music entries.
# NOTE: the Tuesday, March 25 MADAMA BUTTERFLY time was entered as "07:00" with no
# AM/PM marker; it is "07:00 PM" here, matching the 06:00 PM Pre-Opera Talk that
# precedes it and the format of every other entry.
raw_music['https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1740805200000&end=1743393600000'] = """
Madama Butterfly Preview on WQED-FM 89.3 and wqed.org/fm
Friday, March 21, 2025
07:00 PM
WQED-FM 89.3 and wqed.org/fm

Pre-Opera Talk
Saturday, March 22, 2025
07:00 PM
Benedum Center

LGBTQ+/Ally Night
Saturday, March 22, 2025
07:00 PM
Benedum Center

MADAMA BUTTERFLY
Saturday, March 22, 2025
08:00 PM

Pre-Opera Talk
Tuesday, March 25, 2025
06:00 PM
Benedum Center

MADAMA BUTTERFLY
Tuesday, March 25, 2025
07:00 PM

Meet the Artists - Madama Butterfly
Tuesday, March 25, 2025
10:00 PM
Benedum Center
View Details

MADAMA BUTTERFLY - STUDENT MATINEE
Thursday, March 27, 2025
10:15 AM - 01:00 PM
Benedum Center
View Details

Pre-Opera Talk
Friday, March 28, 2025
06:30 PM
Benedum Center
View Details

MADAMA BUTTERFLY
Friday, March 28, 2025
07:30 PM
View Details

Pre-Opera Talk
Sunday, March 30, 2025
01:00 PM
Benedum Center
View Details

MADAMA BUTTERFLY
Sunday, March 30, 2025
02:00 PM
"""

In [None]:
raw_music['https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1743480000000&end=1746053940000'] = """April Song Shop
Saturday, April 12, 2025

12:00 PM - 01:00 PM

Bitz Opera Factory

View Details

Woman With Eyes Closed Preview on WQED-FM 89.3 and wqed.org/fm
Saturday, April 19, 2025

12:30 PM

WQED-FM 89.3 and wqed.org/fm

View Details

Woman With Eyes Closed Preview on WQED-FM 89.3 and wqed.org/fm
Friday, April 25, 2025

07:00 PM

WQED-FM 89.3 and wqed.org/fm

View Details

WOMAN WITH EYES CLOSED
Saturday, April 26, 2025

08:00 PM

View Details

WOMAN WITH EYES CLOSED
Tuesday, April 29, 2025

07:00 PM

View Details

Meet the Artists - Woman with Eyes Close
Tuesday, April 29, 2025

09:00 PM

Bitz Opera Factory

View Details"""

In [None]:
# Inspect what is currently stored for this calendar URL before the next cell
# overwrites it; the output shows repeated "MADAMA BUTTERFLY" promo text rather
# than event listings.
raw_music['https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1746072000000&end=1748732340000']

'MADAMA BUTTERFLYIn this groundbreaking new production created by an all Japanese and Japanese American creative team, Madama Butterfly’s story is transported to a fantastical realm where reality and dreams intersect.\nBENEDUM CENTERMAR 22-30\nMADAMA BUTTERFLYIn this groundbreaking new production created by an all Japanese and Japanese American creative team, Madama Butterfly’s story is transported to a fantastical realm where reality and dreams intersect.\nBENEDUM CENTERMAR 22-30\nMADAMA BUTTERFLYIn this groundbreaking new production created by an all Japanese and Japanese American creative team, Madama Butterfly’s story is transported to a fantastical realm where reality and dreams intersect.\nBENEDUM CENTERMAR 22-30\nMADAMA BUTTERFLYIn this groundbreaking new production created by an all Japanese and Japanese American creative team, Madama Butterfly’s story is transported to a fantastical realm where reality and dreams intersect.\nBENEDUM CENTERMAR 22-30\nMADAMA BUTTERFLYIn this gro

In [None]:
raw_music['https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1746072000000&end=1748732340000'] = """
WOMAN WITH EYES CLOSED
Friday, May 2, 2025

07:30 PM

View Details

WOMAN WITH EYES CLOSED
Saturday, May 3, 2025

02:00 PM

Bitz Opera Factory

View Details

WOMAN WITH EYES CLOSED
Sunday, May 4, 2025

04:00 PM

View Details

Maecenas XL
Saturday, May 17, 2025

07:00 PM - 10:00 PM

Monterey Bay Fish Grotto

View Details"""

In [None]:
raw_music['https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1751342400000&end=1754002740000'] = """
July 1, 2025 to July 31, 2025
Bravo Academy Session 1
Monday, July 7, 2025 to Friday, July 11, 2025

09:00 AM - 04:00 PM

Bitz Opera Factory

View Details

Bravo Academy Session 2
Monday, July 14, 2025 to Friday, July 18, 2025

09:00 AM - 04:00 PM

Bitz Opera Factory

View Details"""

In [None]:
raw_music['https://pittsburghopera.org/tickets'] = """

Subscribe Today
Opus Pass
Group Tickets
Promotions and Discounts
Student Tickets
Give the Gift of Opera
Ticketing Info & Policies
Accessibility
Opera FAQs
Free Rideshare Vouchers
Free Childcare Services
Close

At Pittsburgh Opera, we offer something for everyone.
Tickets to Pittsburgh Opera’s fantastic season are available in a variety of ways:

Season Tickets (aka subscriptions): You may renew your season tickets or purchase a new subscription online 24/7 or by calling 412-281-0912 ext. 3 between 9AM-5PM Monday through Friday.
If you want unlimited opera at one low monthly price, check out our Opus Pass.
Single Tickets on sale now for the current 2024-25 Season!
Opera is more fun with friends - learn about the discounts available with our Group Sales
Student Tickets provide an affordable way for students to enjoy world-class opera performances
Click here for information about accessibility and wheelchair accommodations at the Bitz Opera Factory, or call 412-281-0912."""

In [None]:
# Run clean_data over every music page, keeping each page's URL as its key.
cleaned_music = dict(
    (page_url, clean_data(page_text)) for page_url, page_text in raw_music.items()
)

In [None]:
# Snapshot the page URLs (the keys of raw_music) as a list so individual pages
# can be looked up by position in the spot-check cells.
music_urls = list(raw_music)

In [None]:
# Count the collected music pages; the spot-check cells index into this list by position.
len(music_urls)


15

In [None]:
# Spot-check the raw text of one page by position; with the 15 URLs counted above,
# index 14 is the last entry.
raw_music[music_urls[14]]

"\nThe Galleries at the Pittsburgh Cultural Trust present exhibitions of contemporary art by regional, national, and international artists. The Trust currently operates five distinct exhibition spaces in downtown Pittsburgh’s 14-block Cultural District including Wood Street Galleries, a historic exhibition space dedicated to new and emerging media.\n\nThe Galleries at the Pittsburgh Cultural Trust are FREE and open to the public.\n\nCURRENT EXHIBITIONS   UPCOMING EXHIBITIONS   OUR GALLERIES   JOIN OUR EMAIL LIST\nGallery Hours\nWednesday - Sunday: 11am - 5pm\nClosed 1 - 1:30pm\n\nCurrent Exhibitions\nMonument Eternal: Le’Andra LeSeur\nSat, Feb 1 - Sat, May 17, 2025\nWood Street Galleries\nPittsburgh Cultural Trust\nFilm  Free\n\nThe End That Never Was: Carnegie Mellon University 1st and 2nd Year MFA Exhibition\nSat, Feb 15 - Sun, Apr 27, 2025\nSPACE Gallery\nPittsburgh Cultural Trust\nFree  Visual Arts\n\nMelike Konur: Women I've Been\nSun, Mar 9 - Sun, Jul 20, 2025\n820 Gallery\nPitts

In [None]:
# Spot-check the cleaned text of one page; the output shows the page's line breaks
# collapsed into single spaces.
cleaned_music[music_urls[2]]

'April Song Shop Saturday, April 12, 2025 12:00 PM - 01:00 PM Bitz Opera Factory View Details Woman With Eyes Closed Preview on WQED-FM 89.3 and wqed.org/fm Saturday, April 19, 2025 12:30 PM WQED-FM 89.3 and wqed.org/fm View Details Woman With Eyes Closed Preview on WQED-FM 89.3 and wqed.org/fm Friday, April 25, 2025 07:00 PM WQED-FM 89.3 and wqed.org/fm View Details WOMAN WITH EYES CLOSED Saturday, April 26, 2025 08:00 PM View Details WOMAN WITH EYES CLOSED Tuesday, April 29, 2025 07:00 PM View Details Meet the Artists - Woman with Eyes Close Tuesday, April 29, 2025 09:00 PM Bitz Opera Factory View Details'

In [None]:
# Persist the cleaned music pages. save_to_json is defined outside this section;
# presumably it writes the dict to cleaned_music.json -- confirm against its definition.
save_to_json(cleaned_music, "cleaned_music.json")

# Museum

In [None]:
# Pages to scrape for the museum section, one URL per line, grouped by site.
# Order matters only in that it is the order the pages are requested.
museum_urls = [
    # carnegiemuseums.org
    "https://carnegiemuseums.org/",
    "https://carnegiemuseums.org/things-to-do/explore-our-collections/",
    "https://carnegiemuseums.org/events/",
    "https://carnegiemuseums.org/events/page/2/",
    "https://carnegiemuseums.org/events/page/3/",
    "https://carnegiemuseums.org/plan-a-visit/",
    "https://carnegiemuseums.org/about-us/our-history/",
    "https://carnegiemuseums.org/about-us/our-museums/",
    # heinzhistorycenter.org (default events listing plus the 2025-04..2025-06 month filters)
    "https://www.heinzhistorycenter.org/",
    "https://www.heinzhistorycenter.org/events/",
    "https://www.heinzhistorycenter.org/events/?query=&filters%5Bcategory%5D=&filters%5Blocation%5D=&filters%5Bmonth%5D=2025-04#filter_results",
    "https://www.heinzhistorycenter.org/events/?query=&filters%5Bcategory%5D=&filters%5Blocation%5D=&filters%5Bmonth%5D=2025-05#filter_results",
    "https://www.heinzhistorycenter.org/events/?query=&filters%5Bcategory%5D=&filters%5Blocation%5D=&filters%5Bmonth%5D=2025-06#filter_results",
    "https://www.heinzhistorycenter.org/whats-on/exhibits/",
    "https://www.heinzhistorycenter.org/about/",
    # thefrickpittsburgh.org
    "https://www.thefrickpittsburgh.org/",
    "https://www.thefrickpittsburgh.org/exhibitions",
    "https://www.thefrickpittsburgh.org/plan-your-visit#visit",
]

In [None]:
# Fetch every museum page with scrape_data. Per the output below, the
# carnegiemuseums.org URLs were rejected with "403 Client Error: Forbidden".
raw_museum = scrape_data(museum_urls)

Error scraping https://carnegiemuseums.org/: 403 Client Error: Forbidden for url: https://carnegiemuseums.org/
Error scraping https://carnegiemuseums.org/things-to-do/explore-our-collections/: 403 Client Error: Forbidden for url: https://carnegiemuseums.org/things-to-do/explore-our-collections/
Error scraping https://carnegiemuseums.org/events/: 403 Client Error: Forbidden for url: https://carnegiemuseums.org/events/
Error scraping https://carnegiemuseums.org/events/page/2/: 403 Client Error: Forbidden for url: https://carnegiemuseums.org/events/page/2/
Error scraping https://carnegiemuseums.org/events/page/3/: 403 Client Error: Forbidden for url: https://carnegiemuseums.org/events/page/3/
Error scraping https://carnegiemuseums.org/plan-a-visit/: 403 Client Error: Forbidden for url: https://carnegiemuseums.org/plan-a-visit/
Error scraping https://carnegiemuseums.org/about-us/our-history/: 403 Client Error: Forbidden for url: https://carnegiemuseums.org/about-us/our-history/
Error scrap

In [None]:
raw_museum['https://www.heinzhistorycenter.org/about/'] = """
The Senator John Heinz History Center is Pennsylvania’s largest history museum and a proud affiliate of the Smithsonian Institution. Devoted to the history and heritage of Western Pennsylvania, our family of museums includes the Heinz History Center, Western Pennsylvania Sports Museum, Fort Pitt Museum, and Meadowcroft Rockshelter and Historic Village. The History Center is home to the Thomas & Katherine Detre Library & Archives.

Our mission is to engage and inspire large and diverse audiences through programs that enable links to the past, understanding in the present, and guidance for the future by preserving regional history and presenting the American experience with a Western Pennsylvania connection.

This work is accomplished in partnership with others through archaeology, archives, artifact collections, broadcast and electronic media, civic engagement, conservation, educational programs, exhibitions, events, library, museums, public outreach, performance, preservation, publications, products, research, technical assistance, and virtual programs.

Our Story
The Senator John Heinz History Center traces its roots back to 1879, making it the oldest cultural institution in Western Pennsylvania.

In 1879, the Old Residents of Pittsburgh and Western Pennsylvania established a historical society to help preserve local history. Five years later, the name changed to the Historical Society of Western Pennsylvania and has been in continuous existence for more than 135 years.

Originally, membership in the historical society was limited to men who had lived in the region for 50 years or more, though the rules changed within a few years to include women and younger people. In those days, members enjoyed lectures and country outings and got together to reminisce. Perhaps most importantly, early Historical Society members worked to preserve archival materials and objects of historic significance, forming the foundation of the History Center’s collections.

The Historical Society brought our region’s history to the public. In 1908, it celebrated the region’s 150th anniversary; in 1911, the centennial of steamboat navigation; in 1958, the region’s bicentennial celebration; and in 2008, the region’s 250th anniversary. These events served as the basis for the many events, publications, educational programs, and exhibitions that the History Center offers today.

Early meetings of the Historical Society were held in members’ homes and churches, though a significant step was taken in 1893 when the Carnegie Library of Pittsburgh at Schenley Park offered space for the archives. In 1914, after securing the funding, the Historical Society built its own building on Bigelow Blvd. In 1996, the History Center moved into its current home in Pittsburgh’s Strip District.

With the opening of the Smithsonian wing in 2004, the History Center became the largest history museum in Pennsylvania. The new wing allows better opportunities found in our affiliation with the Smithsonian Institution. The additional space added the Western Pennsylvania Sports Museum, the Mueller Education Center, the Special Collections Gallery, and the McGuinn Gallery for traveling exhibitions.

Since its opening, the Smithsonian wing has been home to various exhibitions, including Pennsylvania’s Civil War, Vatican Splendors, 1968: The Year that Rocked America, Pittsburgh’s Lost Steamboat: Treasures of the Arabia, We Can Do It! WWII, Toys of the ’50s, ’60s and ’70s, #Pixburgh: A Photographic Experience, Destination Moon: The Apollo 11 Mission.

In 2014, the History Center opened the new Museum Conservation Center, located directly behind the museum on Penn Avenue. The nine-story building houses the museum’s artifacts under one roof with Smithsonian-quality storage.

Children playing on the trolley in the Great Hall of the Heinz History Center.
Heinz History Center
As Pittsburgh’s “people museum,” the History Center preserves and interprets the history of Western Pennsylvanians through six floors of interactive exhibitions that feature iconic artifacts like the TV set from “Mister Rogers’ Neighborhood” and the world’s oldest jeep.

Plan your visit
A statue of a football player in the Western Pennsylvania Sports Museum.
Western Pennsylvania Sports Museum
The Western Pennsylvania Sports Museum, located on the History Center’s second and third floors, celebrates the unsurpassed sports legacy of the City of Champions. From football to baseball and hockey to golf, the Sports Museum highlights the region’s passion for amateur and professional sports.

Plan your visit

Fort Pitt Museum
The Fort Pitt Museum, located in historic Point State Park in Downtown Pittsburgh, is a two-floor, 12,000-square-foot museum that presents the story of Western Pennsylvania’s essential role during the French & Indian War, the American Revolution, and as the birthplace of Pittsburgh.

Plan your visit

Meadowcroft Rockshelter and Historic Village
Meadowcroft Rockshelter and Historic Village, the oldest site of human habitation in North America, is located in Avella, Washington County, Pa. Meadowcroft Rockshelter, a National Historic Landmark, features 19,000-year-old evidence of the region’s earliest inhabitants under a massive rock overhang. In addition to the Rockshelter, the site is also home to three outdoor historic areas, including a 16th century Indian village, 18th century Frontier Trading Post, and 19th century village that help visitors experience life over the past 500 years.

Plan your visit
A person reads a book at a desk in the library.
Detre Library & Archives
More than 250 years of our region’s history can be found at the Thomas and Katherine Detre Library & Archives. Founded in 1879, the Library & Archives’ collections are accessible to researchers, students, and the general public.

Plan your visit
Financial Information
The History Center recently earned the coveted 4-star rating from Charity Navigator for sound fiscal management and commitment to accountability and transparency, so you can be confident that your donation is going to a great place.

"""

In [None]:
raw_museum['https://www.heinzhistorycenter.org/whats-on/exhibits/'] = """

Heinz History Center
Explore exhibits that cover over 250 years of Pittsburgh history at the History Center.

Browse Exhibits

Western Pennsylvania Sports Museum
Learn how Pittsburgh became the City of Champions through exhibits at the Sports Museum.

Browse Exhibits

Fort Pitt Museum
Discover the world-shaping events that occurred in Western Pennsylvania through exhibits at the Fort Pitt Museum.

Browse Exhibits

Meadowcroft Rockshelter & Historic Village
Check out 19,000 years of Western Pennsylvania history through exhibits at Meadowcroft Rockshelter and Historic Village.

Browse Exhibits

Past Exhibits
Learn more about the History Center’s past exhibits, like We Can Do It! WWII, Destination Moon, 1968, and more.

Browse Exhibits"""

In [None]:
raw_museum['https://www.heinzhistorycenter.org/events/'] = """March 12
Graphic that reads “Gut Yontif: A Patchwork Holiday Experience.”
Heinz History Center – 6:30 PM
Gut Yontif: A Patchwork Holiday Experience
The Heinz History Center’s Rauh Jewish History Program & Archives will celebrate the Jewish holiday of Purim with a one-night only, participatory experience that blends hand-crafted art and tradition.

Tickets Required

March 15

Heinz History Center – 7:30 AM
National History Day Pittsburgh
Join the History Center for the region’s National History Day competition.

March 16

Heinz History Center – 7:30 AM
National History Day Pittsburgh
Join the History Center for the region’s National History Day competition.

March 22

Heinz History Center – 11:30 AM
American Girlhood: A Window into History
Celebrate the joy of girlhood during a multigenerational celebration of storytelling and history.

Tickets Required

March 29

Heinz History Center – 2:00 PM
Vietnam Veterans Day 50th Anniversary Commemoration
Join the Veterans Breakfast Club on National Vietnam War Veterans Day.

Tickets Required

April 5

Heinz History Center – 10:00 AM
Vintage Pittsburgh
Old is new again at the Heinz History Center’s 11th annual Vintage Pittsburgh retro fair!

April 12

Fort Pitt Museum – 1:00 PM
Speaker Saturday: “The Surveyor and the Silversmith”
Join Fort Pitt Museum for an afternoon with author C. Prentiss Orr.

April 17

Heinz History Center – 11:00 AM
USCIS Naturalization Ceremony
Join us for a Citizenship Ceremony.

April 28
colorful graphic with the text: Educator Open House
Heinz History Center – 5:30 PM
Spring Educator Open House
Explore the museum and learn about History Center teacher resources.

Tickets Required

May 4

Heinz History Center – 7:00 PM
Kabbalah and the Rupture of Modernity: Book Launch
A panel of scholars will discuss Dr. Eli Rubin’s book.

Tickets Required

June 5

Heinz History Center – 6:30 PM
32nd Annual History Makers Award Dinner
Save the date!

Tickets Required
"""

In [None]:
raw_museum['https://www.heinzhistorycenter.org/'] = """Get to Know Our Family of Museums

Heinz History Center
10 AM - 5 PM


Western Pennsylvania Sports Museum
10 AM - 5 PM


Fort Pitt Museum
10 AM - 5 PM


Meadowcroft Rockshelter and Historic Village
REOPENS MAY 2025


Kids Receive Free Admission
Heinz History Center

Through March 31, kids can visit the History Center and Fort Pitt Museum free of charge, thanks to community partners UPMC and UPMC Health Plan!

Plan YOur Visit

American Girlhood: A Window into History
Event

 Heinz History Center

Celebrate the joy of girlhood during a multigenerational celebration of storytelling and history on Mar. 22!

Learn More
What's On: Events
Search our calendar of upcoming events hosted by our family of museums!

View All Events
March 15

Heinz History Center – 7:30 AM
National History Day Pittsburgh
Join the History Center for the region’s National History Day competition.

March 29

Heinz History Center – 2:00 PM
Vietnam Veterans Day 50th Anniversary Commemoration
Join the Veterans Breakfast Club on National Vietnam War Veterans Day.

Tickets Required

April 5

Heinz History Center – 10:00 AM
Vintage Pittsburgh
Old is new again at the Heinz History Center’s 11th annual Vintage Pittsburgh retro fair!

April 12

Fort Pitt Museum – 1:00 PM
Speaker Saturday: “The Surveyor and the Silversmith”
Join Fort Pitt Museum for an afternoon with author C. Prentiss Orr.

Girl reading a book in the Detre Library and Archives.
Preserving Pittsburgh’s Memories
Detre Library & Archives

Heinz History Center

Research family history, explore historic images, and search thousands of documents in the Detre Library & Archives at the History Center. Open Wednesday through Saturday and free to all visitors.

Research & Explore
More than a ketchup museum.
The Heinz History Center is Pittsburgh’s people museum. We share the inspiring stories of Western Pennsylvania’s people who have helped change the course of American history. See for yourself.

Explore Exhibits

Kids & Families
Build bridges in the interactive Discovery Place or explore the Neighborhood of Make–Believe.

various sports museum merchandise
Unique Pittsburgh Gifts
From exclusive Heinz merch to the Mister Rogers kindness collection, find the perfect Pittsburgh gift at the Museum Shop.


Smithsonian Treasures
Discover Smithsonian artifacts at the History Center and learn more about the museum’s Smithsonian affiliation.


Explore Our Collections
Thousands of artifacts and historic images, at your fingertips."""

In [None]:
raw_museum['https://carnegiemuseums.org/about-us/our-museums/'] = """
Carnegie Museum of Art
Carnegie Museum of Art was the first museum in the United States with a strong focus on contemporary art, instructed by its founder, Andrew Carnegie, to collect the “old masters of tomorrow” at the inception of the Carnegie International in 1896. Today, it’s one of the most dynamic major art institutions in the country.

Carnegie Museum of Natural History
As one of the country’s largest and most respected natural history museums, Carnegie Museum of Natural History plays a critical role in both the Pittsburgh region and the international scientific community by conducting research and presenting exhibitions and programs that highlight the interdependence of humanity and nature, and advocating for the protection of the earth and its inhabitants. The museum’s millions of objects and specimens form one of the world’s great archives of biodiversity and the history of life.

Carnegie Science Center
Carnegie Science Center, soon to be the Daniel G. and Carole L. Kamin Science Center, is dedicated to inspiring learning and curiosity by connecting science and technology with everyday life. By making science both relevant and fun, the Science Center’s goal is to increase science literacy in the region and motivate young people to seek careers in science and technology. As the region’s most-visited museum, the Science Center reaches 500,000 people annually through hands-on exhibits, camps, classes, and off-site programs.

The Andy Warhol Museum
The most comprehensive single-artist museum in the world, The Andy Warhol Museum illuminates the art, life, and times of one of the most influential American artists of the 20th century. Combining artworks, images, and objects from Warhol's colorful life, the museum takes visitors on a tour of Andy Warhol's personal and professional life—from Andy Warhol the Pittsburgh art student to Andy Warhol the Pop icon. With some 500,000 artworks and objects, the museum is the global keeper of Warhol's legacy.
"""

In [None]:
raw_museum['https://carnegiemuseums.org/about-us/our-history/'] = """
The history of Carnegie Museums is bookended by two periods of fantastic growth and diversification. It all started in November of 1895 with the founding of Carnegie Institute, known today as Carnegie Museums of Pittsburgh. Andrew Carnegie called the original Oakland building—which included Carnegie Library of Pittsburgh—his “monument,” and he considered it the chief satisfaction of his life. That’s saying a lot, considering that this son of a poor Scottish weaver grew up to become one of the world’s most successful entrepreneurs and philanthropists.

A year later, Carnegie Museums held the first Carnegie International, second only to the Venice Biennale as the world’s oldest and most prestigious international exhibition of contemporary art, and the museum’s collection of art quickly grew as a result. Two years later, in response to news that the bones of “colossal” prehistoric creatures were being found out west, Carnegie sent a scientific crew to Wyoming, where they would discover the first of many world-famous dinosaurs.

A historic photo of a group of people at the 1896 Carnegie International
The first Carnegie International, 1896


To make room for the display of his namesake, Diplodocus carnegii, Andrew Carnegie broke ground on Carnegie Museums’ first major expansion in 1904. While the most famous product of that growth was the great Dinosaur Hall, its neighboring hall, the Hall of Architecture, perhaps best embodied Andrew Carnegie’s desire to “bring the world” to the people of Pittsburgh. He knew that most of those people would never leave their neighborhoods, their city, let alone the country. So he created a room filled with the casts of some of the greatest architectural wonders of the world. The hall itself is modeled after one of the seven wonders of the ancient world, the Mausoleum at Halicarnassus, and its rare collection of casts is the largest on display anywhere in the world today.

A historic photo of the Oakland Museum
The 1907 expansion

A historic photo of the original dinosaur hall
Early Dinosaur Hall

a historic photo of the hall of architecture
The Hall of Architecture

Adding two new museums
Carnegie Museums’ second most ambitious period of growth began in 1974, with the opening of the Sarah Scaife Galleries, which gave Carnegie Museum of Art’s constantly growing collections their own elegant space. Between 1980 and 1993, Carnegie Museum of Natural History added a number of new exhibit halls: Hillman Hall of Minerals & Gems, Polar World, the Benedum Hall of Geology, the Walton Hall of Ancient Egypt, and the Hall of African Wildlife.

The Carnegie

In 1986, the expanding cultural powerhouse became known as “The Carnegie,” and to this day, many in the region still refer to Andrew Carnegie’s Pittsburgh museums as The Carnegie.

In 1991, Carnegie Science Center entered the Carnegie Museums fold through a partnership between Carnegie Museums and the Buhl Foundation. Home to Buhl Planetarium, the Miniature Railroad & Village, and hundreds of interactive science exhibits, the Science Center now attracts more than half a million people a year. By the time of its 10-year anniversary in 2001, the Science Center opened Highmark SportsWorks, the largest collection of sports and science exhibits in the world.

exterior view of Carnegige Science Center in 1991
Carnegie Science Center

Three years later, Carnegie Museums again brought something new and irreplaceable to Pittsburgh, and the world: The Andy Warhol Museum. In addition to presenting the vast archives of Andy Warhol, the museum also has become a center for dialogue about a diverse range of topics, as well as a global ambassador to Pittsburgh’s cultural richness and diversity through its many traveling exhibitions.

exterior of The Andy Warhol Museum
The Andy Warhol Museum

Today’s ever-changing experiences
By the year 2000, the four Carnegie Museums became known as “Carnegie Museums of Pittsburgh.” The following two decades included the debut of Carnegie Museum of Natural History’s blockbuster Dinosaurs in Their Time exhibit, which is nearly three times the size of the original dinosaur hall and the first exhibit in the world to feature scientifically accurate, immersive environments spanning the Age of Dinosaurs. In 2003 and again in 2012, Carnegie Museum of Art accomplished its most important and most visitor-focused transformations since the opening of the Scaife wing with the reopening of its Scaife Galleries after year-long renovations, which included new presentations of its vast collections and the display of more artwork. In 2014, The Andy Warhol Museum rehung its collection, telling the story of Andy Warhol’s life and artwork chronologically for the first time to spectacular effect. In 2017, Carnegie Museums launched a collaborative event series, Carnegie Nexus, that reaches across the arts and sciences to design experiences that animate pressing issues of our time and instigate new ways to examine our world through live performance, the visual and literary arts, and thoughtful conversation. And in 2018, Carnegie Science Center completed a major expansion: PPG Science Pavilion, which houses a suite of STEM Learning Labs; a Special Exhibitions Gallery for large-scale changing exhibitions; a laser digital giant-screen theater; and outdoor learning spaces on the riverfront."""

In [None]:
raw_museum['https://carnegiemuseums.org/plan-a-visit/'] = """
Plan a Visit
Carnegie Museums of Art and Natural History
ADDRESS:
4400 Forbes Avenue
Pittsburgh, PA 15213
412.622.3131

HOURS:
Mon., Wed., Fri.-Sun. 10 a.m.–5 p.m.
Thurs.: 10 a.m.–8 p.m.;
Closed Tuesdays

HOLIDAYS/CLOSINGS:
The museums are closed on Easter, Thanksgiving, Christmas, and New Year’s Day.

ADMISSION:
Plan your next visit with timed ticketing.

Adults: $25
Seniors (65+): $20
Students with ID/Children age 3–18: $15
Members/Children 2 and under: Free
Weekdays after 3 p.m.: Half-price admission

SPECIAL OFFERS:
See discounts and offers for active U.S. Military, students, teachers, and more.

PARKING:

Our parking garage is located just behind the museum. Parking at the museums is now based on hourly rates, and guests will pull a ticket from the parking machines upon entry. Upon exiting, guests will have the option to pay with mobile pay via a QR code on ticket, at one of two pay stations, or at the exit gate. Cash is not an accepted form of payment for parking.

See our parking rates

We offer designated accessible spaces in the garage for visitors with disabilities. The clearance is six feet, eight inches. If you need alternate arrangements in advance for oversized vehicles, please call us at 412.622.3131, 9 a.m. to 5 p.m., Monday through Friday.

DINING:
Café Carnegie is open for on-site dining or for takeout on Monday, Tuesday, Wednesday, Friday, Saturday, and Sunday from 11 a.m.-3 p.m., Thursday 11 a.m. to 3 p.m., 5 p.m. to 8 p.m. and the Coffee and Wine Bar is open Monday, Tuesday Wednesday, Friday, Saturday, and Sunday from 9a.m.-5 p.m., Thursday 9 a.m. to 8 p.m. Fossil Fuels is open Monday, Wednesday, Thursday, and Friday 8 a.m. to 4 p.m., Tuesday 8 a.m. to 2 p.m., Saturday, Sunday 10 a.m. to 4 p.m.

ACCESSIBILITY: The museums are continually working to make programs and exhibitions accessible to all visitors.

Carnegie Science Center
ADDRESS:
One Allegheny Avenue
Pittsburgh, PA 15212
412.237.3400

HOURS:
Open Daily 10 a.m.–5 p.m.;
Closed Tuesdays

CLOSINGS/HOLIDAYS:
Closed Thanksgiving, Christmas, and dates of home Steelers games and other major events at Acrisure Stadium. See the Science Center’s Museum Hours page for complete details.

ADMISSION:
Plan your next visit with timed ticketing.

See pricing.

SPECIAL OFFERS:
See discounts and offers.

PARKING:
$6 per vehicle. You can use the pay station to pay for parking at the beginning of your visit, as you are leaving, or any time in between.

DINING:
The River View Café is open from 10 a.m.-3 p.m.

ACCESSIBILITY:
The museums are continually working to make programs and exhibitions accessible to all visitors.





Carnegie Museums Visitor Conduct Policy
The Andy Warhol Museum
ADDRESS:
117 Sandusky Street
Pittsburgh, PA 15212-5890
412.237.8300

HOURS:
Mon., Wed., Thurs., Sat., Sun. 10 a.m.–5 p.m.
Fri. 10 a.m.–10 p.m. (half price 5–10 p.m.)
Closed Tuesdays

HOLIDAYS/CLOSINGS:
The museum is closed on Easter, Thanksgiving, Christmas, and New Year’s Day.

ADMISSION:
Plan your next visit with timed ticketing.

Adults: $25
Students with valid ID: $13
Seniors (65+): $13
Children (3-18): $13
Members/Children 2 and under: Free
Fridays 5–10 p.m.: Half-price admission

SPECIAL OFFERS:
Half-priced admission from 5–10 p.m. every Friday. See additional discounts and offers.

PARKING:
Museum parking is located one block north of the museum on Sandusky Street ($8). Additional public parking is available behind the museum in the General Robinson Street parking garage (average $6, prices may vary during nearby stadium events).

DINING:
The Warhol Café is open during museum hours and accessible without museum admission.

ACCESSIBILITY:
The museums are committed to providing excellent experiences for all visitors. Learn more about accessibility accommodations.

"""

In [None]:
raw_museum['https://carnegiemuseums.org/events/'] = """
Jan 18, 2025 - Jun 1, 2025
Gertrude Abercrombie: The Whole World is a Mystery
Art
FeaturedEvent
Exhibitions
Mar 1, 2025 - Aug 17, 2025
Mental Health: Mind Matters
Science Center
FeaturedEvent
Exhibitions
Mar 22, 2025
NatureFest
Natural History
FeaturedEvent
Activities
Every Wednesday
Youth Open Studio
Warhol
Activities
|
Just for Teens
Every Friday
Good Fridays
Warhol
After-hours Events
Last Saturday of each month
Dandy Andy: Warhol’s Queer History
Warhol
Talks & Tours
|
Just for Adults
FRIDAY THROUGH SUNDAY
The Factory – Fridays
Warhol
Activities
|
Kids/Family
FRIDAY THROUGH SUNDAY
The Factory – Saturdays
Warhol
Activities
|
Kids/Family
FRIDAY THROUGH SUNDAY
The Factory – Sundays
Warhol
Activities
|
Kids/Family
Saturdays
Fab Lab Flash Workshop
Science Center
Activities

Fridays through May 9, 2025
Carnegie Lab – Friday
Art
Activities
|
Kids/Family
Every Saturday through May 10, 2025
Carnegie Lab – Saturday
Art
Activities
|
Kids/Family
Sep 21, 2024 - Jun 15, 2025
Tatiana Bilbao Estudio: City of Rooms
Art
Exhibitions
Saturdays
Storytime at the Museum
Natural History
Activities
|
Kids/Family
Mar 1, 2025 - Jul 27, 2025
Gala Porras-Kim
Art
Exhibitions
Mar 13, 2025
Teen Night: Medical Marvels
Science Center
Activities
|
Just for Teens
Mar 14, 2025
21+ Night: Pi Day
Science Center
After-hours Events
|
Just for Adults
Mar 14, 2025
Meet Mario, the new director of The Warhol
Warhol
After-hours Events
Mar 15, 2025
Story Saturday: What Can a Mess Make?
Art
Activities
|
Kids/Family
Mar 17, 2025
Moriarty Science Seminar: Using Bird Banding Datasets to Track Trends in Bird Populations
Natural History
Talks & Tours

Mar 20, 2025
The Network Factory IX: Neighbor to Neighbor
Warhol
After-hours Events
Mar 22, 2025 - Jul 13, 2025
Raymond Saunders: Flowers from a Black Garden
Art
Exhibitions
Mar 22, 2025
Teen Sensory Friendly Silent Disco
Warhol
Activities
|
Just for Teens
Apr 12, 2025
Youth Exhibition Opening Celebration
Art
Activities
Jul 12, 2025 - Jan 25, 2026
Fault Lines: Art, Imperialism, and the Atlantic World
Art
Exhibitions
Aug 22, 2025 - Feb 1, 2026
Charles Harlan
Art
Exhibitions
Aug 23, 2025 - Jan 11, 2026
after school
Art
Exhibitions
"""

In [None]:
raw_museum['https://carnegiemuseums.org/things-to-do/explore-our-collections/'] = """
Explore Our Collections
Individually, Carnegie Museum’s collections are impressive. Together, they’re unparalleled in scope and diversity. Our museums are home to some of the most historically significant and artistically brilliant collections in the world—the foundation of the museums’ expansive research, exhibitions, and educational offerings.

Carnegie Museum of Art
With a collection of more than 30,000 works, areas of excellence include Old Master prints and drawings, nearly 3,000 Japanese prints, Impressionism, the expansive Teenie Harris Archive, our film and video archive, and contemporary art from the 1980s to the present, along with iconic works from other eras.

Carnegie Museum of Natural History
A wondrous collection of millions of objects and scientific specimens used every day to broaden our understanding of the history of Earth, evolution, conservation, and biodiversity.

The Andy Warhol Museum
It’s the definitive collection of all things Warhol, including the artist’s 610 Time Capsules and 12,000 paintings, drawings, prints, photographs, sculpture, films, and videos. Archives include more than 500,000 objects.

Nights at the Museums
Sleepovers, adults-only fun, stargazing, and more!

Learn With Us
From curious preschoolers to art-loving great-grandparents and every age and interest in between, Carnegie Museums is the destination for lifelong learning.

View Our Exhibitions
Dinosaurs and the art of Andy Warhol. A World War II sub, original Monets, and the largest exhibition dedicated to robots.

Super Science Saturdays
Get up-close with dinosaurs, discover secrets of DNA, and learn what it’s like to be a scientist in the field.

"""

In [None]:
raw_museum['https://carnegiemuseums.org/'] = """
More than 80 ancient Egyptian objects
Through Summer 2025

Every Egyptian object in the museum’s care has stories to tell, about its creation and original use, its journey to Pittsburgh, and about the lives of those in ancient Egypt. The Stories We Keep invites visitors to see these objects—cared for by the Museum for more than a century—in a new light and to witness the work that will preserve them for future generations.

Ongoing

Winners of the 2024 Inclusive Call for Art
Carnegie Museums of Pittsburgh, Carnegie Library of Pittsburgh, the University of Pittsburgh, 1Hood Media, and the August Wilson African American Cultural Center again joined together to host the second iteration of Envisioning a Just Pittsburgh, an inclusive call for art that encouraged artists throughout southwestern Pennsylvania to share their visions for a just and equitable Pittsburgh. We are proud to announce the First and Second Place winners in the competition’s six categories.

Through Aug. 17

Breaking stigma. Building understanding.
Step into an immersive journey designed to break stigma, build understanding, and spark important conversations. The Mental Health: Mind Matters traveling exhibition brings the science of mental health to life at the Science Center through immersive exhibits and personal stories.

 Jan. 18–June 1

Gertrude Abercrombie: The Whole World Is a Mystery
Featuring loans from important institutional and private collections, this exhibition is the most comprehensive museum presentation of the works of Gertrude Abercrombie, a critical figure in the mid-20th-century Chicago art scene from the 1930s until her death in 1977.

Through March 31

Women’s History Month with Carnegie Museums
Women’s History Month is a time to celebrate the contributions, achievements, and resilience of women throughout history. Carnegie Museums join in highlighting the vital role women have played in shaping society, from science and arts to politics and activism. It’s an opportunity to honor trailblazers, amplify women’s voices, and inspire future generations."""

In [None]:
# Map every museum page URL to its cleaned text: clean_data is applied to each
# raw page body, and the keys (and their order) are taken from raw_museum as-is.
cleaned_museum = dict(zip(raw_museum, map(clean_data, raw_museum.values())))

In [None]:
# Snapshot of the museum page URLs, in raw_museum's key order.
museum_urls = [*raw_museum]

In [None]:
# Sanity check: how many museum pages have been collected so far.
len(museum_urls)

13

In [None]:
# Manual check: eyeball the cleaned text of a single page.
# NOTE: museum_urls is built from raw_museum's keys, so which page index 3
# refers to depends on the order in which keys were inserted into raw_museum.
cleaned_museum[museum_urls[3]]


'More than 80 ancient Egyptian objects Through Summer 2025 Every Egyptian object in the museums care has stories to tell, about its creation and original use, its journey to Pittsburgh, and about the lives of those in ancient Egypt. The Stories We Keep invites visitors to see these objectscared for by the Museum for more than a centuryin a new light and to witness the work that will preserve them for future generations. Ongoing Winners of the 2024 Inclusive Call for Art Carnegie Museums of Pittsburgh, Carnegie Library of Pittsburgh, the University of Pittsburgh, 1Hood Media, and the August Wilson African American Cultural Center again joined together to host the second iteration of Envisioning a Just Pittsburgh, an inclusive call for art that encouraged artists throughout southwestern Pennsylvania to share their visions for a just and equitable Pittsburgh. We are proud to announce the First and Second Place winners in the competitions six categories. Through Aug. 17 Breaking stigma. Bu

In [None]:
# Write the cleaned museum texts (URL -> cleaned text) to cleaned_museum.json.
# save_to_json is defined elsewhere in this notebook; presumably it serialises
# the dict as JSON at the given path -- confirm against its definition.
save_to_json(cleaned_museum, "cleaned_museum.json")

# FOOD

In [None]:
# FOOD
# Pages about Pittsburgh-area food festivals and events to pass to scrape_data.
# Every entry must be an absolute URL including its scheme: requests refuses
# scheme-less URLs ("Invalid URL ...: No scheme supplied"), which previously
# caused the visitpittsburgh.com page to be skipped entirely.
food_urls = [
    'https://visitpittsburgh.com/events-festivals/food-festivals/',
    'https://www.picklesburgh.com/',
    'https://www.pghtacofest.com/',
    'https://www.pghtacofest.com/about',
    'https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/',
    'https://littleitalydays.com/',
    'https://littleitalydays.com/entertainment-schedule/',
    'https://bananasplitfest.com/',
    'https://bananasplitfest.com/activities/',
    'https://bananasplitfest.com/activities/food/',
    'https://bananasplitfest.com/events/',
    'https://bananasplitfest.com/history/',
]

In [None]:
# Scrape every food page with scrape_data (defined earlier in this notebook).
# The recorded output of this cell shows several sites answering
# "403 Client Error: Forbidden"; the cells that follow assign page text to
# those raw_food keys directly.
raw_food = scrape_data(food_urls)

Error scraping visitpittsburgh.com/events-festivals/food-festivals/: Invalid URL 'visitpittsburgh.com/events-festivals/food-festivals/': No scheme supplied. Perhaps you meant https://visitpittsburgh.com/events-festivals/food-festivals/?
Error scraping https://www.picklesburgh.com/: 403 Client Error: Forbidden for url: https://www.picklesburgh.com/
Error scraping https://littleitalydays.com/: 403 Client Error: Forbidden for url: https://littleitalydays.com/
Error scraping https://littleitalydays.com/entertainment-schedule/: 403 Client Error: Forbidden for url: https://littleitalydays.com/entertainment-schedule/
Error scraping https://bananasplitfest.com/activities/: 403 Client Error: Forbidden for url: https://bananasplitfest.com/activities/
Error scraping https://bananasplitfest.com/activities/food/: 403 Client Error: Forbidden for url: https://bananasplitfest.com/activities/food/
Error scraping https://bananasplitfest.com/events/: 403 Client Error: Forbidden for url: https://bananaspl

In [None]:
raw_food['https://bananasplitfest.com/history/'] = """History
The story of the All-American dessert that started in Latrobe
Since its invention in 1904, the Banana Split has become an iconic part of American dessert culture. Its history began when David Strickler, a 23-year-old “soda jerk” at a drugstore in Latrobe, Pennsylvania, started experimenting with different ice cream combinations. He cut a banana lengthwise, added three scoops of ice cream – typically vanilla, chocolate, and strawberry – and then topped it with sweet syrups, marshmallow, chopped nuts, whipped cream, and a maraschino cherry. It was an instant hit!

The Banana Split quickly gained popularity, particularly among the students at nearby St. Vincent College, who would frequent the drugstore to indulge in the delicious dessert. Strickler’s banana-based sundaes quickly became known as “Dr. Dave’s” among the students, who would rave about them to their hometown soda jerks, spreading the word about the new dessert.

Within a few years, the Banana Split gained national recognition. The dessert had become a staple of American dessert culture, and Strickler had become a pioneer of the ice cream industry.

Strickler went on to buy the Latrobe pharmacy where he had invented the Banana Split, and he even added an optical business upstairs. He was a staple in the local community until he passed away in 1971 at the age of 90.

In 2004, on the 100th anniversary of the Banana Split,  “Ice-Cream Joe” Gruble worked with the University of Pitt to commemorate Strickler’s invention with the “From Pitt Came the Split” Event.

The Great American Banana Split Festival has been a staple in Latrobe since 2013, when the Pennsylvania Historical Society unveiled a marker at the former site of Tassel’s Pharmacy where Sticker was working at the time of his invention, cementing Stricklers legacy in historical record.

Today, the Banana Split remains a beloved dessert across the United States and is often found on the menus of ice cream shops and restaurants. It is also the official dessert of Pennsylvania!


David Strickler


Inside Strickler’s Drug Store


Outside Strickler’s Drug Store

Photos courtesy of the Latrobe Historical Society

Honorary Chair
Ice Cream Joe Gruble
The legacy of Latrobe as the birthplace of the banana split might not have been fully realized except for the efforts of Ice Cream Joe. While locals knew of the town’s claim to the dessert, it was Joe who in 2004, on the 100th anniversary of the invention, worked with the University of Pittsburgh on the “From Pitt Came the Split” event. He was also part of the group of Latrobeans that submitted the petition for the Pennsylvania Historical Marker that was installed in 2013.

From 2015 until his passing in 2017, Joe served up ice cream, ideas, and smiles as the celebration’s honorary chair.
"""

In [None]:
raw_food['https://bananasplitfest.com/events/'] = """
5K Banana Run
Get ready to go bananas at the annual 5k Banana Run! This fun run is perfect for runners of all ages and fitness levels. Participants are encouraged to wear yellow and don their best banana-themed costumes as they run or walk the 3.1-mile course.

Check It Out
Blood Drive
Giving blood is one of the most selfless acts of kindness one can do. It only takes a small amount of time, but the impact it can have is immeasurable. Blood donations save lives every day, and give you the power to make a difference in someone’s life. Whether it’s for a family member, a friend, or a complete stranger, your donation can give someone a second chance at life. Take a quick break from the fun and donate while you’re here! It’s a simple act of kindness that can have a profound impact on someone in need.

Check It Out
Cornhole Tournament
Get ready to toss some bags and have some fun! This popular game is a great way to get outside and enjoy the company of friends and family. With multiple rounds and elimination matches, the stakes get higher as the competition heats up. Whether you’re a seasoned cornhole player or a beginner, this tournament is a fun way to spend an afternoon in the sun. So grab a partner and join in on the fun at the cornhole tournament!

Check It Out
Car Show
Get ready to rev your engines! This event is perfect for car enthusiasts of all ages who want to see some beautiful antiques and unique vehicles on display. The car show features a wide variety of makes and models with something for everyone. Owners are often on hand to talk about their cars, and visitors can get up close and personal with the vehicles, taking in every detail. So come on out and enjoy!

Check It Out
Yellow Tie Gala
The Latrobe Art Center welcomes in the fun-filled weekend with a banana extravaganza! Enjoy a live music performance with Ricolita Café food, a yellow tie cocktail, and even a banana spit bar. Don’t forget to wear your best yellow outfit!

Check It Out"""

In [None]:
raw_food['https://bananasplitfest.com/activities/food/'] = """
Savor local food vendors and eateries!
Food Vendors
Chocolate Moonshine Co.
Fairly Local Foods
Firehouse Subs
Hillbilly’s Jerky
Jackson M Concessions
Local Restaurants
512 Coffee & Ice Cream

Carmine’s Pizza & Pasta

Chef Dato’s Table

DeNunzio’s Italian Chophouse

Dino’s Sports Lounge

Gino Giannilli’s Homestyle Pizza

Gosia’s Pierogies

Lucky B’s

Mailey’s Provisions
Ricolita’s Café

Sharky’s Café / The Pier

Touchdown Club 2

Valley Dairy Restaurant
"""

In [None]:
raw_food['https://bananasplitfest.com/activities/'] = """
Banana Split Princess
Have the chance to meet the Banana Split Princess and her court throughout the weekend! They will be at various activities, and you can meet them at the Banana Split Princess booth!

Check It Out
Crafts, Games, & Activities
The Great American Banana Split Celebration has a lively atmosphere filled with crafts, games, and activities for all ages. Kids can enjoy face painting, balloon animals, and interactive games while parents can shop for handmade crafts, visit the farmers market, or relax with a game of cornhole. For the more adventurous, there’s even a rock climbing wall!

Check It Out
Vendors
Visiting the event’s vendors is a unique and rewarding experience that provides a chance to support the local community and find one-of-a-kind products. They offer a personal touch with friendly staff who are passionate about their products and happy to provide personalized recommendations. With a wide range of options, from handmade crafts to locally-sourced food and drinks, there’s something for everyone! Discover your new favorites this weekend and come explore all that Latrobe has to offer.

Check It Out
Eat & Drink
Discover and enjoy all sorts of yummy treats, from classic celebration food like funnel cakes and corn dogs to fresh eats from local food trucks. Enjoy cold drinks like lemonade or craft beer while listening to live music, creating unforgettable memories and experiencing some good old-fashioned fun!

Check It Out
Entertainment
Featuring live performances from both local and well-known bands, you won’t want to miss this year’s entertainment lineup!

Check It Out
Over 21 Area
Those over the age of 21 can indulge in a variety of craft brews and local wines, each with its own distinct flavors and character. From hoppy IPAs to rich red wines, there’s something for every palate. The section offers a more laid-back atmosphere where adults can socialize, unwind, and enjoy the fruits of local breweries and wineries. It’s a great way to discover new favorites and support the local community while having a fun, relaxing time with friends.

Check It Out
"""

In [None]:
raw_food['https://littleitalydays.com/entertainment-schedule/'] = """
Entertainment Schedule
(subject to change)
Thursday, August 15
Mini of Pittsburgh & First National Bank Stage at Cedarville & Liberty

5:30pm Tony Guarino - Singing Sinatra and other Italian hits

6:00pm Celebrity Bocce Tournament - Pittsburgh politicians Irish vs Italians plus Pittsburgh media/entertainment personalities

Friday, August 16
Mini of Pittsburgh & First National Bank Stage at Cedarville & Liberty

5:30pm John Vento's Italian Invasion - Performing a variety of hits featuring excellent vocalists

7:30pm Best Pizza in Bloomfield Competition - Featuring Angelo's Pizzeria, Caliente Pizza & Draft House, and Pizza Italia

Saturday, August 17
Mini of Pittsburgh & First National Bank Stage at Cedarville & Liberty

12:00pm John Lupone - Italian opera vocalist

2:30pm Mirella the Musician - Authentic Italian Accordionist

5:30pm The Cavaliers - Playing Italian-American hits

Froggy's Stage at Taylor & Liberty

12:00pm Untamed - All-female trio with excellent harmonies

3:00pm Pizza Acrobatics - 2x World Champion Matt Hickey of Caliente Pizza & Draft House

4:00pm The Nation's #1 Earth, Wind & Fire Tribute Band - Let's Groove Tonight

7:00pm Fleetwood Mac Tribute Band Silver Springs

First Commonwealth Stage at Gross & Liberty

12:00pm Miss Little Italy Pagaent

2:30pm Magic Moments - Pittsburgh's #1 Oldies Band

6:00pm Jerry DeMaria with the Pittsburgh Festival Opera - Singing Italian songs, Sinatra, Dean Martin, Louis Prima and more!

Sunday, August 18
Mini of Pittsburgh Stage at Cedarville & Liberty

12:00pm Joni Marie- Soprano vocalist of Opera

1:30pm Allegro Dance Company - Dance instructor Anna Harsh from Southern Italy teaching interactive Tarantella dance workshop

2:30pm Jackie Hooper Duo - Performing Italian hits

Froggy's Stage at Taylor & Liberty

12:00pm Elias Khouri - Talented young singer and guitarist

2:30pm America’s #1 Jacksons Tribute Band Dancing Machine - Performing the music of Michael, Janet, and the Jackson 5

First Commonwealth Stage at Gross & Liberty

12:00pm East End Kids - Pittsburgh's premiere teen song & dance ensemble

1:30pm Mojo Hand - High-energy band with killer grooves

Accordionist Hank Edwardo strolling throughout the event"""

In [None]:
raw_food['https://littleitalydays.com/'] = """
Region's Largest Heritage Festival
Celebrating Our Roots & Supporting Bloomfield Businesses
2025 Dates
Thursday August 14, 5:00pm-9:00pm • Friday August 15, noon-9:00pm • Saturday August 16 noon-9:00pm • Sunday August 17, noon-5:00pm
4 Days • 3 Packed Stages • Over 30 Acts!
All Italian...All Spectacular...All Free!

Get The VIP Experience!
$19.95 for one, $34.95 for two VIP Experience Includes:

1 Little Italy Days T-Shirt

1 Slice of Pizza from Caliente Pizza

1 Bottle of water

VIP Restroom Access
Access to VIP tent for seating near main stage and to relax with shelter from the sun.

BUY VIP EXPERIENCE
Location
In Pittsburgh, PA on Liberty avenue in Bloomfield. From Ella St. to Gross St.


Parking & Seating
Please do not park in the permitted residential parking spaces in Bloomfield, respect the limited parking available for residents. Also, you will be ticketed. Use one of these options.
Limited seating is available, bring lawn chairs to be sure of getting a seat.

In Partnership
We are pleased to support Bloomfield Alliance, Bloomfield Business Network, Bloomfield Citizens Council, and Bloomfield Development Corporation.

Italians vs. Irish Celebrity Bocce Tournament
The Italians continued their winning streak for a third straight year! Come and see who takes the trophy home for 2023!

Celebrity Bocce Challenge - Politicians and media personalities compete on the bocce court!
Little Italy Days Bocce Tournament – Our annual tournament will be held on Saturday and Sunday. If you’d like to enter a team, reach out to Corrado or Adam at littleitalydaysbocce@gmail.com or 412-480-5918/412-600-5702 for details. The entry fee is $150 per team. The tournament will be double elimination and begins at 10 AM on Saturday morning and continues Sunday morning at 9AM. Teams must arrive by 9:30AM Saturday for bracket drawing, no exceptions.The bocce court is located on Cedarville Street next to the Pleasure Bar.

Miss Little Italy Pagaent
The Miss Little Italy is a family-oriented festival pageant, celebrating a special Bloomfield tradition. Contestants DO NOT need to be of Italian heritage or reside in the city of Pittsburgh. Event Fee: $35.00 for first child. Includes a participation gift for all. Miss Little Italy Winners receive a crown, sash & flowers.
"""

In [None]:
raw_food['https://www.picklesburgh.com/visit/'] = """
What is it?
Picklesburgh is the destination for all things pickled! Voted the #1 Specialty Food Festival in the Country four times by the readers of USA Today, Picklesburgh goes beyond the dill pickle to feature handcrafted food and artisan drinks featuring pickled ingredients from local chefs, restaurants, and more. Plus, don’t miss pickle-themed snacks and merchandise, fun contests, and entertainment set to the backdrop of picturesque Downtown Pittsburgh.

When is it?
Friday, July 11 – Sunday, July 13, 2025.

Where is it?
Downtown Pittsburgh. Location details to be announced soon.

How much does it cost?
Admission is always free – no tickets are necessary!
You must be 21+ to consume/purchase alcohol and proper ID will be required.

Where do I park?
No matter where you’re coming from there are lots of convenient parking options for Picklesburgh. Park near Picklesburgh or across town to avoid Festival traffic. Real-time parking availability all around Downtown: https://parkpgh.org/

What hotels are nearby?
There are several hotels near and around the event footprint. Check out these options in Downtown.
Guests can also book accommodations outside of Downtown and utilize public transportation to get to and from the event."""

In [None]:
raw_food['https://www.picklesburgh.com/'] = """
Picklesburgh 2025
July 11-13, 2025
Downtown Pittsburgh
Location details to be announced soon
Subscribe to get updates!

USA Today 10Best has once again named Picklesburgh as the 2025 Readers’ Choice Best Specialty Food Festival!
This is the fourth win for this beloved Pittsburgh event, and after a second-place finish last year, we’re DILLighted to bring the title back to the Golden Triangle!

We’re grateful to all the pickle-passionate fans who voted. We can’t wait to relish this sweet victory with you in July!
"""

In [None]:

# Run clean_data over every scraped page, keeping the same URL keys as raw_food.
cleaned_food = {url: clean_data(text) for url, text in raw_food.items()}

In [None]:
# Materialize the URL keys as a list so individual pages can be spot-checked by position.
food_urls = list(cleaned_food.keys())

In [None]:
food_urls[2]

'https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/'

In [None]:
len(food_urls)

12

In [None]:
# manual check
cleaned_food[food_urls[2]]

"Annual Winter Theme: New Dishes for the New Year Div 1: Toggle NavigationGet InvolvedJoin Newsletterregister for restaurant week updates * indicates required Email Address * First Name Last Name Zip Code Close var fnames = new Array();var ftypes = new Array();fnames[0]=EMAIL;ftypes[0]=email;fnames[1]=FNAME;ftypes[1]=text;fnames[2]=LNAME;ftypes[2]=text;fnames[3]=ZIP_CODE;ftypes[3]=number; try { var jqueryLoaded=jQuery; jqueryLoaded=true; } catch(err) { var jqueryLoaded=false; } var head= document.getElementsByTagName(head)[0]; if (!jqueryLoaded) { var script = document.createElement(script); script.type = text/javascript; script.src = http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js; head.appendChild(script); if (script.readyState && script.onload!==null){ script.onreadystatechange= function () { if (this.readyState == complete) mce_preload_check(); } } } var script = document.createElement(script); script.type = text/javascript; script.src = http://downloads.mailchimp.c

In [None]:
# Persist the cleaned URL -> text mapping; "cleaned_food.json" is one of the files
# the QA-generation step reads back later in this notebook.
save_to_json(cleaned_food, "cleaned_food.json")

# 2. QA generation for train & test

In [None]:
!pip install datasets

In [None]:
from datasets import Dataset
import json
import torch



In [None]:
# Load the cleaned sports pages; the cells below index events_data by page URL.
with open('cleaned_sport.json', 'r') as f:
    events_data = json.load(f)


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; renders a widget in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1. Try Mistral-7B

In [None]:
# Mistral-7B-Instruct: load tokenizer and model (bfloat16, device_map="auto").
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

# NOTE(review): exact duplicate of the import at the top of this cell; redundant but harmless.
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Sampling text-generation pipeline bound to the notebook-level name `pipe`,
# which the QA-generation cells call directly.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
events_data['https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/']

'In Pittsburgh, we bleed black and gold. Come see for yourself why Sporting News magazine awarded Pittsburgh the coveted "Best Sports City" title and why the USA TODAY 10 Best Reader\'s Choice poll named Pittsburgh as one of the winners of the "Best City for Sports" travel award. If its action you want, this city has it covered with the best of football, baseball, hockey and more. Grab your Terrible Towel and visit Acrisure Stadium to watch the six-time Super Bowl Champion Pittsburgh Steelers. Head to PPG Paints Arena and join in all the excitement when the Penguins take the ice. The Pirates make a perfect summer night complete as you watch the game from PNC Park, rated by Travel & Leisure as the "best baseball stadium in America!" Pittsburgh is officially "Sixburgh" as the Steelers became the first team in NFL history to win six Super Bowl titles! Steelers Nation spreads far and wide, so grab your Terrible Towel and come celebrate where it all originates: the Home of the World Champio

In [None]:
# Smoke test: generate QA pairs for a single page with the text-generation pipeline `pipe`.
qa_pairs = []

# Process each event
# for event in events_data:
#     url = event['url']
#     content = event['contents']

url = 'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/'
# Only the first 256 characters of the page are placed in the prompt.
content = events_data[url][:256]
# Create prompt for QA generation
prompt = f"""
Generate 3 question-answer pairs about
Pittsburgh and CMU, including history, culture, trivia, and upcoming events
based on this website content:

Website content: {content}

Format each QA pair as:
Q: [Question]
A: [Answer]

Some example questions:
When was Carnegie Mellon University founded?
What is the name of the annual pickle festival held in Pittsburgh?
When was the Pittsburgh Soul Food Festival established?
Who is performing at X venue on Y date?

Make sure to include specific details from the event content. For questions with multiple valid answers,
you can include multiple answers and separate by a semicolon ;.
"""


result = pipe(prompt)[0]['generated_text']

# The text-generation pipeline returns prompt + completion; keep only the completion.
generated_content = result[len(prompt):]

pairs = []
for pair in generated_content.split("Q: ")[1:]:  # Skip the text before the first "Q: "
    parts = pair.split("A: ", 1)
    if len(parts) == 2:
        question = parts[0].strip()
        answer = parts[1].split("Q:", 1)[0].strip()  # Get everything until next Q: or end
        pairs.append({"question": question, "answer": answer, "event_url": url})

# Extend once, after parsing is finished. Doing this inside the loop re-appended the
# growing `pairs` list on every iteration and produced duplicated QA entries.
qa_pairs.extend(pairs)



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
qa_pairs

[{'question': 'Which publication named Pittsburgh as the "Best City for Sports"?',
  'answer': 'Sporting News magazine',
  'event_url': 'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/'},
 {'question': 'Which publication named Pittsburgh as the "Best City for Sports"?',
  'answer': 'Sporting News magazine',
  'event_url': 'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/'},
 {'question': "Why did USA TODAY 10 Best Reader's Choice poll name Pittsburgh as a winner?",
  'answer': 'Pittsburgh was named as one of the winners of the "Best City for" category in the USA TODAY 10 Best Reader\'s Choice poll.',
  'event_url': 'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/'},
 {'question': 'Which publication named Pittsburgh as the "Best City for Sports"?',
  'answer': 'Sporting News magazine',
  'event_url': 'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/'},
 {'question': "Why did USA TODAY 10 Best Reader's 

## 2. GPT2 for QA generation

In [None]:
# # GPT 2 MODEL
# from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# # Load GPT-2 model and tokenizer
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

# # Set up generation pipeline
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=512,
#     do_sample=True,
#     temperature=0.7,
#     top_p=0.9,
# )



Device set to use cuda:0


In [None]:
# # Example of generating QA pairs
# url = 'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/'
# content = events_data[url][:512]

# prompt = f"""
# Generate as many question-answer pairs as possible about Pittsburgh and CMU, including history, trivia, and upcoming events based on this website content:

# Website content: {content}

# Format each QA pair as:
# Q: [Question]
# A: [Answer]

# Some example questions:
# - When was Carnegie Mellon University founded?
# - What is the name of the annual pickle festival held in Pittsburgh?
# - Who is performing at X venue on Y date?

# Ensure answers are specific to the website content.
# """

# # Generate QA pairs
# result = pipe(prompt, max_length=256, num_return_sequences=1)
# generated_content = result[0]['generated_text']

# # Further extract and process QA pairs as before

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [None]:
# result

[{'generated_text': '\nGenerate as many question-answer pairs as possible about Pittsburgh and CMU, including history, trivia, and upcoming events based on this website content:\n\nWebsite content: In Pittsburgh, we bleed black and gold. Come see for yourself why Sporting News magazine awarded Pittsburgh the coveted "Best Sports City" title and why the USA TODAY 10 Best Reader\'s Choice poll named Pittsburgh as one of the winners of the "Best City for Sports" travel award. If its action you want, this city has it covered with the best of football, baseball, hockey and more. Grab your Terrible Towel and visit Acrisure Stadium to watch the six-time Super Bowl Champion Pittsburgh Steelers. Head to PPG Pai\n\nFormat each QA pair as:\nQ: [Question]\nA: [Answer]\n\nSome example questions:\n- When was Carnegie Mellon University founded?\n- What is the name of the annual pickle festival held in Pittsburgh?\n- Who is performing at X venue on Y date?\n\nEnsure answers are specific to the website

## 3. Phi-2 for QA generation

In [None]:
# # phi-2

# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# import torch

# # Load Phi-2 model - works well on Colab's limited resources
# model_name = "microsoft/phi-2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )

# # Set up generation pipeline
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=512,  # Reduced for memory
#     do_sample=True,
#     temperature=0.7
# )



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
# # Simplified prompt for better results with smaller models
# prompt = f"""
# Generate 2 question-answer pairs about Pittsburgh and CMU, including history, trivia, and upcoming events based on this website content:

# Website content: {content[:512]}

# Format each QA pair as:
# Q: [Question]
# A: [Answer]

# Some example questions:
# - When was Carnegie Mellon University founded?
# - What is the name of the annual pickle festival held in Pittsburgh?
# - Who is performing at X venue on Y date?

# Ensure answers are specific to the website content.
# """

# result = pipe(prompt)
# generated_text = result[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# generated_text

'\nGenerate 2 question-answer pairs about Pittsburgh and CMU, including history, trivia, and upcoming events based on this website content:\n\nWebsite content: In Pittsburgh, we bleed black and gold. Come see for yourself why Sporting News magazine awarded Pittsburgh the coveted "Best Sports City" title and why the USA TODAY 10 Best Reader\'s Choice poll named Pittsburgh as one of the winners of the "Best City for Sports" travel award. If its action you want, this city has it covered with the best of football, baseball, hockey and more. Grab your Terrible Towel and visit Acrisure Stadium to watch the six-time Super Bowl Champion Pittsburgh Steelers. Head to PPG Pai\n\nFormat each QA pair as:\nQ: [Question]\nA: [Answer]\n\nSome example questions:\n- When was Carnegie Mellon University founded?\n- What is the name of the annual pickle festival held in Pittsburgh?\n- Who is performing at X venue on Y date?\n\nEnsure answers are specific to the website content.\nAssistant: Q: In what year 

## 4. Final: T5 for QA generation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


model_name = "google/flan-t5-xl"  # Flan-T5-XL model
tokenizer_t5 = AutoTokenizer.from_pretrained(model_name)
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")


# Build the seq2seq pipeline from the Flan-T5 objects loaded just above.
# The previous version passed the notebook-level `model`/`tokenizer` (the causal
# Mistral model), so `pipe_t5` never actually used Flan-T5.
pipe_t5 = pipeline(
    "text2text-generation",
    model=model_t5,
    tokenizer=tokenizer_t5,
    max_length=512,
    num_beams=5,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

In [None]:
def split_content(content, chunk_size=1000, overlap=200):
    """Split ``content`` into overlapping character windows.

    Args:
        content: Sequence to split (a string in this notebook).
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of trailing characters of one chunk repeated at the
            start of the next; must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunks in order. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would make the window stall
            or move backwards (previously this looped forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(content)
    chunks = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        chunks.append(content[start:end])
        if end == total:
            # The window already reached the end; another step would only emit
            # a tail that is fully contained in the chunk just appended.
            break
        start += step
    return chunks

# chunk large content
def process_content_in_chunks(url, content, questions_per_chunk=5, pipe=pipe):
    """Generate question/answer pairs for one page, chunk by chunk.

    Args:
        url: Source URL; embedded in the prompt and stored on every pair.
        content: Page text; split with ``split_content`` before prompting.
        questions_per_chunk: Number of QA pairs requested per chunk.
        pipe: Callable generation pipeline whose result is indexed as
            ``pipe(prompt)[0]['generated_text']``. The default is whatever the
            notebook-level ``pipe`` was when this function was defined.

    Returns:
        List of dicts with keys ``question``, ``answer``, ``source_url`` and
        ``chunk_id`` (1-based chunk position).
    """
    qa_pairs = []
    chunks = split_content(content)

    for i, chunk in enumerate(chunks):
        prompt = f"""
        Generate {questions_per_chunk} question-answer pairs about
        Pittsburgh and CMU, including history, culture, trivia, and upcoming events
        based on this website chunk:
        Website name: {url}, website content: {chunk}

        Format each QA pair as:
        Q: [Question]
        A: [Answer]

        Focus only on information present in this specific chunk.
        Make questions specific and directly answerable from the content.
        For questions with multiple valid answers,
        you can include multiple answers separated by a semicolon (;).
        """

        result = pipe(prompt)[0]['generated_text']
        # Causal "text-generation" pipelines echo the prompt before the completion,
        # seq2seq ("text2text-generation") pipelines return only the completion.
        # Strip the prompt only when it is really there; slicing unconditionally
        # would throw away the beginning of a seq2seq answer.
        if result.startswith(prompt):
            generated_content = result[len(prompt):]
        else:
            generated_content = result

        pairs = []
        # Everything before the first "Q: " is preamble and is skipped.
        for pair in generated_content.split("Q: ")[1:]:
            parts = pair.split("A: ", 1)
            if len(parts) == 2:
                question = parts[0].strip()
                # Guard against a following "Q:" that lacks the trailing space.
                answer = parts[1].split("Q:", 1)[0].strip()
                pairs.append({"question": question, "answer": answer, "source_url": url, "chunk_id": i+1})

        qa_pairs.extend(pairs)

    return qa_pairs


In [None]:
# T5 QA generation on all topics

# Category name -> cleaned-content JSON file. The category key also becomes the
# prefix of the "<category>_qa.json" output written by save_qa_to_json.
json_files = {
    "sport": "cleaned_sport.json",
    "music": "cleaned_music.json",
    "museum": "cleaned_museum.json",
    "food": "cleaned_food.json"
}

def load_json_file(file_path):
    """Open ``file_path`` for reading and return the decoded JSON document."""
    with open(file_path, "r") as handle:
        document = json.load(handle)
    return document

# save qa pairs
def save_qa_to_json(qa_pairs, category):
    """Write ``qa_pairs`` as 4-space-indented JSON to ``<category>_qa.json``."""
    destination = "{}_qa.json".format(category)
    with open(destination, "w") as sink:
        json.dump(qa_pairs, sink, indent=4)

# run t5 on each topic to generate qa
def process_all_categories(generation_pipe=None):
    """Generate and save QA pairs for every category listed in ``json_files``.

    Args:
        generation_pipe: Pipeline passed to ``process_content_in_chunks``.
            When omitted, the notebook-level ``pipe`` is used, which is what
            this function always did before the parameter existed.
            NOTE(review): the original inline comment said "Use pipe_t5" while
            the code passed ``pipe``; call
            ``process_all_categories(generation_pipe=pipe_t5)`` if Flan-T5 is
            the intended generator.

    Side effects:
        Prints progress and writes one ``<category>_qa.json`` file per category.
    """
    if generation_pipe is None:
        # Resolved at call time so the current notebook-level pipeline is used.
        generation_pipe = pipe

    for category, file_path in json_files.items():
        print(f"Processing {category}...")

        urls_contents = load_json_file(file_path)

        qa_pairs = []
        for url, content in urls_contents.items():
            print(f"Processing URL: {url}")
            qa_pairs.extend(process_content_in_chunks(url, content, pipe=generation_pipe))

        save_qa_to_json(qa_pairs, category)
        print(f"QA pairs for {category} saved to {category}_qa.json")

process_all_categories()


In [None]:
import torch
# Release PyTorch's cached GPU memory between the QA-generation and RAG sections.
torch.cuda.empty_cache()


# 3. RAG system

In [None]:
# 3. RAG

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from torch import nn
from sentence_transformers import util
from transformers import pipeline
import re

def load_raw_contents(content_file):
    """Read the JSON file at ``content_file`` and return its decoded contents."""
    with open(content_file, 'r') as handle:
        contents = json.load(handle)
    return contents

# to split text into smaller chunks
def split_text_into_chunks(text, chunk_size=300):
    """Split ``text`` on single spaces and regroup it ``chunk_size`` tokens at a time.

    Returns a list of strings, each the space-joined tokens of one group.
    """
    tokens = text.split(' ')
    pieces = []
    for offset in range(0, len(tokens), chunk_size):
        window = tokens[offset:offset + chunk_size]
        pieces.append(' '.join(window))
    return pieces

def chunk_raw_content(raw_contents, chunk_size=500):
    """Map every URL in ``raw_contents`` to the chunk list of its text.

    Each value is produced by ``split_text_into_chunks(content, chunk_size)``.
    """
    return {
        url: split_text_into_chunks(content, chunk_size)
        for url, content in raw_contents.items()
    }


def encode_chunks(chunked_content, retriever):
    """Embed every chunk and build a parallel list of metadata records.

    For each URL, ``retriever.encode(chunks, convert_to_tensor=True)`` is called
    once and its items are appended in order. The metadata entry at the same
    position holds the chunk's ``url`` and ``chunk_index``.

    Returns:
        ``(embeddings, metadata)`` as two lists.
    """
    vectors = []
    records = []
    for source_url in chunked_content:
        texts = chunked_content[source_url]
        encoded = retriever.encode(texts, convert_to_tensor=True)
        vectors.extend(encoded)
        for position in range(len(texts)):
            records.append({"url": source_url, "chunk_index": position})
    return vectors, records


def search_raw_content(query, chunk_embeddings, retriever, top_k=5,
                       chunk_metadata=None, chunked_content=None):
    """Return the chunks whose embeddings are most similar to ``query``.

    Args:
        query: Question text, embedded with ``retriever.encode``.
        chunk_embeddings: Tensor, or list of tensors that is stacked here.
        retriever: Object exposing ``encode(text, convert_to_tensor=True)``.
        top_k: Maximum number of results; clamped to the number of chunks.
        chunk_metadata: Records with ``url`` and ``chunk_index`` aligned with
            ``chunk_embeddings``. Falls back to the notebook-level
            ``chunk_metadata`` when omitted (the previous, implicit behaviour).
        chunked_content: ``url -> list of chunk texts``. Falls back to the
            notebook-level ``chunked_content`` when omitted.

    Returns:
        List of dicts with ``url``, ``chunk_index`` and ``content``, best first.
    """
    if chunk_metadata is None:
        chunk_metadata = globals()["chunk_metadata"]
    if chunked_content is None:
        chunked_content = globals()["chunked_content"]

    query_embedding = retriever.encode(query, convert_to_tensor=True)

    if not isinstance(chunk_embeddings, torch.Tensor):
        chunk_embeddings = torch.stack(chunk_embeddings)

    query_embedding = query_embedding.unsqueeze(0)

    scores = util.cos_sim(query_embedding, chunk_embeddings)[0]
    # torch.topk raises when k exceeds the number of scores, so clamp it.
    top_results = torch.topk(scores, k=min(top_k, scores.shape[0]))

    results = []
    # .tolist() yields plain ints, which are safe as list indices.
    for idx in top_results.indices.tolist():
        record = chunk_metadata[idx]
        results.append({
            "url": record["url"],
            "chunk_index": record["chunk_index"],
            "content": chunked_content[record["url"]][record["chunk_index"]],
        })
    return results

def generate_answer(model, tokenizer, query, retrieved_info):
    """Prompt the generator with the retrieved chunks and return its completion.

    Args:
        model, tokenizer: Passed to a ``text-generation`` pipeline.
        query: The user question.
        retrieved_info: Iterable of dicts with a ``content`` key.

    Returns:
        The generated text with the prompt prefix removed.
    """
    context_texts = [item["content"] for item in retrieved_info]
    context = "\n\n".join(context_texts)

    prompt = f"""
    Answer the question based on the following relevant information:
    {context}

    If the information is not enough to answer the question, please use your AI wisdom.

    Question: [{query}]

    Provide EXACTLY ONE final answer based on the information. Do not generate multiple answers.
    Please show your answer in the following format with answer texts wrapped in brackets:

    Final answer: [Answer]
    """
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    response = generator(prompt, max_new_tokens=150)[0]['generated_text']
    # The pipeline output starts with the prompt; keep only what was generated.
    return response[len(prompt):]

#   extract the answer from the model's response
def get_final_answer(response):
    """Return the text of the first ``Final answer: [...]`` in ``response``.

    ``generate_answer`` already strips the prompt (and with it the template
    line ``Final answer: [Answer]``), so the first match is the model's answer.
    Indexing the second match raised IndexError whenever the model answered
    exactly once. If no bracketed answer is present, the stripped response is
    returned instead of raising.
    """
    matches = re.findall(r'Final answer: \[([^\]]+)\]', response)
    if matches:
        return matches[0]
    return response.strip()


def run_rag_pipeline(content_file, question, retriever, model):
    """Answer ``question`` from the JSON knowledge file ``content_file``.

    Loads and chunks the file, embeds the chunks, retrieves the closest ones,
    and asks the generator for a final answer.

    NOTE: the tokenizer is taken from the notebook-level ``tokenizer`` variable.
    """
    raw_contents = load_raw_contents(content_file)

    chunked_content = chunk_raw_content(raw_contents)

    chunk_embeddings, chunk_metadata = encode_chunks(chunked_content, retriever)

    # Hand the locally built index to the search explicitly; it used to be read
    # from globals that this function never defines.
    relevant_info = search_raw_content(
        question,
        chunk_embeddings,
        retriever,
        chunk_metadata=chunk_metadata,
        chunked_content=chunked_content,
    )

    response = generate_answer(model, tokenizer, question, relevant_info)

    final_answer = get_final_answer(response)
    return final_answer




In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load LLaMA model and tokenizer
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names that the
# Mistral cell assigned earlier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
# Load retrieval model
# Sentence-embedding model used to encode both the knowledge chunks and the queries.
retriever_name = "all-mpnet-base-v2"
retriever = SentenceTransformer(retriever_name)



In [None]:
# Load knowledge base
import json
# NOTE(review): load_qa_pairs is not defined in the visible part of this notebook,
# and qa_pairs is not read by the RAG call in the next cell — confirm this line is still needed.
qa_pairs = load_qa_pairs('pittsburgh_sports_qa.json')
raw_contents = load_raw_contents('cleaned_sport.json')



In [None]:
## RAG
# Single-question smoke test of the pipeline against the sports knowledge file.
question = "Where can I see the Pittsburgh Steelers?"
final_answer = run_rag_pipeline('cleaned_sport.json', question, retriever, model)
print(final_answer)

# 4. Create knowledge base

In [None]:
import pickle

# 1. merge all json knowledge base

def load_raw_contents(folder_path):
    """Merge every ``*.json`` file directly inside ``folder_path`` into one dict.

    Each file is decoded and folded in with ``dict.update``, so a key seen again
    in a later file overwrites the earlier value. A file that cannot be read,
    parsed, or merged is reported with a printed message and skipped.

    Returns:
        The merged dictionary.
    """
    merged = {}
    candidates = (name for name in os.listdir(folder_path) if name.endswith('.json'))

    for name in candidates:
        full_path = os.path.join(folder_path, name)
        try:
            with open(full_path, 'r') as handle:
                merged.update(json.load(handle))
                print(f"done {name}")
        except Exception as e:
            # Best effort: one bad file must not abort the whole merge.
            print(f"Error loading {full_path}: {e}")

    return merged

# 2. Split one text into smaller chunks
def split_text_into_chunks(text, chunk_size=300):
    """Clean ``text`` with ``clean_data``, then regroup it ``chunk_size`` tokens at a time.

    Tokens are obtained by splitting on single spaces; each returned string is
    the space-joined tokens of one group.
    """
    tokens = clean_data(text).split(' ')
    pieces = []
    for offset in range(0, len(tokens), chunk_size):
        pieces.append(' '.join(tokens[offset:offset + chunk_size]))
    return pieces

# 3. chunk all json
def chunk_raw_content(raw_contents, chunk_size=500):
    """Return ``url -> list of chunks`` for every entry of ``raw_contents``.

    Chunking is delegated to ``split_text_into_chunks(content, chunk_size)``.
    """
    return {
        url: split_text_into_chunks(content, chunk_size)
        for url, content in raw_contents.items()
    }

# 4. embed knowledge base
def encode_chunks(chunked_content, retriever):
    """Embed all chunks and build metadata records that carry the chunk text.

    ``retriever.encode(chunks, convert_to_tensor=True)`` is called once per URL
    and its items are appended in order. The metadata entry at the same position
    stores ``url``, ``chunk_index`` and the chunk ``content`` itself.

    Returns:
        ``(embeddings, metadata)`` as two lists.
    """
    vectors = []
    records = []
    for url in chunked_content:
        texts = chunked_content[url]
        vectors.extend(retriever.encode(texts, convert_to_tensor=True))
        records += [
            {"url": url, "chunk_index": position, "content": text}
            for position, text in enumerate(texts)
        ]
    return vectors, records

# 5. save embedded base
def save_embeddings(chunk_embeddings, chunk_metadata, file_path="embeddings.pkl"):
    """Pickle the embeddings and their metadata into ``file_path``.

    The stored object is a dict with the keys ``embeddings`` and ``metadata``.
    """
    payload = {"embeddings": chunk_embeddings, "metadata": chunk_metadata}
    with open(file_path, 'wb') as sink:
        pickle.dump(payload, sink)

# 6. load base
def load_embeddings(file_path="embeddings.pkl"):
    """Unpickle ``file_path`` and return ``(embeddings, metadata)``.

    The file must hold a dict with the keys ``embeddings`` and ``metadata``.
    Unpickling executes code embedded in the file, so only load files that this
    notebook produced itself.
    """
    with open(file_path, 'rb') as source:
        payload = pickle.load(source)
    embeddings, metadata = payload['embeddings'], payload['metadata']
    return embeddings, metadata

def search_raw_content(query, chunk_embeddings, chunk_metadata, retriever, top_k=5):
    """Return the chunks whose embeddings are most similar to ``query``.

    Args:
        query: Question text, embedded with ``retriever.encode``.
        chunk_embeddings: Tensor, or list of tensors that is stacked here.
        chunk_metadata: Records aligned with ``chunk_embeddings``; each has
            ``url`` and ``chunk_index``, and (as written by ``encode_chunks``)
            the chunk text under ``content``.
        retriever: Object exposing ``encode(text, convert_to_tensor=True)``.
        top_k: Maximum number of results; clamped to the number of chunks.

    Returns:
        List of dicts with ``url``, ``chunk_index`` and ``content``, best first.
    """
    query_embedding = retriever.encode(query, convert_to_tensor=True)

    if not isinstance(chunk_embeddings, torch.Tensor):
        chunk_embeddings = torch.stack(chunk_embeddings)

    query_embedding = query_embedding.unsqueeze(0)

    scores = util.cos_sim(query_embedding, chunk_embeddings)[0]
    # torch.topk raises when k exceeds the number of scores, so clamp it.
    top_results = torch.topk(scores, k=min(top_k, scores.shape[0]))

    results = []
    # .tolist() yields plain ints, which are safe as list indices.
    for idx in top_results.indices.tolist():
        metadata = chunk_metadata[idx]
        url = metadata["url"]
        chunk_index = metadata["chunk_index"]

        # Read the text from the metadata itself so the search also works when
        # the index was restored with load_embeddings and no notebook-level
        # `chunked_content` exists. Older metadata without "content" still
        # falls back to that global.
        if "content" in metadata:
            content = metadata["content"]
        else:
            content = chunked_content[url][chunk_index]

        results.append({
            "url": url,
            "chunk_index": chunk_index,
            "content": content
        })

    return results

def generate_answer(model, tokenizer, query, retrieved_info):
    """Ask the generator to answer ``query`` from the retrieved chunks.

    The ``content`` fields of ``retrieved_info`` are joined with blank lines and
    placed in the prompt. A ``text-generation`` pipeline is built from ``model``
    and ``tokenizer`` and run with ``max_new_tokens=150``.

    Returns:
        The generated text with the first ``len(prompt)`` characters removed.
    """
    context = "\n\n".join(item["content"] for item in retrieved_info)
    prompt = f"""
    Answer the question based on the following relevant information:
    {context}

    If the information is not enough to answer the question, please use your AI wisdom.

    Question: [{query}]

    Provide EXACTLY ONE final answer based on the information.
    BE SURE TO show your answer in the following format with answer texts wrapped in brackets:

    Final answer: [Answer]
    """
    text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    outputs = text_generator(prompt, max_new_tokens=150)
    full_text = outputs[0]['generated_text']
    completion = full_text[len(prompt):]
    return completion

#   extract the answer from the model's response
def get_final_answer(response):
    """Extract the final answer from a model response.

    Preference order:
      1. the text inside the first ``Final answer: [...]`` bracket pair;
      2. the stripped remainder of the first line containing ``Final answer:``;
      3. the response itself, unchanged.
    """
    bracketed = re.search(r'Final answer: \[([^\]]+)\]', response)
    if bracketed is not None:
        return bracketed.group(1)

    loose = re.search(r'Final answer:\s*(.*)', response)
    if loose is not None:
        return loose.group(1).strip()

    return response


def run_rag_pipeline(question, retriever, model, tokenizer, chunk_embeddings, chunk_metadata, top_k=3):
    """Retrieve, generate, and extract the answer for one ``question``.

    Runs ``search_raw_content`` over the prebuilt index, feeds the hits to
    ``generate_answer``, and returns what ``get_final_answer`` extracts.
    """
    hits = search_raw_content(question, chunk_embeddings, chunk_metadata, retriever, top_k=top_k)
    raw_response = generate_answer(model, tokenizer, question, hits)
    return get_final_answer(raw_response)

In [None]:
# merge json and get large knowledge base
import json
import os
# Pickle file for the embedded knowledge base, and the folder whose JSON files are merged into it.
embedding_file = "embeddings.pkl"
folder = 'knowledge_resource/'

In [None]:
###### ONLY NEED RUN ONCE #######
# Merge every JSON file in `folder` into one URL -> text dictionary.
raw_contents = load_raw_contents(folder)


done cleaned_museum.json
done cmu_alumni_events.json
done Pittsburgh.json
done visitpittsburgh_all_combined.json
done cleaned_music.json
done citypaper_events.json
done cleaned_food.json
done pittsburgh_britannica.json
done cleaned_sport.json
done cmu_events.json
done tax_pdf_knowledge.json
done pittsburgh_website_data.json
done visitpittsburgh.json
done pittsburgh_2024_budget.json
done History_of_Pittsburgh.json
done downtown_pittsburgh_events.json
done pittsburgh_events.json
done cmu_about_alltext_with_pdf.json


In [None]:
# raw_contents.keys()

dict_keys(['https://www.cmu.edu/engage/alumni/events/campus/index.html', 'https://www.pghtacofest.com/', 'https://www.pghtacofest.com/about', 'https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/', 'https://bananasplitfest.com/', 'https://www.picklesburgh.com/', 'https://www.picklesburgh.com/visit/', 'https://littleitalydays.com/', 'https://littleitalydays.com/entertainment-schedule/', 'https://bananasplitfest.com/activities/', 'https://bananasplitfest.com/activities/food/', 'https://bananasplitfest.com/events/', 'https://bananasplitfest.com/history/'])

In [None]:
# Chunk every page with chunk_raw_content's default chunk_size.
chunked_content = chunk_raw_content(raw_contents)


In [None]:
sum([len(v) for k,v in chunked_content.items()])



1721

In [None]:

# Embed all chunks; chunk_metadata[i] describes chunk_embeddings[i].
chunk_embeddings, chunk_metadata = encode_chunks(chunked_content, retriever)


In [None]:
# Cache the index on disk so later sessions can call load_embeddings instead of re-encoding.
save_embeddings(chunk_embeddings, chunk_metadata, embedding_file)

## (Separate QA pairs)

In [None]:

def merge_json_files(folder_path, output_file):
    """Merge every ``*.json`` file in ``folder_path`` into a single JSON list.

    Files are processed in sorted filename order so the merged list, and any
    index derived from it, is identical on every machine. The order returned
    by ``os.listdir`` is arbitrary.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.json`` files.
        output_file: Path the merged list is written to, indented by 4 spaces.

    Raises:
        ValueError: If the top-level JSON value of an input file is not a list.
    """
    merged_data = []
    output_abs = os.path.abspath(output_file)

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        # A previous output stored next to the inputs must not be merged back
        # into itself when the function is re-run.
        if os.path.abspath(file_path) == output_abs:
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            # list.extend() on a dict would silently append its keys instead of QA pairs.
            raise ValueError(
                f"{file_path} must contain a JSON list, got {type(data).__name__}"
            )
        merged_data.extend(data)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, indent=4)

In [None]:
# Input directory of per-source QA-pair JSON files and the path of the merged output;
# both are passed to merge_json_files in the next cell.
qa_folder = 'qa_pairs/'
qa_out = 'pittsburgh_qa.json'

In [None]:
# Combine all QA-pair files under qa_folder into the single file qa_out.
merge_json_files(qa_folder, qa_out)

In [None]:
# Split the merged QA pairs into questions.txt (one question per line) and
# reference_answers.json (line index as a string -> reference answer).
with open('pittsburgh_qa.json', 'r') as f:
    qa_data = json.load(f)

reference_answers = {}

# The context manager closes questions.txt even when a QA pair is malformed
# (for example a missing 'question' or 'answer' key raises KeyError mid-loop).
with open('questions.txt', 'w') as questions_file:
    for idx, qa_pair in enumerate(qa_data):
        question = qa_pair['question']
        answer = qa_pair['answer']

        # Keep every question on exactly one line. An embedded line break would
        # shift all later lines away from the indices used in reference_answers.
        # NOTE(review): the plain-text load_questions also drops blank lines, so an
        # empty question would still shift indices — confirm none exist.
        one_line_question = question.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
        questions_file.write(one_line_question + '\n')

        # The answer is keyed by the question's line index.
        reference_answers[str(idx)] = answer

with open('reference_answers.json', 'w') as f:
    json.dump(reference_answers, f, indent=4)

# 5. Experiment

In [None]:
import re

In [None]:
######## NORMALLY ONLY NEED TO LOAD THE BASE ######

# Reload the saved chunk embeddings and metadata rather than re-encoding the knowledge base.
embedding_file = "embeddings.pkl"
chunk_embeddings, chunk_metadata = load_embeddings(embedding_file)

In [None]:
def load_questions(file_path):
    """Read a plain-text file of questions, one per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The lines in file order, each stripped of surrounding whitespace;
        lines that are empty after stripping are omitted.
    """
    questions = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                questions.append(cleaned)
    return questions



In [None]:
!ls data/test

In [None]:

# questions = load_questions("data/test/questions.txt")


In [None]:
import csv

def load_questions(file_path):
    """Read questions from the first column of a UTF-8 CSV file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The stripped first cell of every non-empty row, in file order. No row
        is treated as a header, so a header row would be returned as well.
    """
    with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
        # csv.reader yields an empty list for blank lines; those rows are skipped.
        return [record[0].strip() for record in csv.reader(csv_file) if record]


In [None]:
# Uses the CSV-based load_questions from the previous cell, which shadows the earlier
# plain-text version of the same name.
questions = load_questions('test_set.csv')

## Generate RAG answers for the questions

In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load LLaMA model and tokenizer
# The model is requested with float16 weights; device_map="auto" leaves device
# placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Load retrieval model
# The resulting `retriever` is the object passed to run_rag_pipeline (and to
# encode_chunks when the knowledge base is embedded).
retriever_name = "all-mpnet-base-v2"
retriever = SentenceTransformer(retriever_name)



In [None]:
# Answers keyed by the question's position in `questions`, stored as a string.
results = {}
# NOTE(review): the output below logs "Device set to use cuda:0" once per question,
# which suggests a generation pipeline is rebuilt on every call — confirm and, if so,
# build it once outside the loop.
for i, question in enumerate(questions):
    ans = run_rag_pipeline(question, retriever, model, tokenizer, chunk_embeddings, chunk_metadata)
    results[str(i)] = ans
    # Lightweight progress report every 50 questions.
    if i % 50 == 0:
        print(f"done {i}")


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


done 0


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 50


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 100


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 150


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 200


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 250


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 300


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 350


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 400


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 450


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 500


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

done 550


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Device set to use cu

In [None]:
# Save the generated answers (question index as a string -> answer) as indented JSON.
with open('system_output.json', 'w') as f:
    json.dump(results, f, indent=4)

# Evaluate

In [None]:
def _normalize_answer(text):
    """Lower-case ``text`` and collapse every run of whitespace to one space."""
    return " ".join(text.lower().split())


def _mean_or_zero(values):
    """Return the arithmetic mean of ``values``; 0.0 when ``values`` is empty."""
    return sum(values) / len(values) if values else 0.0


def evaluate_system(system_output, reference_answers,
                    bad_performance_file='bad_performance.json',
                    f1_threshold=0.4, answer_separator=';'):
    """Score system answers against reference answers, question by question.

    For each question the function computes:

    * Exact match: 1 when the normalized system answer (lower-cased, whitespace
      collapsed) equals the normalized reference string or one of its
      ``answer_separator``-separated alternatives, otherwise 0. Equality is used
      instead of substring containment, so partial or empty answers no longer
      count as exact matches.
    * Precision, recall and F1 over the sets of lower-cased whitespace tokens of
      the system answer and of the whole reference string.

    Questions whose F1 is below ``f1_threshold`` are written, with both answers,
    to ``bad_performance_file`` as indented JSON.

    Args:
        system_output: Mapping of question id -> system answer string.
        reference_answers: Mapping of ``str(question id)`` -> reference string.
        bad_performance_file: Path of the JSON file listing low-F1 questions.
        f1_threshold: Questions with F1 strictly below this value are reported.
        answer_separator: Separator between alternative reference answers for
            the exact-match check; pass ``None`` or ``''`` to disable splitting.
            NOTE(review): ';' is an assumption based on the plural
            "reference answers" naming — confirm against the reference file.

    Returns:
        ``(metrics, precision_list, recall_list, f1_list, exact_match_list)``
        where ``metrics`` holds the mean of each list under the keys
        'Precision', 'Recall', 'F1' and 'Exact Match'. All means are 0.0 when
        ``system_output`` is empty.

    Raises:
        KeyError: If a question id has no entry in ``reference_answers``.
    """
    precision_list, recall_list, f1_list, exact_match_list = [], [], [], []
    bad_performance = {}

    for q_id, system_answer in system_output.items():
        ref_answers = reference_answers[str(q_id)]

        # Exact match: compare against the whole reference and each non-empty alternative.
        accepted = {_normalize_answer(ref_answers)}
        if answer_separator:
            accepted.update(
                alt
                for alt in map(_normalize_answer, ref_answers.split(answer_separator))
                if alt
            )
        exact_match = 1 if _normalize_answer(system_answer) in accepted else 0
        exact_match_list.append(exact_match)

        # Precision, recall and F1 over sets of unique tokens (duplicates are ignored).
        system_tokens = set(system_answer.lower().split())
        ref_tokens = set(ref_answers.lower().split())

        true_positives = len(system_tokens & ref_tokens)
        precision = true_positives / len(system_tokens) if system_tokens else 0
        recall = true_positives / len(ref_tokens) if ref_tokens else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

        if f1 < f1_threshold:
            bad_performance[q_id] = {
                'System Output': system_answer,
                'Reference Answers': ref_answers
            }

    with open(bad_performance_file, 'w') as f:
        json.dump(bad_performance, f, indent=4)

    # _mean_or_zero avoids ZeroDivisionError when there is nothing to evaluate.
    metrics = {
        'Precision': _mean_or_zero(precision_list),
        'Recall': _mean_or_zero(recall_list),
        'F1': _mean_or_zero(f1_list),
        'Exact Match': _mean_or_zero(exact_match_list)
    }

    return metrics, precision_list, recall_list, f1_list, exact_match_list


def statistical_significance_test(system_scores, reference_scores):
    """Run a related-samples t-test (``scipy.stats.ttest_rel``) on two score lists.

    Args:
        system_scores: Per-item scores of the first system.
        reference_scores: Per-item scores to compare against; must have the
            same length as ``system_scores`` because the test pairs the items.

    Returns:
        ``(t_stat, p_value)`` as reported by ``ttest_rel``.
    """
    # Imported locally because no cell in the visible part of the notebook imports
    # ttest_rel, so the call would raise NameError on a fresh kernel.
    from scipy.stats import ttest_rel

    t_stat, p_value = ttest_rel(system_scores, reference_scores)
    return t_stat, p_value


def load_reference_answers(file_path):
    """Parse the JSON file at ``file_path`` and return its contents.

    Args:
        file_path: Path of the JSON file holding the reference answers.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.
    """
    with open(file_path, 'r') as handle:
        answers = json.load(handle)
    return answers

In [None]:
# Replaces the reference_answers dict built earlier while splitting the QA pairs.
# NOTE(review): the questions were loaded from 'test_set.csv'; confirm these references
# are index-aligned with that file.
reference_answers = load_reference_answers("data/test/reference_answers.json")

In [None]:
# Score the generated answers; besides the aggregate metrics, the per-question lists
# are kept for the significance comparison below.
metrics, precision_list, recall_list, f1_list, exact_match_list = evaluate_system(results, reference_answers)

In [None]:
from scipy.stats import ttest_ind
def compare_models(metric_list_1, metric_list_2):
    """Return the p-value of ``scipy.stats.ttest_ind`` on two score lists.

    Args:
        metric_list_1: Scores of the first model.
        metric_list_2: Scores of the second model.

    Returns:
        The p-value reported by ``ttest_ind`` with its default settings.
    """
    # NOTE(review): ttest_ind treats the two lists as independent samples; if both hold
    # scores for the same questions, a paired test may be the better fit — confirm.
    outcome = ttest_ind(metric_list_1, metric_list_2)
    return outcome.pvalue



In [None]:
# NOTE(review): f1_list2 is not defined anywhere in this part of the notebook; it presumably
# holds the per-question F1 scores of a second system run — confirm it exists on a fresh kernel.
compare_models(f1_list, f1_list2)