# Scraping serbian epic folk songs

The data is available on Wikipedia.

In [1]:
base_wiki_url = "https://sr.wikisource.org/wiki/%D0%A1%D1%80%D0%BF%D1%81%D0%BA%D0%B5_%D0%B5%D0%BF%D1%81%D0%BA%D0%B5_%D0%BD%D0%B0%D1%80%D0%BE%D0%B4%D0%BD%D0%B5_%D0%BF%D0%B5%D1%81%D0%BC%D0%B5"

## Firecrawl
The easiest way to scrape...

In [35]:
from dotenv import load_dotenv
import os

load_dotenv()

assert os.getenv("FIRECRAWL_API_KEY") is not None, "FIRECRAWL_API_KEY is not set"
assert os.getenv("GEMINI_API_KEY") is not None, "GEMINI_API_KEY is not set" # needed for later steps


In [12]:
from firecrawl import AsyncFirecrawlApp
from pydantic import BaseModel
import os

class Pesma(BaseModel):
    naziv: str
    url: str

class Ciklus(BaseModel):
    naziv_ciklusa: str
    pesme: list[Pesma]

class Ciklusi(BaseModel):
    ciklus: list[Ciklus]

async def scrape_ciklusi(url:str) -> Ciklusi:
    app = AsyncFirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
    response = await app.extract(
        urls=[
            url
        ],
        prompt="""This is a Wikipedia page containing the lists about Serbian folk songs.

Extract the names of cycles, and the songs that belong to that cycle, and the URL that leads to every one of those songs.""",
        schema=Ciklusi.model_json_schema()
    )
    
    return response


In [14]:
firecrawl_result = await scrape_ciklusi(base_wiki_url)

Save the scrape result.

In [None]:
import json

os.makedirs("./data", exist_ok=True)
with open("./data/ciklusi_scrape.json", "w") as f:
    json.dump(firecrawl_result.model_dump(), f, indent=4, ensure_ascii=False)

Let's check if the links are working

In [25]:
def check_links_status(firecrawl_data, timeout=10, delay=0.5):
    """
    Check if all links from firecrawl_result are working.
    
    Args:
        firecrawl_data: The firecrawl result data structure
        timeout: Request timeout in seconds (default: 10)
        delay: Delay between requests in seconds (default: 0.5)
    
    Returns:
        dict: Summary of link checking results
    """
    import time
    from urllib.parse import urlparse
    
    results = {
        'total_links': 0,
        'working_links': 0,
        'broken_links': 0,
        'failed_links': [],
        'cycles_status': []
    }
    
    # Extract cycles from the data structure
    cycles = firecrawl_data[0]['ciklus'] if firecrawl_data else []
    
    for cycle in cycles:
        cycle_name = cycle.get('naziv_ciklusa', 'Unknown Cycle')
        cycle_results = {
            'name': cycle_name,
            'total': 0,
            'working': 0,
            'broken': 0,
            'failed_songs': []
        }
        
        print(f"\nChecking cycle: {cycle_name}")
        print("-" * 50)
        
        songs = cycle.get('pesme', [])
        cycle_results['total'] = len(songs)
        results['total_links'] += len(songs)
        
        for i, song in enumerate(songs, 1):
            song_name = song.get('naziv', 'Unknown Song')
            song_url = song.get('url', '')
            
            if not song_url:
                print(f"  {i:2d}. ❌ {song_name} - No URL provided")
                cycle_results['broken'] += 1
                cycle_results['failed_songs'].append({
                    'name': song_name,
                    'url': song_url,
                    'error': 'No URL provided'
                })
                continue
            
            try:
                # Make request with timeout
                response = requests.get(song_url, timeout=timeout, allow_redirects=True)
                
                if response.status_code == 200:
                    print(f"  {i:2d}. ✅ {song_name}")
                    cycle_results['working'] += 1
                    results['working_links'] += 1
                else:
                    print(f"  {i:2d}. ❌ {song_name} - Status: {response.status_code}")
                    cycle_results['broken'] += 1
                    results['broken_links'] += 1
                    cycle_results['failed_songs'].append({
                        'name': song_name,
                        'url': song_url,
                        'error': f'HTTP {response.status_code}'
                    })
                    results['failed_links'].append({
                        'cycle': cycle_name,
                        'song': song_name,
                        'url': song_url,
                        'error': f'HTTP {response.status_code}'
                    })
                
            except requests.exceptions.Timeout:
                print(f"  {i:2d}. ⏰ {song_name} - Timeout")
                cycle_results['broken'] += 1
                results['broken_links'] += 1
                error_msg = f'Timeout after {timeout}s'
                cycle_results['failed_songs'].append({
                    'name': song_name,
                    'url': song_url,
                    'error': error_msg
                })
                results['failed_links'].append({
                    'cycle': cycle_name,
                    'song': song_name,
                    'url': song_url,
                    'error': error_msg
                })
                
            except requests.exceptions.RequestException as e:
                print(f"  {i:2d}. ❌ {song_name} - Error: {str(e)}")
                cycle_results['broken'] += 1
                results['broken_links'] += 1
                error_msg = str(e)
                cycle_results['failed_songs'].append({
                    'name': song_name,
                    'url': song_url,
                    'error': error_msg
                })
                results['failed_links'].append({
                    'cycle': cycle_name,
                    'song': song_name,
                    'url': song_url,
                    'error': error_msg
                })
            
            # Add delay between requests to be respectful
            if delay > 0:
                time.sleep(delay)
        
        results['cycles_status'].append(cycle_results)
        
        # Print cycle summary
        success_rate = (cycle_results['working'] / cycle_results['total'] * 100) if cycle_results['total'] > 0 else 0
        print(f"Cycle summary: {cycle_results['working']}/{cycle_results['total']} working ({success_rate:.1f}%)")
    
    return results


In [26]:
# Check all links from firecrawl_result
print("🔗 Checking all links from firecrawl_result...")
print("=" * 60)

# Run the link checking
link_check_results = check_links_status(firecrawl_result, timeout=10, delay=0.3)

# Print overall summary
print("\n" + "=" * 60)
print("📊 OVERALL SUMMARY")
print("=" * 60)
print(f"Total links checked: {link_check_results['total_links']}")
print(f"Working links: {link_check_results['working_links']} ✅")
print(f"Broken/Failed links: {link_check_results['broken_links']} ❌")

if link_check_results['total_links'] > 0:
    success_rate = (link_check_results['working_links'] / link_check_results['total_links']) * 100
    print(f"Success rate: {success_rate:.1f}%")
    
    if link_check_results['broken_links'] > 0:
        print(f"\n⚠️  Found {link_check_results['broken_links']} broken links:")
        for i, failed_link in enumerate(link_check_results['failed_links'][:10], 1):  # Show first 10
            print(f"  {i:2d}. {failed_link['song']} ({failed_link['cycle']})")
            print(f"      URL: {failed_link['url']}")
            print(f"      Error: {failed_link['error']}")
            print()
        
        if len(link_check_results['failed_links']) > 10:
            print(f"  ... and {len(link_check_results['failed_links']) - 10} more failed links")
else:
    print("No links found to check!")


🔗 Checking all links from firecrawl_result...

Checking cycle: Неисторијски циклус
--------------------------------------------------
   1. ✅ Бог ником дужан не остаје
   2. ✅ Ко крсно име слави, оном и помаже
   3. ✅ Опет то, али друкчије
   4. ✅ Свети Никола
   5. ✅ Ђакон Стефан и два анђела
   6. ✅ Заручница Лаза Радановића
   7. ✅ Кумовање Грчића Манојла
   8. ✅ Дунав се Савом оженио
Cycle summary: 8/8 working (100.0%)

Checking cycle: Преткосовски циклус
--------------------------------------------------
   1. ✅ Бан Милутин и Дука Херцеговац
   2. ✅ Бановић Страхиња
   3. ✅ Дарови светог Јована Владимира
   4. ✅ Дијете Јован и ћерка цара Стефана
   5. ✅ Душан хоће сестру да узме
   6. ✅ Женидба Душанова
   7. ✅ Женидба кнеза Лазара
   8. ✅ Женидба краља Вукашина
   9. ✅ Зидање Раванице
  10. ✅ Зидање Раванице, опет
  11. ✅ Зидање Скадра
  12. ✅ Како се крсно име служи
  13. ✅ Краљ Владимир и Свети Наум
  14. ✅ Милан-бег и Драгутин-бег
  15. ✅ Милош у Латинима
  16. ✅ Наход Момир
 

In [27]:
# Save link check results to JSON file
with open('./data/link_check_results.json', 'w', encoding='utf-8') as f:
    json.dump(link_check_results, f, ensure_ascii=False, indent=2)

print("💾 Link check results saved to './data/link_check_results.json'")

# Create a summary report by cycle
print("\n📋 DETAILED CYCLE ANALYSIS")
print("=" * 60)

for cycle_status in link_check_results['cycles_status']:
    name = cycle_status['name']
    total = cycle_status['total']
    working = cycle_status['working']
    broken = cycle_status['broken']
    
    success_rate = (working / total * 100) if total > 0 else 0
    
    status_icon = "✅" if broken == 0 else "⚠️" if success_rate >= 80 else "❌"
    
    print(f"{status_icon} {name}")
    print(f"   Working: {working}/{total} ({success_rate:.1f}%)")
    
    if cycle_status['failed_songs']:
        print(f"   Failed songs:")
        for failed_song in cycle_status['failed_songs'][:3]:  # Show first 3 failed songs
            print(f"     - {failed_song['name']}: {failed_song['error']}")
        if len(cycle_status['failed_songs']) > 3:
            print(f"     ... and {len(cycle_status['failed_songs']) - 3} more")
    print()

# Function to check specific cycle
def check_cycle_links(cycle_name, firecrawl_data):
    """Check links for a specific cycle only."""
    cycles = firecrawl_data[0]['ciklus'] if firecrawl_data else []
    
    for cycle in cycles:
        if cycle.get('naziv_ciklusa') == cycle_name:
            single_cycle_data = [{'ciklus': [cycle]}]
            return check_links_status(single_cycle_data)
    
    print(f"Cycle '{cycle_name}' not found!")
    return None

print("💡 TIP: You can check a specific cycle using:")
print("   check_cycle_links('Косовски циклус', firecrawl_result)")


💾 Link check results saved to './data/link_check_results.json'

📋 DETAILED CYCLE ANALYSIS
✅ Неисторијски циклус
   Working: 8/8 (100.0%)

✅ Преткосовски циклус
   Working: 28/28 (100.0%)

✅ Косовски циклус
   Working: 19/19 (100.0%)

✅ Песме о Марку Краљевићу
   Working: 29/29 (100.0%)

✅ Покосовски циклус
   Working: 23/23 (100.0%)

✅ Хајдучке и ускочке песме
   Working: 9/9 (100.0%)

✅ Песме о ослобођењу Србије и Црне Горе
   Working: 17/17 (100.0%)

✅ Песме народно-ослободилачке борбе
   Working: 5/5 (100.0%)

✅ Неразврстане
   Working: 7/7 (100.0%)

💡 TIP: You can check a specific cycle using:
   check_cycle_links('Косовски циклус', firecrawl_result)


Firecrawl's `extract` endpoint has a free limit of 500k/year tokens... And the first scrape took 67k tokens.
Instead of Firecrawl, I'll use Gemini to extract the poem from each page.

In [51]:
import os
from google import genai
from google.genai import types
from tenacity import retry, stop_after_attempt, wait_exponential

GEMINI_CLIENT= genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=15))
async def extract_poem(url:str, model="gemini-2.5-pro-preview-06-05") -> str:
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=f"""Look at this webpage: {url}
and output the poem's text. DON'T OUTPUT ANYTHING ELSE.
"""),
            ],
        ),

    ]
    tools = [
        types.Tool(url_context=types.UrlContext()),
    ]
    generate_content_config = types.GenerateContentConfig(
        tools=tools,
        response_mime_type="text/plain",
    )

    response = await GEMINI_CLIENT.aio.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

    text = response.text
    if text == "" or not text:
        raise ValueError(f"No text found on the page: {url}")
    return text

### Extract songs from each link

Use Firecrawl to get HTML, then use gemini to extract the poem.

In [1]:
from firecrawl import FirecrawlApp
import os
from google import genai
from google.genai import types
from tenacity import retry, stop_after_attempt, wait_exponential
from dotenv import load_dotenv
load_dotenv(override=True)

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=4, min=1, max=60))
def get_html_from_url(url:str) -> str:
    firecrawl_app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
    data = firecrawl_app.scrape_url(
        url,
        formats=["html"],
        only_main_content=True,
    )  
    html= data.html
    return html

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=4, min=60, max=60*3))
def get_poem_from_html(html:str) -> str:
    gemini_client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    poem_extraction_prompt = """
    You are given a HTML page that contains a Serbian song.
    Your task is to extract the song from the HTML.
    ONLY OUTPUT THE LYRICS, NO OTHER TEXT.
    DON'T OUTPUT ANYTHING ELSE.
    Keep the lyrics in the original language.
    One verse per line.
    """

    response = gemini_client.models.generate_content(
        model="gemini-2.5-flash",
        contents=f"{poem_extraction_prompt}\n\n{html}",
        config=types.GenerateContentConfig(
            response_mime_type="text/plain",
            temperature=0.1
        )
    )
    
    return response.text

Glue it together

In [3]:
def get_poem_from_url(url:str) -> str:
    html = get_html_from_url(url)
    return get_poem_from_html(html)

Now let's loop through all the urls and extract the poems.

In [6]:
import json
with open("data/ciklusi_scrape.json", "r") as f:
    ciklusi = json.load(f)

urls = []
for ciklus in ciklusi[0]["ciklus"]:
    for pesma in ciklus["pesme"]:
        urls.append(pesma["url"])

print(f"Found {len(urls)} urls")

Found 145 urls


In [7]:
from fastcore.parallel import parallel
tekstovi_pesama = parallel(get_poem_from_url, urls, n_workers=2, threadpool=True, progress=True)

In [10]:
from datetime import datetime

bad = 0
with open(f'./data/epske_pesme_hibrid_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.txt', 'w') as f:
    for tekst, url in zip(tekstovi_pesama, urls):
        if not tekst:
            bad += 1
            print(f"Bad extraction for {url}")
            output_file_path = os.path.join("./data/failed_hybrid", f"{bad}.txt")
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
            with open(output_file_path, 'w') as ff:
                ff.write(url + '\n')
            continue
        
        f.write(tekst + '\n')

Bad extraction for https://sr.wikisource.org/wiki/%D0%9A%D0%BE_%D0%BA%D1%80%D1%81%D0%BD%D0%BE_%D0%B8%D0%BC%D0%B5_%D1%81%D0%BB%D0%B0%D0%B2%D0%B8,_%D0%BE%D0%BD%D0%BE%D0%BC_%D0%B8_%D0%BF%D0%BE%D0%BC%D0%B0%D0%B6%D0%B5
Bad extraction for https://sr.wikisource.org/wiki/%D0%82%D0%B0%D0%BA%D0%BE%D0%BD_%D0%A1%D1%82%D0%B5%D1%84%D0%B0%D0%BD_%D0%B8_%D0%B4%D0%B2%D0%B0_%D0%B0%D0%BD%D1%92%D0%B5%D0%BB%D0%B0
Bad extraction for https://sr.wikisource.org/wiki/%D0%9A%D1%83%D0%BC%D0%BE%D0%B2%D0%B0%D1%9A%D0%B5_%D0%93%D1%80%D1%87%D0%B8%D1%9B%D0%B0_%D0%9C%D0%B0%D0%BD%D0%BE%D1%98%D0%BB%D0%B0
Bad extraction for https://sr.wikisource.org/wiki/%D0%94%D1%83%D0%BD%D0%B0%D0%B2_%D1%81%D0%B5_%D0%A1%D0%B0%D0%B2%D0%BE%D0%BC_%D0%BE%D0%B6%D0%B5%D0%BD%D0%B8%D0%BE
Bad extraction for https://sr.wikisource.org/wiki/%D0%91%D0%B0%D0%BD_%D0%9C%D0%B8%D0%BB%D1%83%D1%82%D0%B8%D0%BD_%D0%B8_%D0%94%D1%83%D0%BA%D0%B0_%D0%A5%D0%B5%D1%80%D1%86%D0%B5%D0%B3%D0%BE%D0%B2%D0%B0%D1%86
Bad extraction for https://sr.wikisource.org/wiki/%D0%94%D0

Those were added manually to `sve_srpske_epske_pesme.txt`

## Comparison with the Tiny Shakespeare dataset

Let's count how many characters the Tiny Shakespeare dataset contains.

In [2]:
tiny_shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

import requests
response = requests.get(tiny_shakespeare_url)
tiny_shakespeare_text = response.text

In [8]:
with open("data_final/sve_srpske_epske_pesme.txt", "r") as f:
    tekst_spojen = f.read()

In [5]:
def count_lines(text:str) -> int:
    lines = text.split("\n")
    lines = [line for line in lines if line.strip() != ""]
    return len(lines)

def count_chars(text:str) -> int:
    return len(text)

In [9]:
ts_lines = count_lines(tiny_shakespeare_text)
ts_chars = count_chars(tiny_shakespeare_text)

pesme_lines = count_lines(tekst_spojen)
pesme_chars = count_chars(tekst_spojen)

print(f"Tiny Shakespeare dataset contains {ts_lines} lines and {ts_chars} characters")
print(f"Sve srpske epske pesme dataset contains {pesme_lines} lines and {pesme_chars} characters")

# Calculate the size difference
ts_to_pesme_ratio = ts_chars / pesme_chars
pesme_to_ts_ratio = pesme_chars / ts_chars

print(f"\nSize comparison:")
print(f"Tiny Shakespeare is {ts_to_pesme_ratio:.2%} the size of Sve srpske epske pesme")
print(f"Sve srpske epske pesme is {pesme_to_ts_ratio:.1f}x larger than Tiny Shakespeare")

# Calculate absolute difference
char_difference = pesme_chars - ts_chars
print(f"Absolute difference: {char_difference:,} characters")


Tiny Shakespeare dataset contains 32777 lines and 1115394 characters
Sve srpske epske pesme dataset contains 26047 lines and 737054 characters

Size comparison:
Tiny Shakespeare is 151.33% the size of Sve srpske epske pesme
Sve srpske epske pesme is 0.7x larger than Tiny Shakespeare
Absolute difference: -378,340 characters
