In [15]:
from bs4 import BeautifulSoup
import pandas as pd
import re

# Mix/version descriptors recognised at the end of a raw title.
# Order matters: the first matching term wins, so multi-word terms are listed
# before the single words they end with ("Radio Edit" before "Edit",
# "Original Mix" before "Mix").
MIX_TERMS = [
    "Original Mix", "Extended Mix", "Remix", "Dub Mix", "Version", "Radio Edit",
    "Edit", "Mix"
]

def normalize_title(raw):
    """Insert " - " between a track name and its trailing mix descriptor.

    The title is stripped, then checked against each entry of ``MIX_TERMS``
    (case-insensitively, in list order). A term only counts when it is a
    separate trailing word, i.e. it is preceded by whitespace. This keeps
    titles such as "Credit" or "Inversion" intact and leaves a title that
    consists solely of a term (e.g. "Remix") unchanged.

    Parameters
    ----------
    raw : str
        Title text, e.g. "Old School Original Mix".

    Returns
    -------
    str
        "<name> - <mix>" when a descriptor is found (the descriptor keeps the
        casing it had in ``raw``), otherwise the stripped title.
    """
    raw = raw.strip()
    for term in MIX_TERMS:
        # Match on the original string (not a lowercased copy) so the slice
        # offsets are always valid for `raw`. Because `raw` is stripped, the
        # required leading whitespace guarantees a non-empty name part.
        match = re.search(r"\s+(" + re.escape(term) + r")$", raw, re.IGNORECASE)
        if match:
            main = raw[:match.start()]
            mix = match.group(1)
            return f"{main} - {mix}"
    return raw

# Column order of the frame returned by scrape_beatport_chart.
_TRACK_COLUMNS = [
    "title", "artists", "label", "genre", "bpm", "key", "release_date", "price"
]


def _text_or_none(tag, separator=""):
    """Return the stripped text of ``tag``, or None when no tag was found."""
    return tag.get_text(separator, strip=True) if tag is not None else None


def _parse_bpm_key(text):
    """Split the text of the BPM cell into ``(bpm, key)``.

    ``bpm`` is the integer in front of "BPM", or None when that marker is
    absent. ``key`` is whatever follows the last "-" *after* the BPM marker,
    so a hyphen earlier in the cell text (the genre link text is part of it)
    is not mistaken for the key separator. Without a BPM marker the whole
    text is searched for the last "-". An empty key is reported as None.
    """
    bpm_match = re.search(r"(\d+)\s*BPM", text)
    if bpm_match:
        bpm = int(bpm_match.group(1))
        tail = text[bpm_match.end():]
    else:
        bpm = None
        tail = text

    key = tail.rsplit("-", 1)[-1].strip() if "-" in tail else None
    return bpm, (key or None)


def scrape_beatport_chart(html_path):
    """Parse a saved chart page into a DataFrame with one row per track.

    Parameters
    ----------
    html_path : str
        Path to a UTF-8 encoded HTML file; it is parsed with the "lxml"
        BeautifulSoup parser.

    Returns
    -------
    pandas.DataFrame
        Columns: title, artists (list of str), label, genre, bpm (int or
        None), key, release_date, price. Fields whose element is missing in
        a row are None (``artists`` is an empty list). The columns are
        present even when the page contains no track rows.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")

    tracks = []
    for row in soup.select("div[data-testid='tracks-table-row']"):
        # --- TITLE ---
        raw_title = _text_or_none(row.select_one("span[class*='ReleaseName']"), " ")
        title = normalize_title(raw_title) if raw_title else None

        # --- ARTISTS ---
        artists = [
            a.get_text(strip=True)
            for a in row.select("div[class*='ArtistNames'] a")
        ]

        # --- GENRE / BPM / KEY --- (all three live in the same cell)
        genre = bpm = key = None
        bpm_cell = row.select_one("div.cell.bpm")
        if bpm_cell is not None:
            genre = _text_or_none(bpm_cell.select_one("a"))
            bpm, key = _parse_bpm_key(bpm_cell.get_text(" ", strip=True))

        tracks.append({
            "title": title,
            "artists": artists,
            "label": _text_or_none(row.select_one("div.cell.label a")),
            "genre": genre,
            "bpm": bpm,
            "key": key,
            "release_date": _text_or_none(row.select_one("div.cell.date")),
            "price": _text_or_none(row.select_one("button[class*='PriceButton']")),
        })

    # Explicit columns keep the schema stable when no rows were matched.
    return pd.DataFrame(tracks, columns=_TRACK_COLUMNS)

In [17]:
import requests

url = "https://www.beatport.com/chart/after-hour-essentials-2025-techno-rdh/841057"

# requests has no default timeout, so set one to keep the cell from hanging.
# raise_for_status() stops an HTTP error page from being saved as if it were
# the chart.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

with open("beatport_chart_live.html", "w", encoding="utf-8") as f:
    f.write(html)

In [19]:
# Parse the sample page. Note: "test_beatport.html" is not written by any cell
# shown in this notebook, so it must already exist in the working directory.
df = scrape_beatport_chart(html_path="test_beatport.html")
df.head()

Unnamed: 0,title,artists,label,genre,bpm,key,release_date,price
0,Old School - Original Mix,[Hertz],Sway,Techno (Raw / Deep / Hypnotic),142,Gb Minor,2025-10-10,€1.39
1,Captcha - Original Mix,[Glaskin],CROWD (DE),Techno (Raw / Deep / Hypnotic),140,G Major,2025-10-09,€1.39
2,Signal to Noise - Original Mix,[Efdemin],Ostgut Ton,Techno (Raw / Deep / Hypnotic),136,Bb Minor,2025-10-07,€1.39
3,Forge Ritual - Original Mix,[Procombo],Suara,Techno (Raw / Deep / Hypnotic),142,Gb Major,2025-10-17,€1.39
4,Second Skin - Original Mix,[Phara],Token,Techno (Raw / Deep / Hypnotic),138,F Minor,2025-10-10,€1.39


In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

url = "https://www.beatport.com/chart/after-hours-essentials-2025-trance-rdh/840953"

options = Options()
options.add_argument("--headless")
# NOTE(review): presumably set to make the automated browser less detectable.
options.add_argument("--disable-blink-features=AutomationControlled")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# try/finally guarantees the browser is shut down even if loading the page
# or reading its source raises.
try:
    driver.get(url)
    time.sleep(5)  # let JS load
    html = driver.page_source
finally:
    driver.quit()

with open("beatport_chart_live.html", "w", encoding="utf-8") as f:
    f.write(html)

In [24]:
# Parse the page snapshot saved as "beatport_chart_live.html" and preview it.
df = scrape_beatport_chart(html_path="beatport_chart_live.html")
df.head()

Unnamed: 0,title,artists,label,genre,bpm,key,release_date,price
0,Ecstatic Pulse - Original Mix,[Pascal Hetzel],Metroplex,Trance (Raw / Deep / Hypnotic)|Raw Trance,141,F Minor,2025-07-18,€1.39
1,Unseen Between a Million Eyes - Original Mix,[Nomas],JOOF Recordings,Trance (Raw / Deep / Hypnotic)|Raw Trance,138,F Minor,2025-07-04,€1.39
2,Planet Unreal - Original Mix,[Bliss Inc],MAGICWIRE,Trance (Raw / Deep / Hypnotic)|Raw Trance,138,G Major,2025-06-20,€1.39
3,Choose Life - Original Mix,[Sohl],Record Union,Trance (Raw / Deep / Hypnotic),70,Bb Minor,2025-06-30,€1.39
4,Arrhythmia - Original Mix,"[Body Clinic, MITMISCHEN, Wigs]",MITMISCHEN,Trance (Raw / Deep / Hypnotic)|Raw Trance,138,F Major,2025-07-04,€1.39
