In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# URL of the webpage (Spotify Daily charts Viet Nam)
url = "https://kworb.net/spotify/country/vn_daily.html"

# Send a GET request to fetch the HTML content.
# A timeout is passed explicitly: without one, requests waits indefinitely
# and a stalled connection would hang the script.
try:
    response = requests.get(url, timeout=30)
except requests.RequestException as exc:
    print(f"Failed to retrieve page: {exc}")
    raise SystemExit(1) from exc
response.encoding = 'utf-8'  # Ensure the content is decoded in UTF-8
if response.status_code != 200:
    print(f"Failed to retrieve page. Status code: {response.status_code}")
    # SystemExit(1) replaces the interactive-only exit() helper and reports
    # the failure through a non-zero exit status.
    raise SystemExit(1)

html_content = response.text

# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Find the table containing the data (the first <table> on the page)
table = soup.find('table')
if not table:
    print("Table not found on the page. Please check the page structure.")
    raise SystemExit(1)

# Collected rows, each a tuple of (order, name, artist, track_id)
data = []

# Running counter used as the numerical order of accepted rows
num_order = 1

# Iterate through each row in the table (skipping the header row)
for row in table.find_all('tr')[1:]:
    columns = row.find_all('td')

    # Only rows with at least 7 cells are processed
    if len(columns) < 7:
        continue

    # The third cell (columns[2]) holds the "Artist - Title" text and links
    title_cell = columns[2]
    name_artist = title_cell.text.strip()
    print(f"Raw name_artist: '{name_artist}'")  # Debugging line to see raw name_artist

    # Check if there is a ' - ' separating artist and track
    if " - " not in name_artist:
        print(f"Skipping row due to missing ' - ': {name_artist}")
        continue

    # Split on the FIRST ' - ' so that a title which itself contains ' - '
    # (e.g. "Artist - Song - Live") stays intact; a greedy regex would split
    # on the last separator and push part of the title into the artist.
    # Because the text is stripped and contains ' - ', both halves are
    # guaranteed to be non-empty.
    artist, _sep, name = name_artist.partition(" - ")
    artist, name = artist.strip(), name.strip()

    # Extract track URL: the second <a> in the cell is used as the track link
    link_tags = title_cell.find_all('a')
    track_url = link_tags[1].get('href') if len(link_tags) >= 2 else None
    if not track_url:
        print(f"Skipping row due to missing track URL: {row}")
        continue

    # The ID is the last path segment of the URL without its file extension
    track_id = track_url.split('/')[-1].split('.')[0]

    # Append data as a tuple including numerical order
    data.append((num_order, name, artist, track_id))
    num_order += 1

# Destination path of the generated CSV file
csv_file = 'spotify_data_vn_daily_chart.csv'

# Emit the header row followed by every collected data row in one call
header = ['Order', 'Name', 'Artist', 'ID']
with open(csv_file, mode='w', newline='', encoding='utf-8') as out_file:
    csv.writer(out_file).writerows([header, *data])

print(f"CSV file '{csv_file}' with ordered data has been created successfully.")


Raw name_artist: 'Jung Kook - Seven (w/ Latto)'
Raw name_artist: 'Jimin - Who'
Raw name_artist: 'ROSÉ - number one girl'
Raw name_artist: 'Jin - Running Wild'
Raw name_artist: 'Jung Kook - 3D (w/ Jack Harlow)'
Raw name_artist: 'HIEUTHUHAI - Exit Sign (w/ marzuz)'
Raw name_artist: 'Dương Domic - Mất Kết Nối'
Raw name_artist: 'Jung Kook - Standing Next to You'
Raw name_artist: 'Vũ. - Bình Yên (w/ Binz)'
Raw name_artist: 'HIEUTHUHAI - Không Thể Say'
Raw name_artist: 'tlinh - PHÓNG ZÌN ZÌN (w/ Low G)'
Raw name_artist: 'ROSÉ - APT. (w/ Bruno Mars)'
Raw name_artist: 'MONO - Chăm Hoa'
Raw name_artist: 'Puppy - Wrong Times (w/ Dangrangto)'
Raw name_artist: 'ANH TRAI "SAY HI" - TRÀN BỘ NHỚ (w/ Dương Domic)'
Raw name_artist: 'Lady Gaga - Die With A Smile (w/ Bruno Mars)'
Raw name_artist: 'Da LAB - Bầu Trời Mới (w/ Minh Tốc & Lam)'
Raw name_artist: 'Sơn Tùng M-TP - Đừng Làm Trái Tim Anh Đau'
Raw name_artist: 'CoolKid - Sau Cơn Mưa (w/ RHYDER)'
Raw name_artist: 'MANBO - Hẹn Gặp Em Dưới Ánh Trăng (