# This notebook is for one-time use only; it exists because downloading of the songs did not work from the beginning.

In [1]:
import csv
import psycopg2
from psycopg2 import sql
from datetime import datetime
import logging
import os
from dotenv import load_dotenv

In [2]:
# Load variables from a .env file into the environment before they are read below.
load_dotenv()

# --- Database Connection Parameters ---
# Only the database name and the port have fallbacks; the other settings are
# None when the corresponding environment variable is not set.
DB_NAME = os.environ.get("DB_NAME", "database-instance")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")
DB_HOST = os.environ.get("DB_HOST")
DB_PORT = os.environ.get("DB_PORT", "5432")

# --- CSV File Path ---
# Relative path to the CSV that carries the youtube_title / youtube_url columns.
CSV_FILE_PATH = "../data/top_songs_curated.csv"

In [3]:
# Root logger setup: INFO threshold, lines shaped as "<time> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [4]:
def get_db_connection():
    """Open a database connection from the module-level DB_* settings.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        psycopg2.OperationalError: Logged and re-raised when the connection
            attempt fails.
    """
    logging.info(
        f"Attempting to connect to database '{DB_NAME}' on {DB_HOST}:{DB_PORT}..."
    )
    connect_kwargs = {
        "dbname": DB_NAME,
        "user": DB_USER,
        "password": DB_PASSWORD,
        "host": DB_HOST,
        "port": DB_PORT,
    }
    try:
        connection = psycopg2.connect(**connect_kwargs)
    except psycopg2.OperationalError as exc:
        logging.error(f"Database connection failed: {exc}")
        raise

    logging.info("Successfully connected to the database.")
    return connection

In [5]:
def _clean_csv_value(row, column):
    """Return the raw value of ``column`` in ``row``, or None if missing or blank.

    ``csv.DictReader`` fills columns that are absent from a short row with
    ``None``, so the value is checked before ``strip()`` is called on it.
    """
    value = row.get(column)
    if value is None or not value.strip():
        return None
    return value


def _update_track(cursor, row_num, track_uri, youtube_title, youtube_url):
    """Run the UPDATE for one CSV row inside its own savepoint.

    Args:
        cursor: Open database cursor on the connection being updated.
        row_num (int): 1-based CSV data row number, used in log messages.
        track_uri (str): Value matched against ``tracks.original_track_uri``.
        youtube_title (str | None): New title, or None to leave it untouched.
        youtube_url (str | None): New URL, or None to leave it untouched.

    Returns:
        True when at least one database row was updated, False when no row
        matched, None when the statement failed and was rolled back.
    """
    set_clauses = []
    update_values = []
    if youtube_title:
        set_clauses.append(sql.SQL("youtube_title = %s"))
        update_values.append(youtube_title)
    if youtube_url:
        set_clauses.append(sql.SQL("youtube_url = %s"))
        update_values.append(youtube_url)
    update_values.append(track_uri)  # For the WHERE clause

    # A savepoint confines a failure to this row: rolling back to it keeps the
    # earlier, not-yet-committed updates of the current batch intact.
    cursor.execute("SAVEPOINT csv_row_update")
    try:
        # Example: UPDATE tracks SET youtube_title = %s, youtube_url = %s WHERE original_track_uri = %s;
        # Or:      UPDATE tracks SET youtube_title = %s WHERE original_track_uri = %s;
        query = sql.SQL(
            "UPDATE tracks SET {} WHERE original_track_uri = %s;"
        ).format(sql.SQL(", ").join(set_clauses))
        cursor.execute(query, tuple(update_values))
        # Read rowcount now; the RELEASE statement below overwrites it.
        was_updated = cursor.rowcount > 0
        cursor.execute("RELEASE SAVEPOINT csv_row_update")
    except psycopg2.Error as db_err:
        logging.error(
            f"Database error updating row {row_num} (Track URI: {track_uri}): {db_err}"
        )
        cursor.execute("ROLLBACK TO SAVEPOINT csv_row_update")
        return None
    except Exception as e:
        logging.error(
            f"General error processing row {row_num} (Track URI: {track_uri}): {e}"
        )
        cursor.execute("ROLLBACK TO SAVEPOINT csv_row_update")
        return None

    if was_updated:
        logging.info(
            f"Updated track (URI: {track_uri}) with new YouTube data. Fields updated: {len(set_clauses)}."
        )
    return was_updated


def update_youtube_data_from_csv(conn, csv_filepath):
    """
    Reads the CSV file and updates youtube_title and youtube_url in the tracks table.

    Rows are matched on ``tracks.original_track_uri`` using the CSV column
    'Track URI'. Only the YouTube fields that are non-blank in a CSV row are
    written. Rows without a 'Track URI', or without any YouTube value, are
    skipped and counted. Changes are committed every 100 CSV rows and once
    more at the end. A failing row is rolled back on its own (via a
    savepoint) and does not discard other rows of the same batch.

    Errors are logged, not raised: a missing file, a CSV without the required
    header columns, or an unexpected failure ends the run early (the latter
    after rolling back the open transaction).

    Args:
        conn: A psycopg2 database connection object.
        csv_filepath (str): The path to the CSV file.
    """
    logging.info(f"Starting YouTube data update process from CSV: {csv_filepath}")
    processed_rows = 0
    updated_db_rows = 0
    failed_rows = 0
    skipped_rows_no_uri = 0
    skipped_rows_no_youtube_data = 0

    # Expected columns in CSV for this update
    required_csv_cols = ["Track URI", "youtube_title", "youtube_url"]

    try:
        # newline="" is what the csv module asks for; utf-8-sig also accepts
        # plain UTF-8 and keeps a leading BOM out of the first header name.
        with open(
            csv_filepath, mode="r", encoding="utf-8-sig", newline=""
        ) as csvfile:
            csv_reader = csv.DictReader(csvfile)

            header = csv_reader.fieldnames
            if not header:
                logging.error("CSV file is empty or has no header.")
                return

            missing_cols = [col for col in required_csv_cols if col not in header]
            if missing_cols:
                logging.error(
                    f"Missing required columns in CSV header for update: {', '.join(missing_cols)}"
                )
                logging.info(
                    f"Available columns: {header}. Ensure 'Track URI', 'youtube_title', and 'youtube_url' are present."
                )
                return

            with conn.cursor() as cursor:
                for row_num, row in enumerate(csv_reader, 1):
                    processed_rows += 1
                    original_track_uri = row.get("Track URI")

                    # Blank or missing CSV values mean "no value provided for this field".
                    csv_youtube_title = _clean_csv_value(row, "youtube_title")
                    csv_youtube_url = _clean_csv_value(row, "youtube_url")

                    if not original_track_uri:
                        logging.warning(
                            f"Skipping row {row_num} due to missing 'Track URI'."
                        )
                        skipped_rows_no_uri += 1
                    elif csv_youtube_title is None and csv_youtube_url is None:
                        skipped_rows_no_youtube_data += 1
                    else:
                        outcome = _update_track(
                            cursor,
                            row_num,
                            original_track_uri,
                            csv_youtube_title,
                            csv_youtube_url,
                        )
                        if outcome is None:
                            failed_rows += 1
                        elif outcome:
                            updated_db_rows += 1

                    # Runs for every row (skipped ones included) so the batch
                    # boundary is never missed.
                    if row_num % 100 == 0:  # Log progress and commit periodically
                        logging.info(
                            f"Processed {processed_rows} rows from CSV. Updated {updated_db_rows} DB rows so far."
                        )
                        conn.commit()  # Commit changes in batches

                conn.commit()  # Final commit for any remaining operations
                logging.info("YouTube data update process completed.")
                logging.info(f"Summary: Processed {processed_rows} CSV rows.")
                logging.info(f"Database rows updated: {updated_db_rows}.")
                logging.info(f"CSV rows skipped (no Track URI): {skipped_rows_no_uri}.")
                logging.info(
                    f"CSV rows skipped (no new YouTube data): {skipped_rows_no_youtube_data}."
                )
                logging.info(f"CSV rows failed and rolled back: {failed_rows}.")

    except FileNotFoundError:
        logging.error(f"The file '{csv_filepath}' was not found.")
    except psycopg2.Error as e:
        logging.error(f"A database error occurred during update setup: {e}")
        if conn:
            conn.rollback()
    except Exception as e:
        logging.error(f"An unexpected error occurred during the update process: {e}")
        if conn:
            conn.rollback()

In [6]:
logging.info("Starting Python ETL script...")

# Validate essential configurations
# Template placeholders must never reach the database driver.
if DB_NAME == "your_db_name" or DB_USER == "your_db_user":
    logging.error(
        "CRITICAL: Default database credentials are still in use. Please update DB_NAME, DB_USER, and DB_PASSWORD."
    )
    # `exit()` is an interactive-shell helper from the `site` module; raising
    # SystemExit stops the cell without depending on it.
    raise SystemExit(1)

# Settings without a fallback are None when the environment variable is unset.
# Report only their names (never values) so a bad .env is easy to spot.
unset_settings = [
    name
    for name, value in (
        ("DB_USER", DB_USER),
        ("DB_PASSWORD", DB_PASSWORD),
        ("DB_HOST", DB_HOST),
    )
    if not value
]
if unset_settings:
    logging.warning(
        f"Database settings not set in the environment: {', '.join(unset_settings)}. "
        "The connection attempt will proceed without them."
    )

# Warn up front when the input file is absent; the update step logs its own
# error and performs no changes in that case.
if not os.path.exists(CSV_FILE_PATH):
    logging.warning(
        f"CSV_FILE_PATH '{CSV_FILE_PATH}' does not exist. Make sure this file exists or update the path."
    )

connection = None
try:
    connection = get_db_connection()
    if connection:
        update_youtube_data_from_csv(connection, CSV_FILE_PATH)
except Exception as e:
    # exc_info keeps the traceback in the log instead of only the message.
    logging.critical(f"ETL process failed critically: {e}", exc_info=True)
finally:
    # Always release the connection, whether or not the update succeeded.
    if connection:
        connection.close()
        logging.info("Database connection closed.")
logging.info("Python ETL script finished.")

2025-06-04 21:25:31,679 - INFO - Starting Python ETL script...
2025-06-04 21:25:31,680 - INFO - Attempting to connect to database 'database-instance' on 34.140.62.43:5432...
2025-06-04 21:25:32,299 - INFO - Successfully connected to the database.
2025-06-04 21:25:32,300 - INFO - Starting YouTube data update process from CSV: ../data/top_songs_curated.csv
2025-06-04 21:25:32,452 - INFO - Updated track (URI: spotify:track:7iL6o9tox1zgHpKUfh9vuC) with new YouTube data. Fields updated: 2.
2025-06-04 21:25:32,485 - INFO - Updated track (URI: spotify:track:7xp7FkbxGEWcVDvL1KayoD) with new YouTube data. Fields updated: 2.
2025-06-04 21:25:32,519 - INFO - Updated track (URI: spotify:track:0x7U32vZzq7e1qVpA0MBwK) with new YouTube data. Fields updated: 2.
2025-06-04 21:25:32,553 - INFO - Updated track (URI: spotify:track:3G0yz3DZn3lfraledmBCT0) with new YouTube data. Fields updated: 2.
2025-06-04 21:25:32,601 - INFO - Updated track (URI: spotify:track:1N9JazWqQEGOtcDfL0IAaK) with new YouTube dat