In [1]:
print("h")

h


In [2]:
from dotenv import load_dotenv
load_dotenv()
import os

MONGO_URI = os.getenv("MONGO_URI")

if not MONGO_URI:
    raise ValueError("Thiếu MONGO_URI trong file .env!")

DATABASE = os.getenv("MONGO_DATABASE", "spotify_db")



In [6]:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

def connect_mongodb():

    try:
        client = MongoClient(MONGO_URI) 

        client.admin.command('ping')
        print("✅ Kết nối đến MongoDB thành công!")

        db = client[DATABASE] 

        return db
    except ConnectionFailure:
        print("❌ Không thể kết nối đến MongoDB Server")
        return None
    except Exception as e:
        print(f"❌ Có lỗi xảy ra: {e}")
        return None


In [7]:
db = connect_mongodb()


✅ Kết nối đến MongoDB thành công!


In [None]:
# spark_jobs/loading/populate_mongodb.py
import os
import sys
from pathlib import Path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, struct
import traceback

# ================================
# 1. KHỞI TẠO SPARK + MONGO CONNECTOR
# ================================
print("Khởi tạo Spark với MongoDB Connector...")

project_root = Path(__file__).parent.parent.parent
jars_dir = project_root / "jars"

if not jars_dir.exists():
    raise FileNotFoundError(f"Thư mục không tồn tại: {jars_dir}")

jars_list = [str(p) for p in jars_dir.glob("*.jar")]

print(jars_list)
print("Sử dụng các JAR sau:")
# ================================
# KHỞI TẠO SPARK – DÙNG JAR TỪ THƯ MỤC
# ================================
spark = SparkSession.builder \
    .appName("Spotify → MongoDB Load") \
    .config("spark.jars", ",".join(jars_list)) \
    .config("spark.driver.memory", "4g") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.multipart.purge.age", "86400") \
    .config("spark.hadoop.fs.s3a.multipart.purge", "true") \
    .config("spark.hadoop.fs.s3a.connection.timeout", "600000") \
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "600000") \
    .config("spark.hadoop.fs.s3a.attempts.maximum", "100") \
    .config("spark.hadoop.fs.s3a.threads.keepalivetime", "60") \
    .config("spark.hadoop.fs.s3a.max.total.tasks", "100") \
    .config("spark.hadoop.fs.s3a.multipart.size", "104857600") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .getOrCreate()
spark.sparkContext.setLogLevel("WARN")
print("Spark + MongoDB Connector Ready!")

# ================================
# 2. CẤU HÌNH MONGODB (.env hoặc hardcode tạm)
# ================================
from dotenv import load_dotenv
load_dotenv(dotenv_path=".env") 

MONGO_URI = os.getenv("MONGO_URI")

if not MONGO_URI:
    raise ValueError("Thiếu MONGO_URI trong file .env!")

DATABASE = os.getenv("MONGO_DATABASE", "spotify_db")

print(f"Kết nối MongoDB: {MONGO_URI.split('@')[-1].split('/')[0] if '@' in MONGO_URI else MONGO_URI}")
print(f"Database: {DATABASE}")

# ================================
# 3. ĐỌC DỮ LIỆU ĐÃ XỬ LÝ TỪ MINIO
# ================================
input_path = "s3a://spotify-processed-data/spotify_tracks" 

print(f"\nĐang đọc dữ liệu từ: {input_path}")
df = spark.read.parquet(input_path).cache()
print(df.head())
total_rows = df.count()
print(f"Đã đọc thành công {total_rows:,} dòng")
df.printSchema()

# ================================
# 4. GHI VÀO MONGODB – 4 COLLECTION RIÊNG + 1 EMBEDDED (TỐI ƯU NHẤT)
# ================================

def write_mongo(dataframe, collection, mode="overwrite"):
    print(f"→ Ghi {dataframe.count():,} bản ghi vào collection: {collection}")
    (dataframe.write
     .format("mongodb")
     .option("uri", MONGO_URI)
     .option("database", DATABASE)
     .option("collection", collection)
     .mode(mode)
     .save())
    print(f"   Đã ghi xong {collection}")

# 1. Artists
artists = df.select("artist_uri", "artist_name").distinct() \
    .withColumnRenamed("artist_uri", "_id") \
    .withColumnRenamed("artist_name", "name")
write_mongo(artists, "artists")

# 2. Albums
albums = df.select("album_uri", "album_name").distinct() \
    .withColumnRenamed("album_uri", "_id") \
    .withColumnRenamed("album_name", "name")
write_mongo(albums, "albums")

# 3. Tracks
tracks = df.select(
    "track_uri", "track_name", "artist_uri", "album_uri", "duration_ms"
).distinct() \
    .withColumnRenamed("track_uri", "_id") \
    .withColumnRenamed("track_name", "name")
write_mongo(tracks, "tracks")

# 4. Playlists + Tracks (Embedded – TỐI ƯU CHO MONGODB)
print("\nTạo collection playlists với tracks được nhúng (embedded) – cách dùng tốt nhất với MongoDB")
playlists_embedded = df.groupBy("pid", "playlist_name") \
    .agg(collect_list(struct(
        col("track_uri"),
        col("track_name").alias("name"),
        col("artist_uri"),
        col("artist_name"),
        col("duration_ms")
    )).alias("tracks")) \
    .withColumnRenamed("pid", "_id") \
    .withColumnRenamed("playlist_name", "name")

playlists_embedded.printSchema()
write_mongo(playlists_embedded, "playlists", mode="overwrite")

# ================================
# 5. HOÀN TẤT
# ================================
df.unpersist()

print("\n" + "="*80)
print("TẤT CẢ DỮ LIỆU ĐÃ ĐƯỢC ĐỔ VÀO MONGODB THÀNH CÔNG!")
print("="*80)
print("Các collection đã tạo:")
print("   • artists")
print("   • albums")
print("   • tracks")
print("   • playlists  ← (có nhúng toàn bộ tracks → query cực nhanh!)")
print("="*80)

spark.stop()

NameError: name '__file__' is not defined