In [5]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"D:\Capstone_Staging\data")
joined_path = DATA_DIR / "joined_release_data.tsv"

df = pd.read_csv(joined_path, sep="\t", low_memory=False)

print(f"✅ Loaded joined_release_data.tsv: {df.shape[0]:,} rows, {df.shape[1]} columns\n")
print("📋 Columns available:")
print(df.columns.tolist())


✅ Loaded joined_release_data.tsv: 1,000,000 rows, 21 columns

📋 Columns available:
['id', 'gid', 'name', 'artist_credit', 'release_group', 'status', 'packaging', 'language', 'script', 'barcode', 'comment', 'edits_pending', 'quality', 'last_updated', 'release_year', 'release_group_gid', 'artist_credit_gid', 'release_group_name', 'artist_credit_name', 'secondary_types', 'is_soundtrack']


In [9]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"D:\Capstone_Staging\data")
joined_path = DATA_DIR / "joined_release_data.tsv"

df = pd.read_csv(joined_path, sep="\t", low_memory=False)
print(f"✅ Loaded joined_release_data.tsv: {df.shape[0]:,} rows, {df.shape[1]} columns")

print("\n📋 Actual column names:")
for i, c in enumerate(df.columns):
    print(f"{i:02d}: {c}")


✅ Loaded joined_release_data.tsv: 1,000,000 rows, 21 columns

📋 Actual column names:
00: id
01: gid
02: name
03: artist_credit
04: release_group
05: status
06: packaging
07: language
08: script
09: barcode
10: comment
11: edits_pending
12: quality
13: last_updated
14: release_year
15: release_group_gid
16: artist_credit_gid
17: release_group_name
18: artist_credit_name
19: secondary_types
20: is_soundtrack


In [11]:
cols_with_soundtrack = []
for col in df.columns:
    try:
        if df[col].astype(str).str.contains("soundtrack", case=False, na=False).any():
            cols_with_soundtrack.append(col)
    except Exception:
        pass

print("🔎 Columns containing 'soundtrack':", cols_with_soundtrack)


🔎 Columns containing 'soundtrack': ['name', 'comment', 'release_group_name', 'artist_credit_name', 'secondary_types']


In [None]:
soundtrack_col = "release_group_secondary_type_join"  # replace with whatever your scan finds

soundtracks = df[df[soundtrack_col].astype(str).str.contains("soundtrack", case=False, na=False)]
print(f"🎬 Soundtrack rows: {len(soundtracks):,} ({len(soundtracks)/len(df):.2%} of total)")
soundtracks[["id", "name", "artist", soundtrack_col]].head(10)


In [13]:
# Select the column that reliably marks soundtrack releases
soundtrack_col = "secondary_types"

# Filter for soundtrack rows
soundtracks = df[df[soundtrack_col].astype(str).str.contains("soundtrack", case=False, na=False)]
pct = len(soundtracks) / len(df) * 100

print(f"🎬 Soundtrack rows: {len(soundtracks):,} ({pct:.2f}% of total)")

# Quick peek at relevant fields
cols_to_show = [c for c in ["id", "name", "release_group_name", "artist_credit_name", "comment", soundtrack_col] if c in df.columns]
soundtracks[cols_to_show].head(10)


🎬 Soundtrack rows: 20,260 (2.03% of total)


Unnamed: 0,id,name,release_group_name,artist_credit_name,comment,secondary_types
85,120,Storytelling,Storytelling,(unknown artist),,soundtrack
87,156,Terminator 3: Rise of the Machines,,(unknown artist),,soundtrack
102,3079425,The Nevers: Season 1 (Soundtrack from the HBO®...,The Nevers: Season 1 (Soundtrack from the HBO®...,(unknown artist),,soundtrack
104,100,Yellow Submarine,,(unknown artist),mono,soundtrack
145,1542004,コズミック・ファンタジー3冒険少年レイ SPECIAL EDITION,コズミック・ファンタジー3冒険少年レイ SPECIAL EDITION,(unknown artist),,soundtrack
172,2729667,Spellforce 2: Demons of the Past,Spellforce 2: Demons of the Past,(unknown artist),,soundtrack
175,270,Magical Mystery Tour,,(unknown artist),,soundtrack
199,274,Yellow Submarine,,(unknown artist),,soundtrack
277,366,The End Is the Beginning Is the End: The Remixes,The End Is the Beginning Is the End,The Smashing Pumpkins,,soundtrack
481,1541446,甘城ブリリアントパーク オリジナル・サウンド・トラック,甘城ブリリアントパーク オリジナル・サウンド・トラック,(unknown artist),24bit/48kHz,soundtrack


In [15]:
# Check missingness across the key textual columns
for col in ["name", "release_group_name", "artist_credit_name", "comment"]:
    nulls = soundtracks[col].isna().sum()
    empties = (soundtracks[col].astype(str).str.strip() == "").sum()
    print(f"{col:20s}: nulls={nulls:,} | empties={empties:,} | missing={(nulls+empties)/len(soundtracks):.1%}")


name                : nulls=0 | empties=0 | missing=0.0%
release_group_name  : nulls=6,894 | empties=0 | missing=34.0%
artist_credit_name  : nulls=0 | empties=0 | missing=0.0%
comment             : nulls=19,117 | empties=0 | missing=94.4%


In [17]:
unknown_rate = (soundtracks["artist_credit_name"].str.lower() == "(unknown artist)").mean() * 100
print(f"⚠️ (unknown artist) entries: {unknown_rate:.1f}%")

known_artists = soundtracks.loc[
    soundtracks["artist_credit_name"].str.lower() != "(unknown artist)",
    "artist_credit_name"
]
print("\n🎧 Example known artist names:")
print(known_artists.sample(min(10, len(known_artists)), random_state=42).tolist())


⚠️ (unknown artist) entries: 78.3%

🎧 Example known artist names:
['D. Imman', 'The Weeknd with Playboi Carti & Madonna', 'Kid Loco', 'Ichiro Shimakura', 'Pablo Vega', 'Shirley Walker', 'Shankar-Ehsaan-Loy', 'Pablo Malaurie', 'Himesh Reshammiya', 'Jakes Bejoy']


In [23]:
soundtracks.loc[:, "name_len"] = soundtracks["name"].astype(str).str.len()
soundtracks.loc[:, "group_len"] = soundtracks["release_group_name"].astype(str).str.len()

print("📏 Average title lengths:")
print(soundtracks[["name_len", "group_len"]].mean())

print("\n🔎 Sample titles:")
print(soundtracks["name"].sample(10, random_state=99).tolist())


📏 Average title lengths:
name_len     24.045114
group_len    15.230257
dtype: float64

🔎 Sample titles:
['Wir sind die Nacht', 'New Kids Turbo', 'COWBOY BEBOP: Vitaminless', "Eve's Bayou", 'Official Battle Superbreaks: Battle of the Year', 'Loose Change', 'The Beekeeper', 'King’s Ransom', 'Peter Pan: The British Musical (1994 original London Cast)', 'Ergo Proxy Original Soundtrack opus 02']


In [27]:
import re
import unicodedata

def normalize_title(s):
    """
    Improved normalization:
    - Lowercase
    - Keeps Unicode (Japanese, accented Latin)
    - Strips brackets, quotes, redundant punctuation
    - Preserves key soundtrack words
    """
    if pd.isna(s):
        return ""
    s = str(s).lower()

    # Normalize Unicode (preserve accents, kana)
    s = unicodedata.normalize("NFKC", s)

    # Remove bracketed metadata (e.g., [OST], (Deluxe Edition))
    s = re.sub(r'\[[^\]]*\]', '', s)
    s = re.sub(r'\([^)]*\)', '', s)

    # Keep alphanumerics + Unicode letters
    s = re.sub(r"[^0-9\w\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\s]", " ", s)

    # Compress whitespace
    s = re.sub(r"\s+", " ", s).strip()

    return s

soundtracks.loc[:, "name_clean"] = soundtracks["name"].map(normalize_title)
soundtracks.loc[:, "group_clean"] = soundtracks["release_group_name"].map(normalize_title)

print("\n🧩 Example cleaned titles (gentler normalization):")
print(soundtracks[["name", "name_clean", "group_clean"]].head(10))



🧩 Example cleaned titles (gentler normalization):
                                                  name  \
85                                        Storytelling   
87                  Terminator 3: Rise of the Machines   
102  The Nevers: Season 1 (Soundtrack from the HBO®...   
104                                   Yellow Submarine   
145                コズミック・ファンタジー3冒険少年レイ SPECIAL EDITION   
172                   Spellforce 2: Demons of the Past   
175                               Magical Mystery Tour   
199                                   Yellow Submarine   
277   The End Is the Beginning Is the End: The Remixes   
481                        甘城ブリリアントパーク オリジナル・サウンド・トラック   

                                          name_clean  \
85                                      storytelling   
87                 terminator 3 rise of the machines   
102                              the nevers season 1   
104                                 yellow submarine   
145              コズミック・ファンタジー3