In [1]:
import pandas as pd
from pathlib import Path

# --- CONFIG ---
DATA_DIR = Path(r"D:\Capstone_Staging\data\musicbrainz_raw\cleansed")

release_path = DATA_DIR / "release_enriched.tsv"
rg_path = DATA_DIR / "release_group.tsv"
ac_path = DATA_DIR / "artist_credit.tsv"

# --- LOAD SMALL SAMPLES ---
release_df = pd.read_csv(release_path, sep="\t", nrows=1000)
rg_df = pd.read_csv(rg_path, sep="\t", nrows=1000)
ac_df = pd.read_csv(ac_path, sep="\t", nrows=1000)

print("✅ Loaded sample shapes:")
print("   release_enriched:", release_df.shape)
print("   release_group:", rg_df.shape)
print("   artist_credit:", ac_df.shape)

# --- INSPECT column names + types ---
print("\n📋 release_enriched columns:", release_df.columns.tolist())
print("📋 release_group columns:", rg_df.columns.tolist())
print("📋 artist_credit columns:", ac_df.columns.tolist())

# --- SAMPLE DISTINCT KEYS ---
print("\n🔹 Sample release_enriched.artist_credit values:", release_df['artist_credit'].head().tolist())
print("🔹 Sample release_group.artist_credit values:", rg_df.iloc[:,3].head().tolist())  # usually 4th column
print("🔹 Sample artist_credit.id values:", ac_df.iloc[:,0].head().tolist())

# --- INTERSECT check ---
rel_ids = set(map(str, release_df['artist_credit'].dropna().astype(str)))
rg_ids = set(map(str, rg_df.iloc[:,3].dropna().astype(str)))   # release_group.artist_credit
ac_ids = set(map(str, ac_df.iloc[:,0].dropna().astype(str)))   # artist_credit.id

print("\n🔍 Key-space intersections:")
print(f"   release.artist_credit ∩ artist_credit.id → {len(rel_ids & ac_ids)}")
print(f"   release_group.artist_credit ∩ artist_credit.id → {len(rg_ids & ac_ids)}")
print(f"   release.artist_credit ∩ release_group.artist_credit → {len(rel_ids & rg_ids)}")

# --- SAMPLE join demonstration (first few matches) ---
join_preview = (
    rg_df[[rg_df.columns[0], rg_df.columns[1], rg_df.columns[3]]]
    .rename(columns={rg_df.columns[0]:'release_group_id', rg_df.columns[3]:'artist_credit'})
    .merge(ac_df[[ac_df.columns[0], ac_df.columns[1]]],
           left_on='artist_credit', right_on=ac_df.columns[0], how='inner')
    .head(5)
)
print("\n🧩 Example join (release_group → artist_credit.name):")
display(join_preview)


✅ Loaded sample shapes:
   release_enriched: (1000, 15)
   release_group: (1000, 8)
   artist_credit: (1000, 7)

📋 release_enriched columns: ['id', 'gid', 'name', 'artist_credit', 'release_group', 'status', 'packaging', 'language', 'script', 'barcode', 'comment', 'edits_pending', 'quality', 'last_updated', 'release_year']
📋 release_group columns: ['1964563', 'f59da930-70ba-4992-a346-7ed2d8e3cda8', 'Wande', '627364', '1', 'Unnamed: 5', '0', '2018-04-30 23:56:50.245482+00']
📋 artist_credit columns: ['4229350', 'Jean-Paul Fouchécourt, Yvonne Naef, Saito Kinen Orchestra, Seiji Ozawa', '4', '1', '2024-12-20 06:18:14.699053+00', '0', '25966362-45fb-4457-88c7-b0d9b06f28e6']

🔹 Sample release_enriched.artist_credit values: [60, 60, 1, 60, 20211]
🔹 Sample release_group.artist_credit values: [2966520, 11, 26, 44, 1671]
🔹 Sample artist_credit.id values: [3320885, 3532081, 3320887, 3431907, 3435757]

🔍 Key-space intersections:
   release.artist_credit ∩ artist_credit.id → 2
   release_group.artist

Unnamed: 0,release_group_id,f59da930-70ba-4992-a346-7ed2d8e3cda8,artist_credit,4229350,"Jean-Paul Fouchécourt, Yvonne Naef, Saito Kinen Orchestra, Seiji Ozawa"
0,1806611,027b8167-b14d-4c99-8b14-79f0fe315fa9,2001233,2001233,Sailing Conductors
1,3311801,68379e94-60db-475a-9d06-23e5ce68cd7c,3615899,3615899,Orchestra Simfonică A Radioteleviziunii
2,1908279,0a4b5d23-dd1a-40ca-95b2-b1d58683e1a4,2139392,2139392,URUK
3,3311804,361fe476-67a3-41b3-a2ad-9d67610f53a9,3615901,3615901,Inmarcesible
4,3705316,b68c0eef-7237-4c24-b120-89757db5681e,3971391,3971391,LBLVNC & Wønder


In [7]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"D:\Capstone_Staging\data\musicbrainz_raw\cleansed")

# --- LOAD TABLES ---
release = pd.read_csv(DATA_DIR / "release_enriched.tsv", sep="\t", low_memory=False, nrows=50000)
rg = pd.read_csv(DATA_DIR / "release_group.tsv", sep="\t", header=None, low_memory=False, nrows=50000)
ac = pd.read_csv(DATA_DIR / "artist_credit.tsv", sep="\t", header=None, low_memory=False, nrows=50000)

print("✅ Loaded samples:")
print(f"release_enriched: {release.shape}, release_group: {rg.shape}, artist_credit: {ac.shape}")

# --- Build lookup maps (positional) ---
# release_group: id = col0, gid = col1
rg_map = dict(zip(rg[0].astype(str), rg[1]))
# artist_credit: id = col0, gid = col6
ac_map = dict(zip(ac[0].astype(str), ac[6]))

# --- Apply mapping ---
release["release_group_gid"] = release["release_group"].astype(str).map(rg_map)
release["artist_credit_gid"] = release["artist_credit"].astype(str).map(ac_map)

# --- Coverage ---
coverage_rg = release["release_group_gid"].notna().mean() * 100
coverage_ac = release["artist_credit_gid"].notna().mean() * 100
print(f"\n📊 Mapping coverage:")
print(f"release_group_gid mapped: {coverage_rg:.1f}%")
print(f"artist_credit_gid mapped: {coverage_ac:.1f}%")

# --- Inspect sample records ---
print("\n🔎 Example of enriched rows:")
display(
    release.loc[
        release["release_group_gid"].notna() | release["artist_credit_gid"].notna(),
        ["id", "name", "artist_credit", "artist_credit_gid", "release_group", "release_group_gid"]
    ].head(10)
)

# --- Verify GUID overlaps with source files ---
valid_rg_gids = set(rg[1].astype(str))
mapped_rg_gids = set(release["release_group_gid"].dropna().astype(str))
print(f"\n🧩 release_group_gid overlap with release_group.gid: {len(valid_rg_gids & mapped_rg_gids)} / {len(mapped_rg_gids)}")

valid_ac_gids = set(ac[6].astype(str))
mapped_ac_gids = set(release["artist_credit_gid"].dropna().astype(str))
print(f"🧩 artist_credit_gid overlap with artist_credit.gid: {len(valid_ac_gids & mapped_ac_gids)} / {len(mapped_ac_gids)}")


✅ Loaded samples:
release_enriched: (50000, 15), release_group: (50000, 8), artist_credit: (50000, 7)

📊 Mapping coverage:
release_group_gid mapped: 14.5%
artist_credit_gid mapped: 1.6%

🔎 Example of enriched rows:


Unnamed: 0,id,name,artist_credit,artist_credit_gid,release_group,release_group_gid
2,3257193,Kriminaltango et al,1,,2823308,fd8f332c-a9cb-49d9-9b2f-a9508784d747
3,12,Silent All These Years,60,,104189,576a75de-b6c9-3ade-92ab-b7d93e46be21
4,26,Demons,20211,,94299,df295d32-f18f-333d-a94c-e168c6323a9a
8,1088433,Talking in the Streets,876468,,1106241,16662caf-7a15-4a31-a31a-c0a0e19445b6
9,71,Year 3000 (disc 2),59115,,101019,3e113bbb-d49a-35f4-bbef-14b212558f82
10,2415264,Café Tacuba,11630,,123217,d002fc69-7f3f-3c2a-9e5f-e76d607859d4
11,1966951,Songs for Marianne,2001233,1684be04-66f0-3253-ade3-f15cb259b468,1806611,027b8167-b14d-4c99-8b14-79f0fe315fa9
13,32,Juxtapozed With U,20211,,94306,a007e1da-0843-3c1e-86c0-3dde5e07eebd
18,4224833,Die Großen Deutschen Tanzorchester,3852209,,3574375,ec3594d7-4928-49e6-9c2c-930fa0d1dc74
19,57,Guerrilla,20211,,94127,5b7eddcc-51e7-326d-86f6-d702ad8975ff



🧩 release_group_gid overlap with release_group.gid: 6822 / 6822
🧩 artist_credit_gid overlap with artist_credit.gid: 538 / 538
