## Process All LightGBM Prediction Files

Add original_song_id, original_msno, artist_name, and artist_gender to all LightGBM prediction files.


In [6]:
# Process all LightGBM prediction files
import pandas as pd
import numpy as np
import os
import glob

print("=" * 60, "PROCESSING ALL LIGHTGBM PREDICTION FILES", "=" * 60, sep="\n")

# Collect every CSV directly under temp_lgb/, skip any whose file name contains
# "summary", and keep the remaining paths in sorted order.
lgb_files = sorted(
    csv_path
    for csv_path in glob.glob('temp_lgb/*.csv')
    if 'summary' not in os.path.basename(csv_path)
)

print(f"\nFound {len(lgb_files)} LightGBM prediction files:")
for csv_path in lgb_files:
    print(f"  - {os.path.basename(csv_path)}")

# Destination folder for the enriched prediction files; created if missing.
output_dir = 'lgb_mapped'
os.makedirs(output_dir, exist_ok=True)
print(f"\nOutput directory: {output_dir}")


PROCESSING ALL LIGHTGBM PREDICTION FILES

Found 10 LightGBM prediction files:
  - lgb_0.85401_seed85.csv
  - lgb_0.85414_seed121.csv
  - lgb_0.85414_seed25.csv
  - lgb_0.85420_seed61.csv
  - lgb_0.85420_seed73.csv
  - lgb_0.85421_seed37.csv
  - lgb_0.85423_seed97.csv
  - lgb_0.85443_seed49.csv
  - lgb_0.85476_seed109.csv
  - lgb_0.85502_seed13.csv

Output directory: lgb_mapped


In [7]:
# Rebuild the integer-code -> original-ID lookup tables for songs and members.
# Premise carried over from the training pipeline: the label encoder was fit on
# the train + test values from source_data, and LabelEncoder assigns codes in
# sorted order of the unique values, so code k is the k-th smallest unique ID.
# NOTE(review): the encoder itself is not visible in this notebook - confirm.

def _code_lookup(sorted_ids, code_col, id_col):
    """Return a two-column frame pairing positions 0..n-1 with `sorted_ids`."""
    return pd.DataFrame({
        code_col: range(len(sorted_ids)),
        id_col: sorted_ids,
    })

print("\n" + "="*60)
print("RECONSTRUCTING LABEL ENCODER MAPPINGS")
print("="*60)

print("\nLoading source data to reconstruct mappings...")
train_source = pd.read_csv('input/training/source_data/train.csv')
test_source = pd.read_csv('input/training/source_data/test.csv')
source_frames = [train_source, test_source]

# Songs: distinct IDs across both splits, then sorted ascending.
all_song_ids = pd.concat([frame['song_id'] for frame in source_frames]).unique()
all_song_ids_sorted = list(all_song_ids)
all_song_ids_sorted.sort()

# encoded_song_id -> original_song_id
song_id_mapping = _code_lookup(all_song_ids_sorted, 'encoded_song_id', 'original_song_id')

print(f"Created song_id mapping: {len(song_id_mapping):,} unique songs")

# Members: same procedure on the msno column.
all_msnos = pd.concat([frame['msno'] for frame in source_frames]).unique()
all_msnos_sorted = list(all_msnos)
all_msnos_sorted.sort()

# encoded_msno -> original_msno
msno_mapping = _code_lookup(all_msnos_sorted, 'encoded_msno', 'original_msno')

print(f"Created msno mapping: {len(msno_mapping):,} unique members")
print("="*60)



RECONSTRUCTING LABEL ENCODER MAPPINGS

Loading source data to reconstruct mappings...
Created song_id mapping: 419,839 unique songs
Created msno mapping: 34,403 unique members


In [8]:
# Load songs and artists data for mapping
print("\nLoading songs and artists data...")
songs_original = pd.read_csv('input/training/source_data/songs.csv')
artists_original = pd.read_csv('input/training/source_data/artists.csv')

print(f"Loaded songs.csv: {len(songs_original):,} rows")
print(f"Loaded artists.csv: {len(artists_original):,} rows")

# Build the song-level lookup (song_id -> artist_name, artist_gender): attach each
# artist's gender to every song carrying that artist_name. The left join keeps
# songs whose artist_name has no match in artists.csv; their artist_gender is NaN.
songs_with_artists = (
    songs_original[['song_id', 'artist_name']]
    .merge(
        artists_original[['artist_name', 'gender']],
        on='artist_name',
        how='left'
    )
    .rename(columns={'gender': 'artist_gender'})
)

# A left join returns exactly one row per song only if no matched artist_name is
# duplicated in artists.csv. Duplicates would silently multiply song rows here
# and, later, every prediction row joined against this table - fail loudly instead.
assert len(songs_with_artists) == len(songs_original), (
    f"artist merge changed the row count "
    f"({len(songs_original):,} -> {len(songs_with_artists):,}); "
    "artists.csv contains duplicate artist_name values"
)

# Note: the count printed below is the number of songs with a non-null artist_gender.
print(f"Songs with artist info: {songs_with_artists['artist_gender'].notna().sum():,} / {len(songs_with_artists):,}")
print(f"Sample:\n{songs_with_artists.head()}")



Loading songs and artists data...
Loaded songs.csv: 2,296,320 rows
Loaded artists.csv: 40,582 rows
Songs with artist info: 812,557 / 2,296,320
Sample:
                                        song_id       artist_name  \
0  CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=  張信哲 (Jeff Chang)   
1  o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=         BLACKPINK   
2  DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=      SUPER JUNIOR   
3  dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=             S.H.E   
4  W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=              貴族精選   

  artist_gender  
0          Male  
1           NaN  
2           NaN  
3           NaN  
4          Male  


In [None]:
# Process each LightGBM file: decode the label-encoded song_id / msno columns back
# to the original IDs, attach artist_name / artist_gender, and save the enriched
# predictions under the same file name in output_dir.
print("\n" + "="*60)
print("PROCESSING FILES")
print("="*60)

# Loop-invariant: select the song-level lookup columns once instead of per file.
song_artist_lookup = songs_with_artists[['song_id', 'artist_name', 'artist_gender']]

processed_count = 0
for file_path in lgb_files:
    filename = os.path.basename(file_path)
    print(f"\nProcessing: {filename}")

    # Load the prediction file
    df_lgb = pd.read_csv(file_path)
    print(f"  Loaded: {len(df_lgb):,} rows")

    # Map encoded song_id to original_song_id
    df_lgb_mapped = df_lgb.merge(
        song_id_mapping,
        left_on='song_id',
        right_on='encoded_song_id',
        how='left'
    )

    # Map encoded msno to original_msno
    df_lgb_mapped = df_lgb_mapped.merge(
        msno_mapping,
        left_on='msno',
        right_on='encoded_msno',
        how='left'
    )

    # Add artist_name and gender using original_song_id. Both sides carry a
    # 'song_id' column, so the right-hand one is suffixed to 'song_id_song'.
    df_lgb_mapped = df_lgb_mapped.merge(
        song_artist_lookup,
        left_on='original_song_id',
        right_on='song_id',
        how='left',
        suffixes=('', '_song')
    )

    # Each join above is a lookup and must leave exactly one row per prediction.
    # A duplicated key in any lookup table would silently multiply prediction
    # rows, so stop before a corrupted file is written.
    assert len(df_lgb_mapped) == len(df_lgb), (
        f"{filename}: merges changed the row count "
        f"({len(df_lgb):,} -> {len(df_lgb_mapped):,}); a lookup table has duplicate keys"
    )

    # Drop the intermediate columns we don't need (absent ones are skipped)
    columns_to_drop = ['encoded_song_id', 'encoded_msno', 'song_id_song',
                       'original_index', 'song_id', 'msno']
    df_lgb_mapped = df_lgb_mapped.drop(columns=columns_to_drop, errors='ignore')

    # Reorder columns: put prediction and ground_truth_target last
    other_cols = [col for col in df_lgb_mapped.columns
                  if col not in ['prediction', 'ground_truth_target']]
    column_order = other_cols + ['prediction', 'ground_truth_target']
    df_lgb_mapped = df_lgb_mapped[column_order]

    # Save to output directory
    output_path = os.path.join(output_dir, filename)
    df_lgb_mapped.to_csv(output_path, index=False)

    # Print summary: how many rows received a non-null value for each added column
    mapped_songs = df_lgb_mapped['original_song_id'].notna().sum()
    mapped_members = df_lgb_mapped['original_msno'].notna().sum()
    mapped_artists = df_lgb_mapped['artist_name'].notna().sum()
    mapped_gender = df_lgb_mapped['artist_gender'].notna().sum()

    print(f"  ✓ Saved to: {output_path}")
    print(f"    - Original song_id mapped: {mapped_songs:,} / {len(df_lgb_mapped):,} ({mapped_songs/len(df_lgb_mapped)*100:.1f}%)")
    print(f"    - Original msno mapped: {mapped_members:,} / {len(df_lgb_mapped):,} ({mapped_members/len(df_lgb_mapped)*100:.1f}%)")
    print(f"    - Artist name mapped: {mapped_artists:,} / {len(df_lgb_mapped):,} ({mapped_artists/len(df_lgb_mapped)*100:.1f}%)")
    print(f"    - Artist gender mapped: {mapped_gender:,} / {len(df_lgb_mapped):,} ({mapped_gender/len(df_lgb_mapped)*100:.1f}%)")

    processed_count += 1

print("\n" + "="*60)
print(f"COMPLETE: Processed {processed_count} files")
print(f"Output directory: {output_dir}/")
print("="*60)



PROCESSING FILES

Processing: lgb_0.85401_seed85.csv
  Loaded: 1,411,395 rows
  ✓ Saved to: lgb_mapped/lgb_0.85401_seed85.csv
    - Original song_id mapped: 1,411,395 / 1,411,395 (100.0%)
    - Original msno mapped: 1,411,395 / 1,411,395 (100.0%)
    - Artist name mapped: 1,411,372 / 1,411,395 (100.0%)
    - Artist gender mapped: 954,163 / 1,411,395 (67.6%)

Processing: lgb_0.85414_seed121.csv
  Loaded: 1,411,395 rows
  ✓ Saved to: lgb_mapped/lgb_0.85414_seed121.csv
    - Original song_id mapped: 1,411,395 / 1,411,395 (100.0%)
    - Original msno mapped: 1,411,395 / 1,411,395 (100.0%)
    - Artist name mapped: 1,411,376 / 1,411,395 (100.0%)
    - Artist gender mapped: 954,331 / 1,411,395 (67.6%)

Processing: lgb_0.85414_seed25.csv
  Loaded: 1,411,395 rows
  ✓ Saved to: lgb_mapped/lgb_0.85414_seed25.csv
    - Original song_id mapped: 1,411,395 / 1,411,395 (100.0%)
    - Original msno mapped: 1,411,395 / 1,411,395 (100.0%)
    - Artist name mapped: 1,411,375 / 1,411,395 (100.0%)
    - A

In [10]:
# Verify one of the processed files: reload the first enriched output and show
# its column list and leading rows.
print("\nVerifying processed file...")
if not lgb_files:
    # Nothing was discovered, so nothing was written; avoid a bare IndexError
    # from lgb_files[0] and say what happened instead.
    print("No LightGBM prediction files were found; nothing to verify.")
else:
    sample_file = os.path.join(output_dir, os.path.basename(lgb_files[0]))
    df_sample = pd.read_csv(sample_file)
    print(f"\nSample file: {os.path.basename(sample_file)}")
    print(f"Columns: {df_sample.columns.tolist()}")
    print("\nFirst few rows:")
    print(df_sample[['id', 'original_song_id', 'original_msno', 'artist_name', 'artist_gender', 'prediction', 'ground_truth_target']].head(10))



Verifying processed file...

Sample file: lgb_0.85401_seed85.csv
Columns: ['id', 'prediction', 'ground_truth_target', 'original_song_id', 'original_msno', 'artist_name', 'artist_gender']

First few rows:
   id                              original_song_id  \
0   0  qD/QQKjFB3am2ZYMF3fNgOIPrIuUezHVQobMJk5VduY=   
1   1  beJIbbUTXpeZGLp4A8xLGRCAH6vf8EG3qGGSkJ2xtpM=   
2   2  auorhKXu9I04OVrLaDysoWEY/XSjmEcdgBRWHrLbbX8=   
3   3  3ZVFKlYjOLiyLGW2iuhN/Ca+E11w/OB7z1YFkUdnArA=   
4   4  D1tFsBLd9VWbonfb6Vek0BI2EJB6udLOuV/x+ptYpOI=   
5   5  HjD0DJh8lSSclFWoxwslnYwZkNeqMsxE+A2RqXpM8OQ=   
6   6  M+GLP5Bp1uC2RPh3lICfwnaf4l1qTboG4piMZsSB46M=   
7   7  BYBO2FYmzfeyuGeIV57iifEhQ4l3FjmYj2kzkIwubRE=   
8   8  MBCMb17fx7JWgOxzuAmB8CjSHp1xhBZJD56eqr2thiw=   
9   9  dDYizzhBq3x9uVKTsiM2FTzDMAmiR+8AsmVrWG7aOOM=   

                                  original_msno        artist_name  \
0  Dp/J1U1bEvNzB6OEKLtrJZSfJNedtuQh7/ZgjmZdjgM=   戴愛玲 (Ailing Tai)   
1  4m5Sn66p8UPrX7DJkBD5xfdHePZEI9Nui6iDtSr5kno=  

## Process All Neural Network Prediction Files

Add original_song_id, original_msno, artist_name, and artist_gender to all NN prediction files.


In [None]:
# Discover the Neural Network prediction files and prepare their output folder.
print("\n" + "="*60, "PROCESSING ALL NEURAL NETWORK PREDICTION FILES", "="*60, sep="\n")

# Collect every CSV directly under temp_nn/, skip any whose file name contains
# "summary", and keep the remaining paths in sorted order.
nn_files = sorted(
    csv_path
    for csv_path in glob.glob('temp_nn/*.csv')
    if 'summary' not in os.path.basename(csv_path)
)

print(f"\nFound {len(nn_files)} Neural Network prediction files:")
for csv_path in nn_files:
    print(f"  - {os.path.basename(csv_path)}")

# Destination folder for the enriched NN files; created if missing.
output_dir_nn = 'nn_mapped'
os.makedirs(output_dir_nn, exist_ok=True)
print(f"\nOutput directory: {output_dir_nn}")


In [None]:
# Process each Neural Network file: decode the label-encoded song_id / msno columns
# back to the original IDs, attach artist_name / artist_gender, and save the
# enriched predictions under the same file name in output_dir_nn.
print("\n" + "="*60)
print("PROCESSING NN FILES")
print("="*60)

# Loop-invariant: select the song-level lookup columns once instead of per file.
song_artist_lookup = songs_with_artists[['song_id', 'artist_name', 'artist_gender']]

processed_count_nn = 0
for file_path in nn_files:
    filename = os.path.basename(file_path)
    print(f"\nProcessing: {filename}")

    # Load the prediction file
    df_nn = pd.read_csv(file_path)
    print(f"  Loaded: {len(df_nn):,} rows")

    # Map encoded song_id to original_song_id
    df_nn_mapped = df_nn.merge(
        song_id_mapping,
        left_on='song_id',
        right_on='encoded_song_id',
        how='left'
    )

    # Map encoded msno to original_msno
    df_nn_mapped = df_nn_mapped.merge(
        msno_mapping,
        left_on='msno',
        right_on='encoded_msno',
        how='left'
    )

    # Add artist_name and gender using original_song_id. Both sides carry a
    # 'song_id' column, so the right-hand one is suffixed to 'song_id_song'.
    df_nn_mapped = df_nn_mapped.merge(
        song_artist_lookup,
        left_on='original_song_id',
        right_on='song_id',
        how='left',
        suffixes=('', '_song')
    )

    # Each join above is a lookup and must leave exactly one row per prediction.
    # A duplicated key in any lookup table would silently multiply prediction
    # rows, so stop before a corrupted file is written.
    assert len(df_nn_mapped) == len(df_nn), (
        f"{filename}: merges changed the row count "
        f"({len(df_nn):,} -> {len(df_nn_mapped):,}); a lookup table has duplicate keys"
    )

    # Drop the intermediate columns we don't need (absent ones are skipped)
    columns_to_drop = ['encoded_song_id', 'encoded_msno', 'song_id_song',
                       'original_index', 'song_id', 'msno']
    df_nn_mapped = df_nn_mapped.drop(columns=columns_to_drop, errors='ignore')

    # Reorder columns: put prediction and ground_truth_target last
    other_cols = [col for col in df_nn_mapped.columns
                  if col not in ['prediction', 'ground_truth_target']]
    column_order = other_cols + ['prediction', 'ground_truth_target']
    df_nn_mapped = df_nn_mapped[column_order]

    # Save to output directory
    output_path = os.path.join(output_dir_nn, filename)
    df_nn_mapped.to_csv(output_path, index=False)

    # Print summary: how many rows received a non-null value for each added column
    mapped_songs = df_nn_mapped['original_song_id'].notna().sum()
    mapped_members = df_nn_mapped['original_msno'].notna().sum()
    mapped_artists = df_nn_mapped['artist_name'].notna().sum()
    mapped_gender = df_nn_mapped['artist_gender'].notna().sum()

    print(f"  ✓ Saved to: {output_path}")
    print(f"    - Original song_id mapped: {mapped_songs:,} / {len(df_nn_mapped):,} ({mapped_songs/len(df_nn_mapped)*100:.1f}%)")
    print(f"    - Original msno mapped: {mapped_members:,} / {len(df_nn_mapped):,} ({mapped_members/len(df_nn_mapped)*100:.1f}%)")
    print(f"    - Artist name mapped: {mapped_artists:,} / {len(df_nn_mapped):,} ({mapped_artists/len(df_nn_mapped)*100:.1f}%)")
    print(f"    - Artist gender mapped: {mapped_gender:,} / {len(df_nn_mapped):,} ({mapped_gender/len(df_nn_mapped)*100:.1f}%)")

    processed_count_nn += 1

print("\n" + "="*60)
print(f"COMPLETE: Processed {processed_count_nn} NN files")
print(f"Output directory: {output_dir_nn}/")
print("="*60)


In [None]:
# Verify one of the processed NN files: reload the first enriched output and show
# its column list and leading rows.
print("\nVerifying processed NN file...")
if not nn_files:
    # Nothing was discovered, so nothing was written; avoid a bare IndexError
    # from nn_files[0] and say what happened instead.
    print("No Neural Network prediction files were found; nothing to verify.")
else:
    sample_file_nn = os.path.join(output_dir_nn, os.path.basename(nn_files[0]))
    df_sample_nn = pd.read_csv(sample_file_nn)
    print(f"\nSample file: {os.path.basename(sample_file_nn)}")
    print(f"Columns: {df_sample_nn.columns.tolist()}")
    print("\nFirst few rows:")
    print(df_sample_nn.head(10))
