## billboard 200 cleaning

In [18]:
import pandas as pd
import os
from termcolor import colored
from IPython.display import display, HTML

# define path
data_dir = '../data/ranking_5'
# Exclude list
exclude_files = ['hot100.csv', 'charts.csv']

# 2. Ëé∑ÂèñÁõÆÂΩï‰∏ãÊâÄÊúâÁ¨¶ÂêàÊù°‰ª∂ÁöÑ CSV Êñá‰ª∂
# ËøáÊª§ÈÄªËæëÔºöÂêéÁºÄÊòØ .csv ‰∏î ‰∏çÂú®ÊéíÈô§ÂêçÂçï‰∏≠
other_csv_files = [
    f for f in os.listdir(data_dir)
    if f.endswith('.csv') and f not in exclude_files
]

# Dictionary to store dataframes: { 'filename': dataframe }
dfs = {}

print(colored(f"Starting batch import from {data_dir}...", 'blue'))

for file_name in other_csv_files:
    file_path = os.path.join(data_dir, file_name)
    try:
        # Âä®ÊÄÅÂàõÂª∫ÂèòÈáèÂêçÊàñÂ≠òÂÖ•Â≠óÂÖ∏
        # ‰ΩøÁî®Êñá‰ª∂ÂêçÔºàÂéªÊéâ.csvÔºâ‰Ωú‰∏∫ key
        df_name = file_name.replace('.csv', '')
        dfs[df_name] = pd.read_csv(file_path, dtype={4: str})

        print(colored(f"Successfully imported: {file_name} | Shape: {dfs[df_name].shape}", 'green'))
    except Exception as e:
        print(colored(f"Failed to import {file_name}: {e}", 'red'))

# 4. È™åËØÅÂØºÂÖ•ÁªìÊûú
print("-" * 30)
print(f"Total files imported: {len(dfs)}")
print(f"Available keys: {list(dfs.keys())}")

# View shape statistics for all imported files
print(colored("\n" + "=" * 40, 'blue'))
print(colored("Summary of Imported Datasets:", 'blue', attrs=['bold']))
print(colored("=" * 40, 'blue'))

# Iterate through each dataframe in the dictionary
for name, df in dfs.items():
    # ‰ΩøÁî® f-string Ê†ºÂºèÂåñËæìÂá∫ÔºåËÆ©ÁªìÊûúÂØπÈΩê
    print(f"Dataset: {name:<20} | Rows: {df.shape[0]:>8,} | Columns: {df.shape[1]:>2}")

print(colored("=" * 40, 'blue'))

[34mStarting batch import from ../data/ranking_5...[0m
[32mSuccessfully imported: digital_songs.csv | Shape: (52175, 8)[0m
[32mSuccessfully imported: billboard200.csv | Shape: (639746, 8)[0m
[32mSuccessfully imported: streaming_songs.csv | Shape: (33300, 8)[0m
[32mSuccessfully imported: radio.csv | Shape: (91300, 8)[0m
------------------------------
Total files imported: 4
Available keys: ['digital_songs', 'billboard200', 'streaming_songs', 'radio']
[34m
[1m[34mSummary of Imported Datasets:[0m
Dataset: digital_songs        | Rows:   52,175 | Columns:  8
Dataset: billboard200         | Rows:  639,746 | Columns:  8
Dataset: streaming_songs      | Rows:   33,300 | Columns:  8
Dataset: radio                | Rows:   91,300 | Columns:  8


In [19]:
new_columns = ['date', 'song', 'artist', 'rank', 'last_week', 'peak_rank', 'weeks_on_board']

for name, df in dfs.items():
    print(f"Cleaning: {name}")

    # 1. Áªü‰∏ÄÂàóÊï∞ (Ensuring consistent column count)
    if 'Image URL' in df.columns:
        df.drop(columns=['Image URL'], inplace=True)

    # 2. Áªü‰∏ÄÂàóÂêç (Standardizing column names)
    if len(df.columns) == len(new_columns):
        df.columns = new_columns
    else:
        print(colored(f"Warning: {name} structure mismatch!", 'red'))
        continue

        # 3. ËΩ¨Êç¢Êó•ÊúüÊ†ºÂºè (IMPORTANT: Date format conversion)
    # errors='coerce' ‰ºöÂ∞ÜÊó†Ê≥ïËß£ÊûêÁöÑÊó•ÊúüÂèò‰∏∫ NaT
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # 4. ÊñáÊú¨Ê∏ÖÊ¥ó (Text cleaning)
    df['song'] = df['song'].astype(str).str.strip().str.lower()
    df['artist'] = df['artist'].astype(str).str.strip().str.lower()

    # 5. Êï∞ÂÄºÂàóËΩ¨Êç¢ (Numeric conversion)
    cols_to_fix = ['rank', 'last_week', 'peak_rank', 'weeks_on_board']
    for col in cols_to_fix:
        df[col] = pd.to_numeric(df[col], errors='coerce')

print(colored("Success! Date and other columns are all processed.", 'green'))

Cleaning: digital_songs
Cleaning: billboard200
Cleaning: streaming_songs
Cleaning: radio
[32mSuccess! Date and other columns are all processed.[0m


In [20]:
# ÈÅçÂéÜÂ≠óÂÖ∏‰∏≠ÁöÑÊØè‰∏Ä‰∏™ DataFrame
for name, df in dfs.items():
    # ÊâìÂç∞ÂΩ©Ëâ≤Ê†áÈ¢ò‰ª•Âå∫ÂàÜ‰∏çÂêåÁöÑË°®
    print("\n" + "=" * 50)
    print(colored(f" DATASET: {name.upper()} ", 'white', 'on_blue', attrs=['bold']))
    print("=" * 50)

    # 1. ËæìÂá∫ Info (Áî±‰∫é info ÈªòËÆ§Áõ¥Êé• printÔºåÊàë‰ª¨ÊâãÂä®Âä†ËØ¥Êòé)
    print(colored("\n[ 1. Basic Information ]", 'blue', attrs=['bold']))
    df.info()

    # 2. ËæìÂá∫ Describe (‰ΩøÁî® display ËÆ© DataFrame Âú® Notebook Èáå‰ª•Ë°®Ê†ºÂΩ¢ÂºèÁæéÂåñÂëàÁé∞)
    print(colored("\n[ 2. Statistical Summary ]", 'blue', attrs=['bold']))
    # T ÊòØËΩ¨ÁΩÆÁöÑÊÑèÊÄùÔºåÂ¶ÇÊûúÂàóÂæàÂ§öÔºåËΩ¨ÁΩÆÂêéÁúãÁªüËÆ°ÊåáÊ†á‰ºöÊõ¥ËàíÊúç
    display(df.describe().T)

    print("\n")


[1m[44m[97m DATASET: DIGITAL_SONGS [0m
[1m[34m
[ 1. Basic Information ][0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52175 entries, 0 to 52174
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            52175 non-null  datetime64[ns]
 1   song            52175 non-null  object        
 2   artist          52175 non-null  object        
 3   rank            52175 non-null  int64         
 4   last_week       52171 non-null  float64       
 5   peak_rank       52175 non-null  int64         
 6   weeks_on_board  42682 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 2.8+ MB
[1m[34m
[ 2. Statistical Summary ][0m


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,52175.0,2014-11-01 23:52:24.609487360,2004-10-27 00:00:00,2009-10-21 00:00:00,2014-10-22 00:00:00,2019-10-23 00:00:00,2025-10-22 00:00:00,
rank,52175.0,24.871107,1.0,12.0,24.0,37.0,50.0,14.417298
last_week,52171.0,23.253877,1.0,10.0,22.0,35.0,75.0,14.964232
peak_rank,52175.0,10.426909,1.0,1.0,5.0,16.0,114.0,11.997793
weeks_on_board,42682.0,14.325711,1.0,6.0,11.0,19.0,113.0,11.861434





[1m[44m[97m DATASET: BILLBOARD200 [0m
[1m[34m
[ 1. Basic Information ][0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639746 entries, 0 to 639745
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            639746 non-null  datetime64[ns]
 1   song            639746 non-null  object        
 2   artist          639746 non-null  object        
 3   rank            639746 non-null  int64         
 4   last_week       639738 non-null  float64       
 5   peak_rank       639746 non-null  int64         
 6   weeks_on_board  582734 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 34.2+ MB
[1m[34m
[ 2. Statistical Summary ][0m


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,639746.0,1995-02-22 07:34:52.036527104,1963-08-14 00:00:00,1979-10-31 00:00:00,1995-03-01 00:00:00,2010-06-30 00:00:00,2025-10-22 00:00:00,
rank,639746.0,99.355708,1.0,50.0,99.0,148.0,200.0,57.381735
last_week,639738.0,92.267305,1.0,43.0,91.0,139.0,200.0,56.347736
peak_rank,639746.0,35.430129,1.0,2.0,14.0,52.0,991.0,47.399443
weeks_on_board,582734.0,43.510871,1.0,7.0,17.0,42.0,988.0,82.25624





[1m[44m[97m DATASET: STREAMING_SONGS [0m
[1m[34m
[ 1. Basic Information ][0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33300 entries, 0 to 33299
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            33300 non-null  datetime64[ns]
 1   song            33300 non-null  object        
 2   artist          33300 non-null  object        
 3   rank            33300 non-null  int64         
 4   last_week       33300 non-null  int64         
 5   peak_rank       33300 non-null  int64         
 6   weeks_on_board  27768 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 1.8+ MB
[1m[34m
[ 2. Statistical Summary ][0m


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,33300.0,2019-06-08 11:59:59.999999744,2013-01-23 00:00:00,2016-03-30 00:00:00,2019-06-08 12:00:00,2022-08-17 00:00:00,2025-10-22 00:00:00,
rank,33300.0,25.5,1.0,13.0,25.5,38.0,50.0,14.431086
last_week,33300.0,23.182553,1.0,11.0,22.0,35.0,50.0,14.033142
peak_rank,33300.0,10.16006,1.0,1.0,5.0,15.0,148.0,11.95033
weeks_on_board,27768.0,17.505186,2.0,6.0,12.0,23.0,153.0,17.019799





[1m[44m[97m DATASET: RADIO [0m
[1m[34m
[ 1. Basic Information ][0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91300 entries, 0 to 91299
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            91300 non-null  datetime64[ns]
 1   song            91300 non-null  object        
 2   artist          91300 non-null  object        
 3   rank            91300 non-null  int64         
 4   last_week       91300 non-null  int64         
 5   peak_rank       91300 non-null  int64         
 6   weeks_on_board  88725 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 4.9+ MB
[1m[34m
[ 2. Statistical Summary ][0m


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,91300.0,2008-04-26 12:00:00,1990-10-31 00:00:00,1999-07-28 00:00:00,2008-04-26 12:00:00,2017-01-25 00:00:00,2025-10-22 00:00:00,
rank,91300.0,25.5,1.0,13.0,25.5,38.0,50.0,14.430949
last_week,91300.0,26.178653,1.0,13.0,26.0,39.0,89.0,15.42275
peak_rank,91300.0,18.805531,1.0,5.0,16.0,30.0,61.0,14.537344
weeks_on_board,88725.0,13.42634,2.0,7.0,11.0,17.0,95.0,9.536824






In [21]:
# Áªü‰∏ÄÁªüËÆ°Âπ∂ÂàóÂá∫ÊâÄÊúâÊñá‰ª∂ÁöÑÁº∫Â§±ÂÄº (Consolidated Missing Value Audit)
print(colored("=" * 50, 'blue', attrs=['bold']))
print(colored("MISSING VALUES SUMMARY BY DATASET", 'blue', attrs=['bold']))
print(colored("=" * 50, 'blue'))

for name, df in dfs.items():
    # ËÆ°ÁÆóÊØèÂàóÁöÑÁº∫Â§±ÂÄºÊÄªÊï∞
    null_counts = df.isnull().sum()

    # Âè™Á≠õÈÄâÂá∫Â≠òÂú®Áº∫Â§±ÂÄºÁöÑÂàó (Optional: only show columns with nulls)
    missing_data = null_counts[null_counts > 0]

    print(f"\nüìÇ Dataset: {colored(name.upper(), 'yellow', attrs=['bold'])}")
    print(f"Total Rows: {len(df):,}")

    if not missing_data.empty:
        # ÂàóÂá∫ÊØè‰∏™ÊúâÁº∫Â§±ÂÄºÁöÑÂ≠óÊÆµÂèäÂÖ∂Êï∞Èáè
        for col, count in missing_data.items():
            percentage = (count / len(df)) * 100
            print(f"  - {col:<15} : {count:>8,} missing ({percentage:>6.2f}%)")
    else:
        print(colored("  ‚úÖ No missing values found!", 'green'))

print(colored("\n" + "=" * 50, 'blue'))

[1m[34mMISSING VALUES SUMMARY BY DATASET[0m

üìÇ Dataset: [1m[33mDIGITAL_SONGS[0m
Total Rows: 52,175
  - last_week       :        4 missing (  0.01%)
  - weeks_on_board  :    9,493 missing ( 18.19%)

üìÇ Dataset: [1m[33mBILLBOARD200[0m
Total Rows: 639,746
  - last_week       :        8 missing (  0.00%)
  - weeks_on_board  :   57,012 missing (  8.91%)

üìÇ Dataset: [1m[33mSTREAMING_SONGS[0m
Total Rows: 33,300
  - weeks_on_board  :    5,532 missing ( 16.61%)

üìÇ Dataset: [1m[33mRADIO[0m
Total Rows: 91,300
  - weeks_on_board  :    2,575 missing (  2.82%)
[34m


In [22]:
# 1. Ëé∑ÂèñÂΩìÂâç notebook Êñá‰ª∂ÁöÑÁªùÂØπË∑ØÂæÑ (Get absolute path of current notebook)
current_dir = os.path.dirname(os.path.abspath('__file__'))

# 2. ÂÆö‰ΩçÂà∞È°πÁõÆÊ†πÁõÆÂΩï (Locate project root)
project_root = os.path.abspath(os.path.join(current_dir, ".."))

# 3. ÂÆö‰πâËæìÂá∫ÁõÆÂΩïË∑ØÂæÑ (Define output directory path)
output_dir = os.path.join(project_root, 'output', 'batch_cleaned')

# 4. Â¶ÇÊûúÁõÆÂΩï‰∏çÂ≠òÂú®ÂàôÂàõÂª∫ (Create directory if not exists)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 5. Âæ™ÁéØÂØºÂá∫Â≠óÂÖ∏‰∏≠ÁöÑÊØè‰∏™ DataFrame (Iterate and export from the dictionary)
for name, df in dfs.items():
    # ‰ΩøÁî®ÁªùÂØπË∑ØÂæÑÊãºÊé•Êñá‰ª∂Âêç (Construct file path using absolute positioning)
    file_name = f"{name}_cleaned.csv"
    file_path = os.path.join(output_dir, file_name)

    # ÊâßË°åÂØºÂá∫ (Execute export)
    df.to_csv(file_path, index=False, encoding='utf-8-sig')

    print(colored(f"File strictly saved to: {file_path}", 'green'))

print(colored("\nAll batch files have been successfully exported!", 'blue', attrs=['bold']))

[32mFile strictly saved to: /Users/jesse/Desktop/DA_prj1_Music/output/batch_cleaned/digital_songs_cleaned.csv[0m
[32mFile strictly saved to: /Users/jesse/Desktop/DA_prj1_Music/output/batch_cleaned/billboard200_cleaned.csv[0m
[32mFile strictly saved to: /Users/jesse/Desktop/DA_prj1_Music/output/batch_cleaned/streaming_songs_cleaned.csv[0m
[32mFile strictly saved to: /Users/jesse/Desktop/DA_prj1_Music/output/batch_cleaned/radio_cleaned.csv[0m
[1m[34m
All batch files have been successfully exported![0m
