In [1]:
import pandas as pd
import os

In [2]:
# --- CONFIGURATION ---
# Paths may be absolute or relative ('.' means the current working directory).
# create_master_csv() passes every image path through os.path.normpath, which
# collapses the leading './' that joining with '.' would otherwise leave in.
# Note that paths are NOT made absolute (os.path.abspath is not applied).
# Folder that the relative image paths in the original label CSV are joined onto.
ORIGINAL_DATA_ROOT = '.' 
# Label CSV for the original (non-augmented) images.
ORIGINAL_CSV_PATH = os.path.join(ORIGINAL_DATA_ROOT, 'labels.csv')
# Folder tree that is walked for augmented images (label comes from the folder name).
AUGMENTED_DATA_ROOT = 'augmented'
# Destination of the merged, shuffled label table.
OUTPUT_CSV_PATH = 'combined_labels.csv'

In [3]:
# Column layout shared by both sources; used so empty inputs still yield a
# well-formed (header-only) table instead of a column-less frame.
_MASTER_COLUMNS = ['filepath', 'label', 'source']
_IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')
_AUG_FOLDER_PREFIX = 'aug_'


def _to_posix(path):
    """Normalise `path` (collapses 'A/./B') and force forward slashes, even on Windows."""
    return os.path.normpath(path).replace('\\', '/')


def _label_from_folder(folder_name):
    """Derive a label from an augmented folder name by dropping a leading 'aug_' only."""
    label = folder_name.strip().lower()
    # Strip the prefix only: str.replace would also eat 'aug_' in the middle of a name.
    if label.startswith(_AUG_FOLDER_PREFIX):
        label = label[len(_AUG_FOLDER_PREFIX):]
    return label


def _load_original_rows(csv_path, data_root):
    """Read the original label CSV and return a filepath/label/source frame."""
    df_orig = pd.read_csv(csv_path)
    df_orig.columns = [c.strip() for c in df_orig.columns]

    missing = [c for c in ('pth', 'label') if c not in df_orig.columns]
    if missing:
        raise KeyError(
            f"{csv_path} is missing required column(s) {missing}; "
            f"found {list(df_orig.columns)}"
        )

    df_clean = pd.DataFrame({
        'filepath': df_orig['pth'].apply(
            lambda rel: _to_posix(os.path.join(data_root, rel))
        ),
        'label': df_orig['label'].astype(str).str.strip().str.lower(),
    })
    df_clean['source'] = 'original'
    return df_clean[_MASTER_COLUMNS]


def _scan_augmented_rows(aug_root):
    """Walk `aug_root` and return one row per image file, labelled by its parent folder."""
    rows = []
    for root, dirs, files in os.walk(aug_root):
        # os.walk yields entries in filesystem order; sorting (dirs in place, so the
        # traversal itself is ordered) makes the seeded shuffle reproducible.
        dirs.sort()
        label = _label_from_folder(os.path.basename(root))
        for file in sorted(files):
            if file.lower().endswith(_IMAGE_EXTENSIONS):
                rows.append({
                    'filepath': _to_posix(os.path.join(root, file)),
                    'label': label,
                    'source': 'augmented',
                })
    return pd.DataFrame(rows, columns=_MASTER_COLUMNS)


def create_master_csv(original_csv_path=None, original_data_root=None,
                      augmented_data_root=None, output_csv_path=None):
    """Merge original and augmented image labels into one shuffled CSV.

    Original rows come from a label CSV with 'pth' and 'label' columns (header
    whitespace is stripped); 'pth' is joined onto the data root. Augmented rows
    come from walking a folder tree: every .jpg/.png/.jpeg file gets the
    lower-cased name of its parent folder, minus a leading 'aug_', as its label.
    All paths are normalised and written with forward slashes. The merged table
    (columns: filepath, label, source) is shuffled with a fixed seed and written
    to the output path. A missing input is reported and skipped; if both are
    missing, a header-only CSV is written.

    Args:
        original_csv_path: Label CSV to read. Defaults to ORIGINAL_CSV_PATH.
        original_data_root: Folder that 'pth' values are relative to.
            Defaults to ORIGINAL_DATA_ROOT.
        augmented_data_root: Folder tree to scan. Defaults to AUGMENTED_DATA_ROOT.
        output_csv_path: Where to write the result. Defaults to OUTPUT_CSV_PATH.

    Returns:
        None. The result is written to disk and a summary is printed.

    Raises:
        KeyError: If the original CSV exists but lacks a 'pth' or 'label' column.
    """
    # Resolve defaults at call time so edits to the config cell are picked up.
    if original_csv_path is None:
        original_csv_path = ORIGINAL_CSV_PATH
    if original_data_root is None:
        original_data_root = ORIGINAL_DATA_ROOT
    if augmented_data_root is None:
        augmented_data_root = AUGMENTED_DATA_ROOT
    if output_csv_path is None:
        output_csv_path = OUTPUT_CSV_PATH

    print(" Building Clean Master CSV...")

    # --- 1. PROCESS ORIGINAL DATA ---
    print("   Processing Original Dataset...")
    if os.path.isfile(original_csv_path):
        df_orig_clean = _load_original_rows(original_csv_path, original_data_root)
        print(f"   -> Added {len(df_orig_clean)} original images.")
    else:
        print("   ️ Original CSV not found. Skipping.")
        df_orig_clean = pd.DataFrame(columns=_MASTER_COLUMNS)

    # --- 2. PROCESS AUGMENTED DATA (Direct Folder Scan) ---
    print("   Scanning Augmented Folders...")
    if os.path.isdir(augmented_data_root):
        df_aug_clean = _scan_augmented_rows(augmented_data_root)
    else:
        print(f"   ️ Folder {augmented_data_root} not found.")
        df_aug_clean = pd.DataFrame(columns=_MASTER_COLUMNS)
    print(f"   -> Added {len(df_aug_clean)} augmented images.")

    # --- 3. MERGE & SAVE ---
    # Only concatenate frames that actually hold rows; fall back to an empty,
    # correctly-shaped frame so the CSV header and the preview below still work.
    parts = [part for part in (df_orig_clean, df_aug_clean) if not part.empty]
    if parts:
        final_df = pd.concat(parts, ignore_index=True)
        # Shuffle (fixed seed for reproducibility)
        final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)
    else:
        final_df = pd.DataFrame(columns=_MASTER_COLUMNS)

    final_df.to_csv(output_csv_path, index=False)

    print("-" * 30)
    print(f" DONE! Saved to: {output_csv_path}")
    print("Sample paths:")
    print(final_df['filepath'].head(3).values)

In [4]:
# Run the pipeline with the configured paths: writes the merged, shuffled
# label table to OUTPUT_CSV_PATH and prints a short summary.
create_master_csv()

 Building Clean Master CSV...
   Processing Original Dataset...
   -> Added 31002 original images.
   Scanning Augmented Folders...
   -> Added 8110 augmented images.
------------------------------
 DONE! Saved to: combined_labels.csv
Sample paths:
['surprise/image0027442.jpg' 'sad/image0027758.jpg' 'sad/image0020564.jpg']


In [6]:
# Count rows per unique value in the 'label' column of the combined CSV.
# Read from OUTPUT_CSV_PATH (not a repeated literal) so this cell always
# inspects the file the pipeline just wrote.
# NOTE: despite its name, df_aug holds the whole combined table (original and
# augmented rows); the name is kept because the following cell reads it.
df_aug = pd.read_csv(OUTPUT_CSV_PATH)
target_col = 'label'
# Map lower-cased header -> actual header so the lookup is case-insensitive.
cols_map = {c.lower(): c for c in df_aug.columns}
if target_col not in cols_map:
    print(f"Column '{target_col}' not found. Available columns: {list(df_aug.columns)}")
else:
    col = cols_map[target_col]
    # Cast to str and trim so stray whitespace does not split a label in two.
    counts = df_aug[col].astype(str).str.strip().value_counts()
    print("Number of rows per unique label:")
    print(counts)

Number of rows per unique label:
label
fear        4889
sad         4889
neutral     4889
contempt    4889
disgust     4889
surprise    4889
anger       4889
happy       4889
Name: count, dtype: int64


In [7]:
# Per-label row counts, restricted to rows whose source is 'augmented'.
# The source value is compared after trimming whitespace and lower-casing,
# and column headers are looked up case-insensitively.
target_col, source_col = 'label', 'source'

# lower-cased header -> header as it appears in the frame
cols_map = {}
for header in df_aug.columns:
    cols_map[header.lower()] = header

# First required column (label, then source) that is absent, if any.
missing_col = next(
    (name for name in (target_col, source_col) if name not in cols_map),
    None,
)

if missing_col is not None:
    print(f"Column '{missing_col}' not found. Available columns: {list(df_aug.columns)}")
else:
    col, src = cols_map[target_col], cols_map[source_col]

    normalized_source = df_aug[src].astype(str).str.strip().str.lower()
    df_augmented = df_aug[normalized_source == 'augmented']
    print(f"Total augmented rows: {len(df_augmented)}")

    stripped_labels = df_augmented[col].astype(str).str.strip()
    counts = stripped_labels.value_counts()
    print("Number of rows per unique label (augmented only):")
    print(counts)

Total augmented rows: 8110
Number of rows per unique label (augmented only):
label
neutral     1787
sad         1537
contempt    1301
fear        1136
disgust     1113
anger        729
happy        507
Name: count, dtype: int64
