In [1]:
import json
import polars as pl
from pathlib import Path

In [2]:
def df_prep(df_paths, emotions, primary_face, from_ntkb=True):
    """Load, filter and de-duplicate face rows from one or more parquet files.

    Args:
        df_paths: List of parquet file paths (strings) that are read together
            into a single frame.
        emotions: Collection of emotion labels to keep; rows whose ``emotion``
            value is not in it are dropped.
        primary_face: When truthy, keep only rows with ``face_index == 0``.
        from_ntkb: When truthy, prefix every path with ``'../'`` so the paths
            resolve when the code runs from the notebooks directory.

    Returns:
        A polars DataFrame with two added columns (``no_ext`` and
        ``clean_id``), no null ``face_path`` values, and exactly one row per
        ``clean_id`` (the first occurrence), in the original row order.
    """
    # Adjust paths if running from the notebooks dir (testing).
    if from_ntkb:
        df_paths = ['../' + path for path in df_paths]

    df = pl.read_parquet(df_paths)

    # `clean_id` is the last '_'-separated token of the image id once the
    # '.jpg' suffix is removed. Two separate with_columns calls are needed
    # because `clean_id` is derived from the freshly created `no_ext` column.
    df = df.with_columns(
        no_ext = pl.col('image_id').str.strip_suffix('.jpg')
    )
    df = df.with_columns(
        clean_id = pl.col('no_ext').str.split('_').list.last()
    )

    df = df.drop_nulls(subset=["face_path"])
    df = df.filter(pl.col("emotion").is_in(emotions))
    if primary_face:
        df = df.filter(pl.col("face_index") == 0)

    # De-duplicate on `clean_id` once, keeping the first occurrence, and
    # derive the number of removed rows from the result instead of running
    # `unique` a second time. maintain_order=True keeps the output row order
    # deterministic so the written files are reproducible between runs.
    total_rows = len(df)
    df = df.unique(subset=["clean_id"], keep="first", maintain_order=True)
    duplicates_count = total_rows - len(df)

    print(f"\nTotal number of rows with duplicates in 'id' column (rows removed): {duplicates_count}")

    print("\nDataFrame shape after dropping duplicates:")
    print(df.shape)

    return df

In [3]:
def process_single_sources(config):
    """Write one prepared parquet file per entry in ``config['single_sources']``.

    Every source path is handed to ``df_prep`` on its own (together with the
    configured emotions and primary-face flag, with notebook-relative path
    adjustment enabled). The resulting frame is saved as ``<name>.parquet``
    inside ``config['output_dir']``, which is created when missing.
    """
    target_dir = Path(config['output_dir'])
    target_dir.mkdir(parents=True, exist_ok=True)

    wanted_emotions = config['emotions']
    only_primary = config['primary_face']

    for source_name, source_path in config['single_sources'].items():
        prepared = df_prep(
            [source_path],
            emotions=wanted_emotions,
            primary_face=only_primary,
            from_ntkb=True,
        )
        prepared.write_parquet(target_dir.joinpath(source_name + '.parquet'))

In [4]:
# Merge configuration, addressed relative to the notebooks directory.
config_path = '../configs/aggregation_mixing/input_merging.json'
with Path(config_path).open("r") as config_file:
    config = json.load(config_file)

In [5]:
# Show the parsed configuration for a quick visual check.
config

{'output_dir': '../configs/aggregation_mixing',
 'emotions': ['angry', 'fear', 'happy', 'sad', 'surprise'],
 'primary_face': True,
 'single_sources': {'pexels_v1': 'data/processed/pexels_v1/summaries/training_data.parquet',
  'pexels_v2': 'data/processed/pexels_v2/summaries/training_data.parquet',
  'pixabay_v1': 'data/processed/pixabay_v1/summaries/training_data.parquet',
  'pixabay_v2': 'data/processed/pixabay_v2/summaries/training_data.parquet'},
 'combine_sources': {'pexels_v3': ['pexels_v1', 'pexels_v2'],
  'pixabay_v3': ['pixabay_v1', 'pixabay_v2'],
  'pexpix_v1': ['pexels_v1', 'pixabay_v1'],
  'pexpix_v2': ['pexels_v2', 'pixabay_v2'],
  'pexpix_v3': ['pexels_v1', 'pixabay_v1', 'pexels_v2', 'pixabay_v2']},
 'fer2013': '../data/fer-2013/',
 'raf_db': '../data/raf-db/DATASET/'}

In [6]:
# Write one prepared parquet file per entry in config['single_sources'].
process_single_sources(config)


Total number of rows with duplicates in 'id' column (rows removed): 0

DataFrame shape after dropping duplicates:
(2242, 24)

Total number of rows with duplicates in 'id' column (rows removed): 0

DataFrame shape after dropping duplicates:
(4517, 24)

Total number of rows with duplicates in 'id' column (rows removed): 0

DataFrame shape after dropping duplicates:
(1553, 24)

Total number of rows with duplicates in 'id' column (rows removed): 0

DataFrame shape after dropping duplicates:
(2132, 24)


In [7]:
def combine_sources(config):
    """Write one prepared parquet file per entry in ``config['combine_sources']``.

    Each entry maps an output name to a list of keys of
    ``config['single_sources']``. The corresponding parquet paths are passed
    together to ``df_prep`` (with the configured emotions and primary-face
    flag, with notebook-relative path adjustment enabled), and the result is
    saved as ``<name>.parquet`` inside ``config['output_dir']``, which is
    created when missing.
    """
    target_dir = Path(config['output_dir'])
    target_dir.mkdir(parents=True, exist_ok=True)

    wanted_emotions = config['emotions']
    only_primary = config['primary_face']

    for combo_name, members in config['combine_sources'].items():
        # Resolve every member key to its parquet path, preserving list order.
        member_paths = [config["single_sources"][member] for member in members]

        merged = df_prep(
            member_paths,
            emotions=wanted_emotions,
            primary_face=only_primary,
            from_ntkb=True,
        )
        merged.write_parquet(target_dir.joinpath(combo_name + '.parquet'))

In [8]:
# Write one prepared parquet file per entry in config['combine_sources'].
combine_sources(config)


Total number of rows with duplicates in 'id' column (rows removed): 923

DataFrame shape after dropping duplicates:
(5836, 24)

Total number of rows with duplicates in 'id' column (rows removed): 549

DataFrame shape after dropping duplicates:
(3136, 24)

Total number of rows with duplicates in 'id' column (rows removed): 0

DataFrame shape after dropping duplicates:
(3795, 24)

Total number of rows with duplicates in 'id' column (rows removed): 1

DataFrame shape after dropping duplicates:
(6648, 24)

Total number of rows with duplicates in 'id' column (rows removed): 1474

DataFrame shape after dropping duplicates:
(8970, 24)


In [9]:
def prep_fer(config):
    """Index the FER-2013 image folders and write parquet summaries.

    Walks ``<fer2013>/<usage>/<emotion>/*.jpg`` (two directory levels below
    the path stored under ``config['fer2013']``) and records one row per
    image with the columns ``data_source``, ``usage``, ``emotion`` and
    ``face_path``. ``usage`` and ``emotion`` are taken from the directory
    names.

    Files written:
        * ``<fer2013>/all_images.parquet`` - every image found.
        * ``<fer2013>/selected_emotions_images.parquet`` - rows whose emotion
          is listed in ``config['emotions']``.
        * ``<fer2013>/train_images.parquet`` and
          ``<output_dir>/fer_2013.parquet`` - selected rows with usage
          ``train``.
        * ``<fer2013>/test_images.parquet`` - selected rows with usage
          ``test``.

    Args:
        config: Mapping providing ``output_dir``, ``emotions`` and
            ``fer2013``. ``output_dir`` is created when missing.
    """
    output_dir = Path(config['output_dir'])
    output_dir.mkdir(parents=True, exist_ok=True)

    emotions = config['emotions']

    fer_path = Path(config.get("fer2013"))

    records = []

    for usage_dir in fer_path.iterdir():
        # Skip plain files such as parquet summaries from an earlier run.
        if not usage_dir.is_dir():
            continue

        usage = usage_dir.name

        for emotion_dir in usage_dir.iterdir():
            if not emotion_dir.is_dir():
                continue

            emotion = emotion_dir.name

            for img_path in emotion_dir.glob('*.jpg'):
                # Paths are stored relative to the parent of the dataset
                # folder and re-rooted under 'data/'. as_posix() keeps the
                # separators consistent with that '/' prefix on every OS.
                rel_path = img_path.relative_to(fer_path.parent).as_posix()
                records.append({
                    'data_source': "fer2013",
                    'usage': usage,
                    'emotion': emotion,
                    'face_path': 'data/' + rel_path
                })

    df = pl.DataFrame(records)
    df.write_parquet(fer_path / "all_images.parquet")

    df = df.filter(pl.col("emotion").is_in(emotions))
    df.write_parquet(fer_path / "selected_emotions_images.parquet")

    train = df.filter(pl.col("usage") == "train")
    train.write_parquet(fer_path / "train_images.parquet")
    train.write_parquet(output_dir / "fer_2013.parquet")

    # The test split must select the 'test' usage; filtering on 'train' here
    # would make test_images.parquet a copy of the training rows.
    test = df.filter(pl.col("usage") == "test")
    test.write_parquet(fer_path / "test_images.parquet")

In [10]:
# Index the FER-2013 image folders and write their parquet summaries.
prep_fer(config)

In [11]:
def prep_raf(config):
    """Index the RAF-DB image folders and write parquet summaries.

    Walks ``<raf_db>/<usage>/<emotion>/*.jpg`` (two directory levels below
    the path stored under ``config['raf_db']``) and records one row per image
    with the columns ``data_source``, ``usage``, ``emotion`` and
    ``face_path``. ``usage`` and ``emotion`` are taken from the directory
    names.

    Files written:
        * ``<raf_db>/all_images.parquet`` - every image found.
        * ``<raf_db>/selected_emotions_images.parquet`` - rows whose emotion
          is listed in ``config['emotions']``.
        * ``<raf_db>/train_images.parquet`` and
          ``<output_dir>/raf_db.parquet`` - selected rows with usage
          ``train``.
        * ``<raf_db>/test_images.parquet`` - selected rows with usage
          ``test``.

    Args:
        config: Mapping providing ``output_dir``, ``emotions`` and
            ``raf_db``. ``output_dir`` is created when missing.
    """
    output_dir = Path(config['output_dir'])
    output_dir.mkdir(parents=True, exist_ok=True)

    emotions = config['emotions']

    raf_path = Path(config.get("raf_db"))

    records = []

    for usage_dir in raf_path.iterdir():
        # Skip plain files such as parquet summaries from an earlier run.
        if not usage_dir.is_dir():
            continue

        usage = usage_dir.name

        for emotion_dir in usage_dir.iterdir():
            if not emotion_dir.is_dir():
                continue

            # NOTE(review): the emotion filter below only matches when these
            # directory names equal the labels in config['emotions'] -
            # confirm for this dataset's folder layout.
            emotion = emotion_dir.name

            for img_path in emotion_dir.glob('*.jpg'):
                # Paths are stored relative to the directory two levels above
                # raf_path and re-rooted under 'data/'. as_posix() keeps the
                # separators consistent with that '/' prefix on every OS.
                rel_path = img_path.relative_to(raf_path.parent.parent).as_posix()
                records.append({
                    'data_source': "raf_db",
                    'usage': usage,
                    'emotion': emotion,
                    'face_path': 'data/' + rel_path
                })

    df = pl.DataFrame(records)
    df.write_parquet(raf_path / "all_images.parquet")

    df = df.filter(pl.col("emotion").is_in(emotions))
    df.write_parquet(raf_path / "selected_emotions_images.parquet")

    train = df.filter(pl.col("usage") == "train")
    train.write_parquet(raf_path / "train_images.parquet")
    train.write_parquet(output_dir / "raf_db.parquet")

    # The test split must select the 'test' usage; filtering on 'train' here
    # would make test_images.parquet a copy of the training rows.
    test = df.filter(pl.col("usage") == "test")
    test.write_parquet(raf_path / "test_images.parquet")

In [12]:
# Index the RAF-DB image folders and write their parquet summaries.
prep_raf(config)