In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

SEED = 1234 # fixed random_state passed to every resample() call in rebalance_dataset, so the drawn subsets are repeatable

In [2]:
# Load the CelebA attribute table and preview its first rows.
# EDIT: point this path at your local copy of list_attr_celeba.csv.
attr_csv_path = '/Users/nvw3/Downloads/celeba/list_attr_celeba.csv'
df = pd.read_csv(attr_csv_path)
df.head()

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
1,000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
2,000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
3,000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
4,000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1


In [3]:
# Split the table on the 'Smiling' label (1 vs. -1) and report the size and share of each part.
smiling_mask = df['Smiling'].eq(1)
non_smiling_mask = df['Smiling'].eq(-1)
df_smiling = df.loc[smiling_mask]
df_non_smiling = df.loc[non_smiling_mask]
print(f"""Length of smiling: {len(df_smiling)} ({len(df_smiling) / len(df)}) 
Length of non-smiling: {len(df_non_smiling)} ({len(df_non_smiling) / len(df)}) """)

Length of smiling: 97669 (0.48208036564839907) 
Length of non-smiling: 104930 (0.5179196343516009) 


In [4]:
# Overview of the gender split ('Male' is 1 or -1) before building subsets
# with different levels of gender imbalance.
male_mask = df['Male'].eq(1)
female_mask = df['Male'].eq(-1)
df_male = df.loc[male_mask]
df_female = df.loc[female_mask]
print(f"""Length of male: {len(df_male)} ({len(df_male) / len(df)}) 
Length of female: {len(df_female)} ({len(df_female) / len(df)}) """)

Length of male: 84434 (0.41675427815537097) 
Length of female: 118165 (0.5832457218446291) 


In [5]:
# Sizes of the four gender x smiling subgroups in the full dataset.
is_male = df['Male'] == 1
is_female = df['Male'] == -1
is_smiling = df['Smiling'] == 1
is_not_smiling = df['Smiling'] == -1

df_male_smiling = df[is_male & is_smiling]
df_female_smiling = df[is_female & is_smiling]

df_male_not_smiling = df[is_male & is_not_smiling]
df_female_not_smiling = df[is_female & is_not_smiling]

print(
    "male smile, female smile, male no smile,female no smile:",
    len(df_male_smiling),
    len(df_female_smiling),
    len(df_male_not_smiling),
    len(df_female_not_smiling),
)

male smile, female smile, male no smile,female no smile: 33798 63871 50636 54294


In [9]:
# Going to use a dataset of 30,000 images
# 15,000 smiling, 15,000 non-smiling

def rebalance_dataset(females, males, df):
    """
    Draw a subsample of df with a chosen gender mix in which every gender
    contributes as many smiling as non-smiling rows.

    Each gender's count is split in half: int(count / 2) rows are drawn without
    replacement from that gender's smiling subgroup and the same number from its
    non-smiling subgroup. Every draw uses random_state=SEED. Because of the
    halving, an odd count yields one row fewer than requested for that gender.

    females: int: number of female samples (rows with Male == -1) to be used
    males: int: number of male samples (rows with Male == 1) to be used
    df: pandas DataFrame: source table with 'Male' and 'Smiling' columns; only read, never modified
    returns: pandas DataFrame: the sampled rows with their original index labels, concatenated
        in the order male smiling, male not smiling, female smiling, female not smiling
        (a gender with a count of 0 is left out). When both counts are 0 an empty frame
        with the columns of df is returned.
    raises: ValueError: if females or males is negative
    """
    if females < 0 or males < 0:
        raise ValueError(
            f"females and males must be non-negative, got females={females}, males={males}"
        )

    def _sample_half(male_value, smiling_value, count):
        # Half of this gender's quota, drawn from one (gender, smiling) subgroup.
        subgroup = df[(df['Male'] == male_value) & (df['Smiling'] == smiling_value)]
        return resample(subgroup,
                        replace=False,
                        n_samples=int(count / 2),
                        random_state=SEED)

    # The order of the pieces is kept fixed: males first, smiling before not smiling.
    pieces = []
    if males > 0:
        pieces.append(_sample_half(1, 1, males))
        pieces.append(_sample_half(1, -1, males))
    if females > 0:
        pieces.append(_sample_half(-1, 1, females))
        pieces.append(_sample_half(-1, -1, females))

    if not pieces:
        # Nothing requested: hand back an empty frame that still has df's columns.
        return df.iloc[0:0].copy()

    return pd.concat(pieces)


def test_rebalance(females, males, df):
    """
    Testing that the rebalancing worked out fine.
    """
    assert len(df[df['Male'] == -1]) == females
    assert len(df[df['Male'] == 1]) == males
    
    assert len(df[df['Smiling'] == -1]) == 15000 # Stopped assertion for small dataset
    assert len(df[df['Smiling'] == 1]) == 15000 # Checking its balanced
    
# EDIT this path so the files are stored in the desired location.
# This cell does not create the folder.
SAVE_FOLDER = '/Users/nvw3/Downloads/celeba/imbalanced_attr'

# Multiple dataset splits: number of rows requested for every generated dataset.
total = 30000

# Fraction of female samples in each generated dataset,
# from all-female (1) down to an even gender split (.5).
femalePercentageList = [
    1,
    .999,
    .99,
    .98,
    .96,
    .95,
    .94,
    .92,
    .9,
    .85,
    .8,
    .7,
    .5
]

# Create differently imbalanced CSVs based on femalePercentageList. Each file is named
# after its female fraction (e.g. 0.95.csv), to be named in the yaml file.
for imbalance in femalePercentageList:
    # round() instead of int(): a float product can land just below the intended
    # whole number, and int() would then truncate it to one sample too few.
    f_samples = round(total * imbalance)
    m_samples = total - f_samples
    imbalance_label = str(imbalance)
    current_df = rebalance_dataset(females=f_samples, males=m_samples, df=df)
    test_rebalance(females=f_samples, males=m_samples, df=current_df)
    current_df.to_csv(f"{SAVE_FOLDER}/{imbalance_label}.csv", index=False)
