In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

SEED = 1234

In [2]:
df = pd.read_csv('/Users/nvw3/Downloads/celeba/list_attr_celeba.csv')
df.head()

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
1,000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
2,000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
3,000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
4,000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1


In [3]:
df_smiling = df[df['Smiling'] == 1]
df_non_smiling = df[df['Smiling'] == -1]
print(f"""Length of smiling: {len(df_smiling)} ({len(df_smiling) / len(df)}) 
Length of non-smiling: {len(df_non_smiling)} ({len(df_non_smiling) / len(df)}) """)

Length of smiling: 97669 (0.48208036564839907) 
Length of non-smiling: 104930 (0.5179196343516009) 


In [4]:
# Now need to generate indices with different levels of gender imbalance.
df_male = df[df['Male'] == 1]
df_female = df[df['Male'] == -1]
print(f"""Length of male: {len(df_male)} ({len(df_male) / len(df)}) 
Length of female: {len(df_female)} ({len(df_female) / len(df)}) """)

Length of male: 84434 (0.41675427815537097) 
Length of female: 118165 (0.5832457218446291) 


In [16]:
df_male_smiling = df[(df['Male'] == 1) & (df['Smiling'] == 1)]
df_female_smiling = df[(df['Male'] == -1) & (df['Smiling'] == 1)]

df_male_not_smiling = df[(df['Male'] == 1) & (df['Smiling'] == -1)]
df_female_not_smiling = df[(df['Male'] == -1) & (df['Smiling'] == -1)]

print("male smile, female smile, male no smile,female no smile:",len(df_male_smiling),len(df_female_smiling),len(df_male_not_smiling),len(df_female_not_smiling))

male smile, female smile, male no smile,female no smile: 33798 63871 50636 54294


In [20]:
# Going to use a dataset of 30,000 images
# 15,000 smiling, 15,000 non-smiling
# Going to create 6 different datasets with differing levels of gender imbalance
# But 50/50 smiling vs non smiling

def rebalance_dataset(females, males, df):
    df_male_smiling = df[(df['Male'] == 1) & (df['Smiling'] == 1)]
    df_female_smiling = df[(df['Male'] == -1) & (df['Smiling'] == 1)]

    df_male_not_smiling = df[(df['Male'] == 1) & (df['Smiling'] == -1)]
    df_female_not_smiling = df[(df['Male'] == -1) & (df['Smiling'] == -1)]
    """
    splitting data and ensureing that each group has the same amount by picking half samples from      each group
    TODO check why this wastn the case in the end though
    """
    
    if females > 0:
        rebalance_df_female_smiling = resample(df_female_smiling,
                                              replace=False,
                                              n_samples=int(females/2),
                                              random_state=SEED)
        
        rebalance_df_female_not_smiling = resample(df_female_not_smiling,
                                                  replace=False,
                                                  n_samples=int(females/2),
                                                  random_state=SEED)
    
    if males > 0:
        rebalance_df_male_smiling = resample(df_male_smiling,
                                              replace=False,
                                              n_samples=int(males/2),
                                              random_state=SEED)
        
        rebalance_df_male_not_smiling = resample(df_male_not_smiling,
                                                  replace=False,
                                                  n_samples=int(males/2),
                                                  random_state=SEED)
    if females == 0:
        rebalance_df = pd.concat([rebalance_df_male_smiling, rebalance_df_male_not_smiling])
        return rebalance_df
        
    elif males == 0:
        rebalance_df = pd.concat([rebalance_df_female_smiling, rebalance_df_female_not_smiling])
        return rebalance_df
        
    else:
        #Here they are probabbly added in oder
        rebalance_df = pd.concat([rebalance_df_male_smiling, rebalance_df_male_not_smiling,
                                  rebalance_df_female_smiling, rebalance_df_female_not_smiling])
        
        return rebalance_df


def test_rebalance(females, males, df):
    assert len(df[df['Male'] == -1]) == females
    assert len(df[df['Male'] == 1]) == males
    
    assert len(df[df['Smiling'] == -1]) == 15000 #Stopped assertion for small dataset
    assert len(df[df['Smiling'] == 1]) == 15000 # Checking its balanced
    
SAVE_FOLDER = '/Users/nvw3/Downloads/celeba/imbalanced_attr'
    
# 100% female, 0% male.
# 30000 female, 0 male.
df_30k_female = rebalance_dataset(females=30000, males=0, df=df)
test_rebalance(females=30000, males=0, df=df_30k_female)
df_30k_female.to_csv(f'{SAVE_FOLDER}/30k_female.csv', index=False)

#----- new addition below----
# 99.9% female, 0.1% male
# 29970 female, 30 male
df_29k97_female = rebalance_dataset(females=29970, males=30, df=df)
test_rebalance(females=29970, males=30, df=df_29k97_female)
df_29k97_female.to_csv(f'{SAVE_FOLDER}/29k97_female.csv', index=False)

# 99% female, 1% male
# 29700 female, 300 male
df_29k_female = rebalance_dataset(females=29700, males=300, df=df)
test_rebalance(females=29700, males=300, df=df_29k_female)
df_29k_female.to_csv(f'{SAVE_FOLDER}/29k_female.csv', index=False)

# 98% female, 2% male
female_n = 29400
male_n = 600
df_98i_female = rebalance_dataset(females=female_n, males=male_n, df=df)
test_rebalance(females=female_n, males=male_n, df=df_98i_female)
df_98i_female.to_csv(f'{SAVE_FOLDER}/98i_female.csv', index=False)

# 96% female, 4% male
female_n = 28800
male_n = 1200
df_96i_female = rebalance_dataset(females=female_n, males=male_n, df=df)
test_rebalance(females=female_n, males=male_n, df=df_96i_female)
df_96i_female.to_csv(f'{SAVE_FOLDER}/96i_female.csv', index=False)

# 95% female, 5% male
female_n = 28500
male_n = 1500
df_95i_female = rebalance_dataset(females=female_n, males=male_n, df=df)
test_rebalance(females=female_n, males=male_n, df=df_95i_female)
df_95i_female.to_csv(f'{SAVE_FOLDER}/95i_female.csv', index=False)

# 94% female, 6% male
female_n = 28200
male_n = 1800
df_94i_female = rebalance_dataset(females=female_n, males=male_n, df=df)
test_rebalance(females=female_n, males=male_n, df=df_94i_female)
df_94i_female.to_csv(f'{SAVE_FOLDER}/94i_female.csv', index=False)

# 92% female, 8% male
female_n = 27600
male_n = 2400
df_92i_female = rebalance_dataset(females=female_n, males=male_n, df=df)
test_rebalance(females=female_n, males=male_n, df=df_92i_female)
df_92i_female.to_csv(f'{SAVE_FOLDER}/92i_female.csv', index=False)


#----- new addition end ------ 

# 90% female, 10% male.
# 27000 female, 3000 male.
df_27k_female = rebalance_dataset(females=27000, males=3000, df=df)
test_rebalance(females=27000, males=3000, df=df_27k_female)
df_27k_female.to_csv(f'{SAVE_FOLDER}/27k_female.csv', index=False)


#------ new addition start -----
# 85% female, 15% male
female_n = 25500
male_n = 4500
df_85i_female = rebalance_dataset(females=female_n, males=male_n, df=df)
test_rebalance(females=female_n, males=male_n, df=df_85i_female)
df_85i_female.to_csv(f'{SAVE_FOLDER}/85i_female.csv', index=False)
#------ new addition end   -----

# 80% female, 20% male.
# 24000 female, 6000 male.
df_24k_female = rebalance_dataset(females=24000, males=6000, df=df)
test_rebalance(females=24000, males=6000, df=df_24k_female)
df_24k_female.to_csv(f'{SAVE_FOLDER}/24k_female.csv', index=False)

# 70% female, 30% male.
# 21000 female, 9000 male.
df_21k_female = rebalance_dataset(females=21000, males=9000, df=df)
test_rebalance(females=21000, males=9000, df=df_21k_female)
df_21k_female.to_csv(f'{SAVE_FOLDER}/21k_female.csv', index=False)

# 60% female, 40% male.
# 18000 female, 12000 male.
df_18k_female = rebalance_dataset(females=18000, males=12000, df=df)
test_rebalance(females=18000, males=12000, df=df_18k_female)
df_18k_female.to_csv(f'{SAVE_FOLDER}/18k_female.csv', index=False)

# 50% female, 50% male.
# 15000 female, 15000 male.
df_15k_female = rebalance_dataset(females=15000, males=15000, df=df)
test_rebalance(females=15000, males=15000, df=df_15k_female)
df_15k_female.to_csv(f'{SAVE_FOLDER}/15k_female.csv', index=False)



#The following are smaller versions of the bigger datasets to replicate results but in faster


# 100% female, 0% male.
# 30000 female, 0 male.
# df_30k_female = rebalance_dataset(females=29700, males=300, df=df)
# test_rebalance(females=30000, males=0, df=df_30k_female)
# df_30k_female.to_csv(f'{SAVE_FOLDER}/30k_female.csv', index=False)

# # 90% female, 10% male.
# # 270 female, 30 male.
# df_270_female = rebalance_dataset(females=270, males=30, df=df)
# test_rebalance(females=270, males=30, df=df_270_female)
# df_270_female.to_csv(f'{SAVE_FOLDER}/270_female.csv', index=False)

# # 90% female, 10% male.
# # 270 female, 30 male.
# df_2700_female = rebalance_dataset(females=2700, males=300, df=df)
# test_rebalance(females=2700, males=300, df=df_2700_female)
# df_2700_female.to_csv(f'{SAVE_FOLDER}/2700_female.csv', index=False)


#TESTING

# 95% female, 5% male.
# 270 female, 30 male.
#test_rebalance(females=2000, males=1000, df=df_29000_female)


df_21k_female_half = rebalance_dataset(females=10500, males=4500, df=df)

df_21k_female_half.to_csv(f'{SAVE_FOLDER}/21k_female_half.csv', index=False)


In [17]:
df_29000_female

NameError: name 'df_29000_female' is not defined

In [502]:
# Now need to generate indices with different levels of gender imbalance.
# df_male = df_balanced[df_balanced['Male'] == 1]
# df_female = df_balanced[df_balanced['Male'] == -1]
# print(f"""Length of male: {len(df_male)} ({len(df_male) / len(df_balanced)}) 
# Length of female: {len(df_female)} ({len(df_female) / len(df_balanced)}) """)

In [503]:
partition = pd.read_csv('/Users/nvw3/Downloads/celeba/list_eval_partition.csv')
partition

Unnamed: 0,image_id,partition
0,000001.jpg,0
1,000002.jpg,0
2,000003.jpg,0
3,000004.jpg,0
4,000005.jpg,0
...,...,...
202594,202595.jpg,2
202595,202596.jpg,2
202596,202597.jpg,2
202597,202598.jpg,2


In [504]:
partition['partition'].value_counts()

0    162770
2     19962
1     19867
Name: partition, dtype: int64

In [505]:
df.columns

Index(['image_id', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Attractive',
       'Bags_Under_Eyes', 'Bald', 'Bangs', 'Big_Lips', 'Big_Nose',
       'Black_Hair', 'Blond_Hair', 'Blurry', 'Brown_Hair', 'Bushy_Eyebrows',
       'Chubby', 'Double_Chin', 'Eyeglasses', 'Goatee', 'Gray_Hair',
       'Heavy_Makeup', 'High_Cheekbones', 'Male', 'Mouth_Slightly_Open',
       'Mustache', 'Narrow_Eyes', 'No_Beard', 'Oval_Face', 'Pale_Skin',
       'Pointy_Nose', 'Receding_Hairline', 'Rosy_Cheeks', 'Sideburns',
       'Smiling', 'Straight_Hair', 'Wavy_Hair', 'Wearing_Earrings',
       'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace',
       'Wearing_Necktie', 'Young'],
      dtype='object')

In [506]:
for i, name in enumerate([0, 1]):
    print(i, name)

0 0
1 1


In [507]:
males = df[df['Male'] == 1]
females = df[df['Male'] == -1]

In [508]:
young_males = df[(df['Male'] == 1) & (df['Young'] == 1)]
old_males = df[(df['Male'] == 1) & (df['Young'] == -1)]
young_females = df[(df['Male'] == -1) & (df['Young'] == 1)]
old_females = df[(df['Male'] == -1) & (df['Young'] == -1)]

In [509]:
print(f"""Young males: {len(young_males)}
Old males: {len(old_males)}
Young females: {len(young_females)}
Old females: {len(old_females)}""")

Young males: 53447
Old males: 30987
Young females: 103287
Old females: 14878


In [510]:
156734 / (156734 + 45865) # this means overly young people..and
# Could age have an effect aswell?

0.773616849046639

In [511]:
df['Wearing_Hat'].value_counts()

-1    192781
 1      9818
Name: Wearing_Hat, dtype: int64

In [512]:
protected_labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
predictions = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

all_three = [protected_labels, labels, predictions]

female_predict_labels = []
female_correct_labels = []

for count, i in enumerate(protected_labels):
    if i == 0:
        female_predict_labels.append(predictions[count])
        female_correct_labels.append(labels[count])
        

In [513]:
protected_labels == labels

True