In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
%matplotlib inline

In [2]:
df = pd.read_csv("./data_source.csv")
print("Total records = ", len(df))
df.head()

Total records =  64084


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,please_select_the_gender_of_the_person_in_the_picture,please_select_the_gender_of_the_person_in_the_picture:confidence,image_url,please_select_the_gender_of_the_person_in_the_picture_gold,user_id
0,1023132475,False,finalized,1,8/19/2016 17:00:25,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/40...,,40
1,1023132476,False,finalized,1,8/19/2016 17:00:48,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/42...,,42
2,1023132477,False,finalized,1,8/19/2016 17:01:43,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/44...,,44
3,1023132478,False,finalized,1,8/19/2016 17:01:04,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/47...,,47
4,1023132479,False,finalized,1,8/19/2016 17:00:48,male,1.0,https://d1qb2nb5cznatu.cloudfront.net/users/50...,,50


In [3]:
# select only the columns that we are interested in
df = df[["_unit_id", "please_select_the_gender_of_the_person_in_the_picture",
    "please_select_the_gender_of_the_person_in_the_picture:confidence", "image_url"]]
 
# rename the columns
df.columns = ["id", "gender", "confidence", "url"]
 
# only select the rows that has confidence of 1.0
df = df[df["confidence"] == 1]
 
print("Total records = ", len(df))

Total records =  64075


In [4]:
df.groupby("gender").count()

Unnamed: 0_level_0,id,confidence,url
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,7364,7364,7364
male,47592,47592,47592
unsure,9119,9119,9119


In [5]:
df_male = df[df["gender"] == "male"]
df_female = df[df["gender"] == "female"]
 
# to make both categories have equal number of samples
# we'll take the counts of the category that has lowest
# number of samples
min_samples = min(len(df_male), len(df_female))
 
# for indexing randomly
p = np.random.permutation(min_samples)
 
df_male = df_male.iloc[p]
df_female = df_female.iloc[p]
 
print("Total male samples = ", len(df_male))
print("Total female samples = ", len(df_female))
 
df = pd.concat([df_male, df_female]) 

Total male samples =  7364
Total female samples =  7364


In [6]:
import os
import requests
from io import BytesIO
from PIL import Image
 
def download_images(df, data_dir="./data"):
    genders = df["gender"].unique()
    for g in genders:
        g_dir = "{}/{}".format(data_dir, g)
        if not os.path.exists(g_dir):
            os.makedirs(g_dir)
           
    for index, row in tqdm.tqdm_notebook(df.iterrows()):
        filepath = "{}/{}/{}.jpg".format(data_dir, row["gender"], row["id"])
        if os.path.exists(filepath):
            continue
        try:
            resp = requests.get(row["url"])
            im = Image.open(BytesIO(resp.content))
            im.save(filepath)
        except:
            print("Error while downloading %s" % row["url"])
 
DATA_DIR = "./data"
download_images(df, data_dir=DATA_DIR)  
 
# create train/test folder for each gender
import glob
 
TRAIN_DIR = DATA_DIR + "/train"
TEST_DIR = DATA_DIR + "/test"
 
for d in [TRAIN_DIR, TEST_DIR]:
    for g in df["gender"].unique():
        final_dir = "{}/{}".format(d, g)
        if not os.path.exists(final_dir):
            os.makedirs(final_dir)
 
from random import shuffle
import math
import shutil
 
split_ratio = 0.7 # we'll reserve 70% of the images for training set
 
def validate_and_move(files, target_dir):
    for f in tqdm.tqdm_notebook(files):
        # try to open the file to make sure that this is not corrupted
        try:
            im = Image.open(f)
            shutil.copy(f, target_dir)
        except:
            pass
#             os.remove(f)
 
for gender in df["gender"].unique():
    gender_dir = "{}/{}".format(DATA_DIR, gender)
    pattern = "{}/*.jpg".format(gender_dir)
    all_files = glob.glob(pattern)
    shuffle(all_files)
   
    train_up_to = math.ceil(len(all_files) * split_ratio)
    train_files = all_files[:train_up_to]
    test_files = all_files[train_up_to:]
   
   
    validate_and_move(train_files, TRAIN_DIR + "/" + gender)
    validate_and_move(test_files, TEST_DIR + "/" + gender)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/48726-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/28727-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1387-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/30330-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/5745-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/6216-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/35753-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/68900-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1714-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/68239-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/43965-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/19991-large
Error while downloading https://d1qb2nb5czna

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/41143-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/24460-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/20216-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/21709-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/55858-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/9505-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/60827-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/51121-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/75757-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/3892-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/16746-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/70710-large
Error while downloading https://d1qb2nb5cz

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/30046-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/598-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/51242-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/314-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/44760-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/36411-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/69966-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/75528-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/5869-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/34736-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/67356-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/36351-large
Error while downloading https://d1qb2nb5cznat

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/39930-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/38312-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/55095-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/59534-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/64943-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/73592-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/19821-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/48476-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/3956-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/16541-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/70577-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/40967-large
Error while downloading https://d1qb2nb5c

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/37578-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/39345-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/61492-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/69059-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/44889-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/6440-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/54324-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/4469-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/44954-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/46729-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/36231-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/31813-large
Error while downloading https://d1qb2nb5cz

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/54301-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1332505-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/442516-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/687266-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/676284-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/292652-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/502430-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1183184-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1003201-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/549135-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/479627-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/978101-large
Error while downloading ht

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/551554-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1081867-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/957548-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1263966-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/322651-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/30926-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1211998-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/245727-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/640997-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/560229-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/529664-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1121569-large
Error while downloading h

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/557424-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/66142-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1037012-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/608859-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/847882-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1269284-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1481900-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1597170-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1346233-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/428525-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/743923-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1246011-large
Error while downloading

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/790322-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/167996-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1062958-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1598616-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/32677-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/43288-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/179392-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/698257-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/442267-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/212067-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/800462-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/71883-large
Error while downloading https

Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/435425-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1257338-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/966058-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1018827-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1397963-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1575804-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/256909-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/583123-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/337744-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/566223-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/1526556-large
Error while downloading https://d1qb2nb5cznatu.cloudfront.net/users/905057-large
Error while downloading

HBox(children=(IntProgress(value=0, max=5096), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2184), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5073), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2174), HTML(value='')))


