In [1]:
import cv2
import os
import matplotlib.pyplot as plt
import pandas as pd
import random
import shutil

In [2]:
influencers_data = '/Users/mshayganfar/sb_capstone/data/influencers.csv'

In [3]:
df_influencers = pd.read_csv(influencers_data)

In [4]:
df_influencers

Unnamed: 0,username,category,num_followers,num_followees,num_posts
0,makeupbynvs,beauty,1432.0,1089.0,363.0
1,jaquelinevandoski,beauty,137600.0,548.0,569.0
2,anisaartistry,beauty,64644.0,289.0,391.0
3,rubina_muartistry,beauty,496406.0,742.0,887.0
4,beautyxabbi,beauty,2050.0,1423.0,751.0
...,...,...,...,...,...
33929,alingzhang,other,4597.0,881.0,365.0
33930,flower.jini,other,27093.0,1599.0,664.0
33931,ester_starling,other,20621.0,23.0,783.0
33932,lovely___yul,other,12381.0,811.0,223.0


In [5]:
beauty_influencers_count = df_influencers[df_influencers['category'] == 'beauty'].username.count()

print(f"Beauty influencers count: {beauty_influencers_count}")

Beauty influencers count: 1541


In [8]:
NUM_OF_INFLUENCERS = 30
NUM_OF_FILES_PER_INFLUENCER = 10

In [64]:
# Randomly picking N usernames

category_cum_sum = 0
prev_count = 0

username_list = []

random.seed(42)
random_numbers = random.sample(range(0, beauty_influencers_count-1), NUM_OF_INFLUENCERS)

for row_index in random_numbers:
    username_list.append(df_influencers.iloc[row_index].username)
    
print(username_list)

['aurasmakeupbox', '_themakeupdoll', 'ohmymisty', 'aminaali_mua', 'tulipheels95', 'eucarinasoares', 'superstarfitandbeautytv', 'malvikasitlaniofficial', 'danielmartin', 'kourtneydee', 'fefakuniyoshi', 'voirbelle', 'pretaraujo', 'rowisingh', 'lavishlybritt', 'sarahpatricia_gill', 'morganlouise.plus', 'aimee.conroy', 'jamiemakeup', 'lifeshehas', 'creatingvisionsmua', 'gabriellecroix', 'luferraes', 'jjj_sso', 'vanillatrapsoul', 'j.mlx', 'teteclementino', 'wiskola', 'manudip74', 'ingasglam']


In [22]:
image_folder = '/Users/mshayganfar/Documents/Mahni/Influencers/images/'

In [65]:
# Reading image filenames.

filenames_list = []

for username in username_list:
    first_char = username[0]
    if first_char.isalpha() == False:
        first_char = '_'
    # check if file exist in destination
    if os.path.exists(image_folder + first_char):
        specific_folder = image_folder + first_char + '/'
        filenames = [filename for filename in os.listdir(specific_folder + '.') if filename.startswith(username)]
        filenames_list.append(filenames)
    else:
        print(f"Folder {first_char} doesn't exist!")

In [66]:
# Subsampling M number of image files per influencer.

subsampled_filename_list = []

random.seed(42)

for i in range(0, len(filenames_list)):
    influencer_image_filenames = []
    random_numbers = random.sample(range(0, len(filenames_list[i])-1), NUM_OF_FILES_PER_INFLUENCER)
    
    for file_index in random_numbers:
        influencer_image_filenames.append(filenames_list[i][file_index])
        
    subsampled_filename_list.append(influencer_image_filenames)

In [35]:
src_image_folders_base     = '/Users/mshayganfar/Documents/Mahni/Influencers/images/'
dst_image_folders_base     = '/Users/mshayganfar/Documents/Mahni/Influencers/Beauty/subset_images/'
resized_image_folders_base = '/Users/mshayganfar/Documents/Mahni/Influencers/Beauty/resized_images/'

In [67]:
# Copying the subsampled files into the destination folder.

for i in range(0, len(subsampled_filename_list)):
    for j in range(0, len(subsampled_filename_list[i])):
        filename = subsampled_filename_list[i][j]
        first_char = filename[0]
        if first_char.isalpha() == False:
            first_char = '_'
        # check if file exist in destination
        if os.path.exists(src_image_folders_base + first_char):
            specific_src_folder = src_image_folders_base + first_char + '/'
            shutil.copy(specific_src_folder + filename, dst_image_folders_base + filename)
        else:
            print(f"Folder {first_char} doesn't exist!")

### Image Preparation

In [37]:
def load_image(image_path, image_name):
    image = cv2.imread(os.path.join(image_path, image_name))
    # print(os.path.join(image_path, image_name))
    return image

In [38]:
def resize_image(src_image, width, height):
    # print('Original Image Dimensions: ', src_image.shape)
    
    dim = (width, height)
    
    # resize image
    resized_image = cv2.resize(src_image, dim, interpolation=cv2.INTER_AREA)
    
    # print('Resized Image Dimensions : ', resized_image.shape)
    
    return resized_image

In [39]:
def save_image(image_path, image_name, image):
    cv2.imwrite(os.path.join(image_path , image_name), image)

In [68]:
# Resizing and saving subsampled images into a new folder.

filenames = os.listdir(dst_image_folders_base + '.')
for filename in filenames:
    if filename[-3:] == 'jpg':
        loaded_image  = load_image(dst_image_folders_base, filename)
        resized_image = resize_image(loaded_image, 50, 50)
        save_image(resized_image_folders_base, 'small_' + filename, resized_image)

### Load Images

In [41]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [42]:
batch_size   = 32
image_height = 50
image_width  = 50

In [43]:
train_images = tf.keras.preprocessing.image_dataset_from_directory(
    resized_image_folders_base,
    labels='inferred',
    label_mode='int',
    class_names=['eye', 'face', 'hair', 'nail', 'products'],
    color_mode='rgb',
    batch_size=batch_size,
    image_size=(image_height, image_width),
    shuffle=True,
    seed=42,
    validation_split=0.15,
    subset="training"
)

ValueError: The `class_names` passed did not match the names of the subdirectories of the target directory. Expected: [], but received: ['eye', 'face', 'hair', 'nail', 'products']