In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import os

In [None]:
import timeit
import time
from tqdm import tqdm

from multiprocessing.dummy import Pool
from more_itertools import sliced

from PIL import Image

import ast
from pandas.core.common import flatten

import urllib.request

In [None]:
# Mount Google Drive under /content/drive/ (Colab only).
# force_remount=True asks Colab to mount again even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Preparing information about image merges for users

We create 3 merges (9 photos each) for every user, download them and label their category manually. After that we determine each user's category and propagate the category label to the user's other image merges according to the following rules:
* if the merges from the beginning and from the middle have the same label, or the merges from the middle and from the end do, we will mark all merges built from the images between them with this category label
* if all 3 merges have same labels, we will mark all merges from all user images with this category label
* if all 3 merges have different labels, or only the first and the last merges have the same label, we will use only the 3 existing merges in the following downloading and training (to prevent wrong labeling)

**NB!** Despite the fact that we partially automated the mark-up process, after downloading all images we manually checked the image classes.

**Initial data:** We use prepared csv file `posts_for_work` with the following structure:
* id (for user id)
* date (date of the post publication)
* im_url (url for downloading the image file)
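* likes (number of likes for the post [OPTIONAL])
* comments (number of the post's comments [OPTIONAL])
* followers_count (number of user's followers [OPTIONAL])
* is_downloaded (only rows where it equals 1 are kept)
* is_outlier (only rows where it equals 0 are kept)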

In [None]:
# Load the prepared posts table and keep only the usable rows:
# is_downloaded must equal 1 and is_outlier must equal 0.
# Only the columns needed further on are kept, with a fresh 0..n-1 index.
posts_df = (
    pd.read_csv('posts_for_work.csv')
    .loc[
        lambda df: (df['is_downloaded'] == 1.) & (df['is_outlier'] == 0.),
        ['id', 'date', 'im_url', 'likes', 'comments', 'followers_count'],
    ]
    .reset_index(drop=True)
)

In [None]:
# Per-user count of non-null values in every column (one row per user id);
# the 'date' column of this frame is used below as the number of posts per user.
users_posts_df = posts_df.groupby('id').count()

In [None]:
# user ids (the group keys) and, position by position, the posts count of each user
users = [*users_posts_df.index.values]
counts = [*users_posts_df['date'].values]

In [None]:
# Selecting users with >= 11 photos,
# to have at least 3 different merges (1: from 1 to 9 images, 2: 2-10, 3: 3-11)
users_for_photos_merge = [
    user for user, n_posts in zip(users, counts) if n_posts >= 11
]

In [None]:
# Keep only the posts of the users selected for merges.
posts_for_merge_df = posts_df[posts_df.id.isin(users_for_photos_merge)].reset_index(drop=True)
# Display-only preview sorted by id and date in descending order: the sorted
# frame is not assigned, so posts_for_merge_df itself keeps its current row order.
posts_for_merge_df.sort_values(by=['id', 'date'], ascending=False).reset_index(drop=True)

Unnamed: 0,id,date,im_url,likes,comments,followers_count
0,44700261741,1608465469,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,108,2,5632
1,44700261741,1608379102,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,134,1,5632
2,44700261741,1608278105,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,113,4,5632
3,44700261741,1607960552,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,275,0,5632
4,44700261741,1607931840,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,308,15,5632
...,...,...,...,...,...,...
151616,1091197,1596644657,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,6620,41,148084
151617,1091197,1596621865,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,10776,44,148084
151618,1091197,1596571325,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,7197,18,148084
151619,1091197,1596475464,https://instagram.fhel5-1.fna.fbcdn.net/v/t51....,10546,67,148084


In [None]:
# save selected posts (index=False: the row index is not written to the file)
posts_for_merge_df.to_csv('posts_for_merge.csv', index=False)

In [None]:
# main part for images grouping into merges
# For every selected user three merges of 9 consecutive images are recorded:
# one from the start, one from the middle and one from the end of the user's posts.
# The three result lists are filled in parallel (one entry per merge).
user_names = []
merge_names = []
images_to_merge = []
for user_id in users_for_photos_merge:
    user_df = posts_df[posts_df['id'] == user_id].reset_index(drop=True)
    # ids of the first images in the start, middle and final merges
    last_start = len(user_df) - 9
    merges_first_id = [0, last_start // 2, last_start]

    for start_i, start_id in enumerate(merges_first_id):
        # 9 consecutive image urls beginning at start_id
        user_imgs_to_merge = list(user_df.im_url.iloc[start_id:start_id + 9])

        # The bookkeeping has to happen inside this loop: when it runs after
        # the loop, only the last of the three merges is stored for each user.
        images_to_merge.append(user_imgs_to_merge)
        merge_names.append(str(user_id) + '_merge' + str(start_i + 1))
        user_names.append(user_id)

In [None]:
# one row per merge: its name, the owner's id and the list of its image urls
merges_dict = dict(merge_name=merge_names, user_id=user_names, images=images_to_merge)
merges_df = pd.DataFrame(merges_dict)

In [None]:
# Save the merges table for the labeling step. index=False is not passed, so the
# row index is written as an additional first column of the csv file.
merges_df.to_csv('merges_for_labeling.csv')

## Images uploading and merges creation

In [2]:
# Reload the saved merges table, keeping only the columns used below
merges = pd.read_csv('merges_for_labeling.csv').loc[:, ['merge_name', 'user_id', 'images']]

# Fix arrays after uploading from csv (the lists come back as strings: parse them)
merges['images'] = [ast.literal_eval(raw_list) for raw_list in merges['images'].values]

In [None]:
# expand the merges lists of images urls into one list of all images urls when we load images in the first step
# every entry is [merge_name, image_name, im_url], where image_name is
# '<merge_name>__<position of the image inside the merge>'
merge_list_list = list(merges.merge_name)
images_list = list(merges.images)
image_download_list = [
    [merge_name, merge_name + '__%i' % position, url]
    for merge_name, merge_urls in zip(merge_list_list, images_list)
    for position, url in enumerate(merge_urls)
]

In [None]:
# one row per image: the merge it belongs to, its generated image name and its url
df_images = pd.DataFrame(image_download_list,columns=['merge_name','image_name','im_url'])

In [None]:
from more_itertools import sliced

In [None]:
# (image_name, im_url) pairs, cut into consecutive chunks of group_len pairs;
# each chunk is handled by one worker call
group_len = 115
images = list(zip(df_images['image_name'], df_images['im_url']))
images_groups = [
    images[first:first + group_len]
    for first in range(0, len(images), group_len)
]  # for multiprocessing

In [None]:
# helper function for images downloading
def images_downloader(posts, main_path):
    """Download every image of one group to disk (best effort).

    Parameters
    ----------
    posts : sequence of records
        Each record is ``(image_name, im_url)`` or, for the second stage of
        loading, ``(image_name, im_url, class_lbl, user_id)``.
    main_path : str
        Root directory for the downloaded files.

    A 2-field record is stored as ``<main_path>/<image_name>``; a 4-field
    record is stored as ``<main_path>/<class_lbl>/<user_id>/<image_name>``.
    Missing target directories are created. A failed download is reported
    and skipped, so one bad url does not abort the rest of the group.

    Returns
    -------
    None
    """
    for post in posts:
        image_name, im_url = post[0], post[1]
        # The length of the record (not of the whole group) tells the two
        # stages apart, and the target directory is built from main_path for
        # every record instead of being appended to on each iteration.
        if len(post) == 4:
            target_dir = os.path.join(main_path, str(post[2]), str(post[3]))
        else:
            target_dir = main_path
        try:
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            # NOTE(review): the file is stored under image_name exactly as given,
            # no extension is appended here, while merge_images opens
            # '<img_name>.jpg' - confirm which naming is intended.
            urllib.request.urlretrieve(im_url, os.path.join(target_dir, image_name))
        except Exception as exc:
            # best effort: report the failure and go on with the next image
            print('failed to download %s: %s' % (im_url, exc))
            
# main function for parallel downloading images by prepared urls
def parallel_downloading(images_groupes, main_path, it=0, step=10, n_processes=10):
    """Download all groups of images with a pool of worker threads.

    Parameters
    ----------
    images_groupes : list
        Groups of image records; every group is passed to ``images_downloader``.
    main_path : str
        Directory the images are downloaded to (forwarded to ``images_downloader``).
        It is the second positional parameter because that is how the function
        is called in this notebook: ``parallel_downloading(groups, path)``.
    it : int
        Index of the first group to process (use it to resume an interrupted run).
    step : int
        Number of groups submitted to the pool per batch.
    n_processes : int
        Number of workers in the pool.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.

    The start index of each batch and the elapsed time are printed as progress.
    """
    if step < 1:
        raise ValueError('step must be a positive integer')

    start = timeit.default_timer()

    # Pool comes from multiprocessing.dummy (threads), so a local closure can be
    # mapped and no `if __name__ == '__main__'` guard is needed; such a guard
    # would silently skip all the work when this code is imported from a module.
    def download_group(group):
        return images_downloader(group, main_path)

    # The last (possibly shorter) batch is handled by the same loop.
    while it < len(images_groupes):
        print(it)
        pool = Pool(n_processes) # set processes number
        try:
            pool.map(download_group, images_groupes[it : it + step])
        finally:
            pool.close()
            pool.join()
        it = it + step
        print('time - %f' %(timeit.default_timer() - start))

In [None]:
%%time
# start downloading images to the <images_data_path> directory
# ('<images_data_path>' is a placeholder: put the real target directory here)
parallel_downloading(images_groups, '<images_data_path>')

In [None]:
# checking downloaded images
# The directory is listed here because, on a fresh kernel, no earlier cell
# defines the list of downloaded files.
# NOTE(review): assumes the images were downloaded to the '<images_data_path>'
# placeholder directory that is passed to parallel_downloading - confirm.
files_posts = os.listdir('<images_data_path>')
downloaded_im = [file_name.replace('.jpg', '') for file_name in files_posts]

# set label 1 to those images in DataFrame, which were downloaded (0 to the others);
# the column is rebuilt in one step, so re-running the cell gives the same result
df_images['is_downloaded'] = df_images.image_name.isin(downloaded_im).astype(float)

In [None]:
# number of downloaded images per merge (sum of the is_downloaded flags),
# indexed by merge_name
downl_post_count = df_images.groupby('merge_name')[['is_downloaded']].sum()

In [None]:
# select only those initial merges, where all 9 images were downloaded
is_complete = downl_post_count['is_downloaded'] == 9
complete_merges = downl_post_count.loc[is_complete].index
df_images = df_images[df_images['merge_name'].isin(complete_merges)]

In [None]:
# group images back for merges creation:
# {merge_name: [image names]} turned into a list of [merge_name, image_names] pairs
for_merges = df_images.groupby('merge_name')['image_name'].apply(list).to_dict()
for_merges_list = [
    [merge_name, image_names] for merge_name, image_names in for_merges.items()
]

# create groups (50 merges each) for parallel merges creation
merges_groups = list(sliced(for_merges_list, 50))

In [None]:
# function for 9 images concatenation
def concat_imgs(imgs_lst):
    """Paste nine images into a single 3x3 picture, filled row by row.

    The size of one grid cell is the size of the first image, and every image
    is pasted with its top-left corner at the top-left corner of its cell.
    Returns the new 'RGB' image.
    """
    first_img = imgs_lst[0]
    cell_w, cell_h = first_img.width, first_img.height
    dst = Image.new('RGB', (3 * cell_w, 3 * cell_h))
    for position in range(9):
        row, col = divmod(position, 3)
        dst.paste(imgs_lst[position], (cell_w * col, cell_h * row))
    return dst

In [None]:
# main images merging function
def merge_images(parameters):
    """Create and save the merged picture for every merge of one group.

    ``parameters`` is an iterable of ``[merge_name, image_names]`` items.
    For each item the images '<data_for_merges_path>/<image_name>.jpg' are
    opened, concatenated with ``concat_imgs`` and the result is saved as
    '<created_merges_path>/<merge_name>.jpg'.
    """
    for parameter_group in parameters:
        merges_name, image_names = parameter_group[0], parameter_group[1]
        uploaded_imgs = [
            Image.open('<data_for_merges_path>/' + img_name + '.jpg')
            for img_name in image_names
        ]
        merged = concat_imgs(uploaded_imgs)
        merged.save('<created_merges_path>/' + merges_name + '.jpg')

def parallel_merges_creation(merges_groups, it = 0, step = 10, n_processes=10):
    """Create the merged images for all groups with a pool of worker threads.

    Parameters
    ----------
    merges_groups : list
        Groups of merges; every group is passed to ``merge_images``.
    it : int
        Index of the first group to process. It defaults to 0 so that the
        first group is not skipped; pass another value to resume a run.
    step : int
        Number of groups submitted to the pool per batch.
    n_processes : int
        Number of workers in the pool.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.

    The start index of each batch and the elapsed time are printed as progress.
    """
    if step < 1:
        raise ValueError('step must be a positive integer')

    start = timeit.default_timer()

    # Pool comes from multiprocessing.dummy (threads), so no
    # `if __name__ == '__main__'` guard is needed; such a guard would silently
    # skip all the work when this code is imported from a module.
    # The final (possibly shorter) portion is handled by the same loop.
    while it < len(merges_groups):
        print(it)
        pool = Pool(n_processes) # set number of processes
        try:
            pool.map(merge_images, merges_groups[it : it + step])
        finally:
            pool.close()
            pool.join()
        it = it + step
        print('time - %f' %(timeit.default_timer() - start))

In [None]:
%%time
# build and save the merged image of every merge; the cell run time is reported by %%time
parallel_merges_creation(merges_groups)

In [None]:
# create final excel file with merges names (for labeling)
# The names are taken as they are listed in the 'merges' directory, so every
# merge_name keeps its file extension.
# NOTE(review): confirm that 'merges' is the directory the merged images were saved to.
files_posts = os.listdir('merges')
merges_labeling = pd.DataFrame(files_posts,columns=['merge_name'])
merges_labeling.to_excel('ready_merges_for_labeling.xlsx', index=False)