# Import and functions

In [1]:
import sys
import os
import re
import gzip
import pathlib
import dhash
import pandas as pd
import shutil

from tqdm import trange, tqdm

from PIL import Image
from pillow_heif import register_heif_opener

import time
import datetime

import signal

In [10]:
extension_image_l = [".heic", ".jpg", ".jpeg", ".png"]; extension_image_l += [s.upper() for s in extension_image_l]
extension_video_l = [".mov", ".mp4", ".mpg"];  extension_video_l += [s.upper() for s in extension_video_l]

timelimit_seconds = 10 

## Functions

# Custom exception for the timeout
# https://stackoverflow.com/questions/51712256/how-to-skip-to-the-next-input-if-time-is-out-in-python
class TimeoutException(Exception):
    pass

# Handler function to be called when SIGALRM is received
def sigalrm_handler(signum, frame):
    # We get signal!
    raise TimeoutException()

def media_typeid(fn):
    image = 0; video = 0
    for extension in extension_image_l:
        if fn.endswith(extension):
            return "image"
    if image == 0:
        for extension in extension_video_l:
            if fn.endswith(extension):
                return "video"
    return "unknown"  


# - identify file creation date (file or exif data)

def file_creationdate(fn):
    fname = pathlib.Path(fn)
    mtime = datetime.datetime.fromtimestamp(fname.stat().st_mtime)
    return [str(mtime.year), str(mtime.month).zfill(2)]
    
def file_datepatternid(fn):
    
    pattern1 = re.compile(r"^20\d{6}_[a-zA-Z0-9_ ]*\.jp(e)?g$")
    pattern2 = re.compile(r"^20\d{2}-\d{2}-\d{2} [a-zA-Z0-9-() ]*.jp(e)?g$")
    pattern3 = re.compile(r"^IMG-20\d{6}-[a-zA-Z0-9_ ]*\.jp(e)?g$")
    pattern4 = re.compile(r"^IMG_20\d{6}_[a-zA-Z0-9_ ]*\.jp(e)?g$")
    
    if pattern1.match(fn):
        year = fn[:4]
        mo   = fn[4:6]
        #print("! datepattern found -", fn)
        return [year, mo, fn]
    elif pattern2.match(fn):
        year = fn.split("-")[0]
        mo   = fn.split("-")[1]
        #print("! datepattern found -", fn)
        return [year, mo, fn]
    elif pattern3.match(fn) or pattern4.match(fn):
        year = fn[4:8]
        mo = fn[8:10]
        #print("! datepattern found -", fn)
        return [year, mo, fn]
    return ["", "", fn]

def image_creationdate(fn):
    """
    Returns year and month based on either
      1) file name
      2) exif
      3) creation date 
    """
    
    # - attempt to get date from filename
    creationdate = file_datepatternid(os.path.basename(fn))
    if creationdate[0] != "":
        return ["filename"]+creationdate[:-1]
    
    # - try to extract exif
    try:    
        exif = Image.open(fn)._getexif()
        if exif is None:
            return ["creationdate"]+file_creationdate(fn) 
        elif 36867 in exif:
            yearmo_candidate = exif[36867].split()[0].split(":")[:2]
            if yearmo_candidate[0] != "": return ["exif"] + yearmo_candidate
        return ["creationdate"]+file_creationdate(fn) 
    except:
        return ["creationdate"]+file_creationdate(fn) 

def video_creationdate(fn):
    """
    Returns year and month based on either
      1) file name
      2) creation date 
    """
    
    # - attempt to get date from filename
    creationdate = file_datepatternid(os.path.basename(fn))
    if creationdate[0] != -1:
        return ["filename"]+creationdate[:-1]
    else:
        return ["creationdate"]+file_creationdate(fn)

def media_info(media_fn):
    
    mediatype = media_typeid(media_fn)

    if mediatype=="image":

        image = Image.open(media_fn)
        try:
            image_hash = str(dhash.dhash_int(image, size=8))
        except:
            image_hash = "FAILED"
            
        image_filesize = os.path.getsize(media_fn)
        image_date = image_creationdate(media_fn) 

        return( 
            {
                "folder": os.path.dirname(media_fn),
                "basename": os.path.basename(media_fn),
                "type": "image",
                "dhash8": image_hash,
                "filesize": image_filesize,
                "datecreation_source": image_date[0],
                "datecreation_year":   image_date[1],
                "datecreation_month":  image_date[2],
                })

    elif mediatype=="video":

        video_filesize = os.path.getsize(media_fn)
        video_date = video_creationdate(media_fn) 

        return(
            {
                "folder": os.path.dirname(media_fn),
                "basename": os.path.basename(media_fn),
                "type": "video",
                "dhash8": "",
                "filesize": video_filesize,
                "datecreation_source": video_date[0],
                "datecreation_year":   video_date[1],
                "datecreation_month":  video_date[2],
                })

    else:
        
        return(
            {
                "folder": os.path.dirname(media_fn),
                "basename": os.path.basename(media_fn),
                "type": "other",
                "dhash8": "",
                "filesize": 0,
                "datecreation_source": "NA",
                "datecreation_year":   "0000",
                "datecreation_month":  "00",
                })


def media_createdb_backup012023(mediafolder_in, skipfolder=["Corbeille"]):
    """
    - mediafolder_in      folder where source media are located
    """    

    ## List media and get info
    media_info_d = {}
    
    for root, subdirs, files in os.walk(mediafolder_in):
        
        print(">", root)
        # -  if folder contains skipfolder, skip
        for _ in skipfolder:
            if _ in root: continue
        
        # - collect media data
        for filename in files:
            
            # - if filename begins with ., skip
            if filename.startswith("."): continue

            # - create complete path from root and filename
            fn = os.path.join(root, filename)
            
            # - collect info
            info = media_info(fn)
            
            media_info_d[fn] = info
            
    return media_info_d


def media2backupscript(mediafolder_in, imagefolder_out, videofolder_out, script_fn, escape_space=1, force=0):
    """
    - mediafolder_in      folder where source media are located
    - imagefolder_out     folder where backup images are stored
    - videofolder_out     folder where backup videos are stored
    """
    
    ## List media already stored
    imagefolder_map_s = set(mediafolder_map("/Users/matthieupichaud/pCloud Drive/Photos/").keys())
    videofolder_map_s = set(mediafolder_map("/Users/matthieupichaud/pCloud Drive/Videos/").keys())
    
    ## List media
    with open(script_fn, "wt") as fo:
        print("set +e", file=fo)
        for root, subdirs, files in os.walk(mediafolder_in):
            for filename in files:

                if not force:
                    if filename in imagefolder_map_s or filename in videofolder_map_s: continue
                
                # Set up signal handler for SIGALRM, saving previous value
                old_handler = signal.signal(signal.SIGALRM, sigalrm_handler)
                # Start timer
                signal.alarm(timelimit_seconds)

                try:

                        fn = os.path.join(root, filename)

                        # identify media type
                        mediatype = media_typeid(fn)   

                        if mediatype=="image":
                            creationdate = image_creationdate(fn)[1:]
                            target_folder = os.path.join(imagefolder_out, creationdate[0], creationdate[1])
                            if escape_space: fn_ = fn.replace("(", "\(").replace(")", "\)") #.replace(' ', '\ ')
                            else: fn_ = fn
                            if "'" not in fn_: print(f"mkdir -p '{target_folder}'; rsync -a -v --ignore-existing '{fn_}' '{target_folder}/'", file=fo)                
                            else:              print(f'mkdir -p "{target_folder}"; rsync -a -v --ignore-existing "{fn_}" "{target_folder}/"', file=fo)  

                        if mediatype=="video":
                            fndt = datetime.datetime.fromtimestamp(os.path.getmtime(fn))
                            creationdate = video_creationdate(fn)[1:]
                            target_folder = os.path.join(videofolder_out, creationdate[0], creationdate[1])
                            if escape_space: fn_ = fn.replace("(", "\(").replace(")", "\)") #.replace(' ', '\ ')
                            else: fn_ = fn
                            if "'" not in fn_: print(f"mkdir -p '{target_folder}'; rsync -a -v --ignore-existing '{fn_}' '{target_folder}/'", file=fo)  
                            else:              print(f'mkdir -p "{target_folder}"; rsync -a -v --ignore-existing "{fn_}" "{target_folder}/"', file=fo)  

                except TimeoutException:
                    print(f"#!!! Could not read {fn}")

                finally:
                    # Turn off timer
                    signal.alarm(0)
                    # Restore handler to previous value
                    signal.signal(signal.SIGALRM, old_handler)
    
    return 1


## ====================================================================================
##   HEIC files
##   - helps convert HEIC files in jpeg
## ====================================================================================

def heic2jpg(fn):

    # - convert heic to jpg
    print(f"# Convert heic file to jpg - {fn}")
    basename, ext = os.path.splitext(fn)
    register_heif_opener()
    image = Image.open(fn)
    exif = image.info['exif']
    image.save(basename+".converted.jpg", "JPEG", exif=exif)
    fn = basename+".converted.jpg"
        
    return fn

def mediafolder_heicconvert(media_folder):
    for root, subdirs, files in os.walk(media_folder):
        for filename in files:
            basename, ext =  os.path.splitext(filename)
            if ext.lower()==".heig" or ext.lower()==".heic":
                fn = os.path.join(root, filename)
                try:
                    heic2jpg(fn)
                except:
                    print("# could not convert", fn)
                
    return 1


# Media info database

## Functions

In [3]:
## ================================================================================
##    media db
## ================================================================================

def media_todb_old(media_folder, skipfolder=["Corbeille"], media_info_d={}, outputtype="dict"):
    """
    Input
    - media_folder      folder where source media are located
    
    Output
    - DataFrame         pandas dataframe that contains information about each media file
    """    

    print("media_todb")
    
    # - get basename from media_info_d
    basename_original_l = media_info_df["basename"].to_list()
    
    # - list media and get info
    for root, subdirs, files in os.walk(media_folder):
        
        print(">", root)
        # -  if folder contains skipfolder, skip
        for _ in skipfolder:
            if _ in root: continue
        
        # - files --> complete path
        fn_l = [os.path.join(root, basename) for basename in files if basename not in basename_original_l]
        
        # - collect media data
        media_info_d = media_filelist2infodict(fn_l, media_info_d)
        
    if outputtype=="dict":
        return media_info_d
    
    elif outputtype=="dataframe":
        # - create dataframe
        return pd.DataFrame\
                    .from_dict(media_info_d, orient='index')\
                    .reset_index().drop(columns=["index"])\
                    .sort_values(["dhash8", "basename", "folder"])
    else:
        return 0


def media_filelist2infodict(fn_l, media_info_d = {}):

    # - collect media data
    for ifn in trange(len(fn_l)): 

        fn = fn_l[ifn]

        # - if filename begins with ., skip
        if fn.startswith("."): continue
        if fn.endswith(".tsv") or fn.endswith(".tsv.gz"): continue

        # - collect info
        try:
            media_info_d[fn] = media_info(fn)
        except:
            print(f"# problem with {fn}")
        
    # - create dataframe
    return media_info_d



## ================================================================================
##    dhash8
## ================================================================================

def mediadb_imagecompare(mediadb_df, bitdiff_target=10):
    
    # - filter images
    mediadb_df = mediadb_df[mediadb_df["type"]=="image"]
    
    # - extract hash8 data
    dhash8_d = mediadb_df[["basename", "dhash8"]]\
               .set_index("basename")\
               .to_dict()["dhash8"]

    k_l = list(dhash8_d.keys())
    
    # - compare
    bitdiff_l = []
    
    for ik1 in range(len(k_l)-1):
        dh1 = dhash8_d[k_l[ik1]]
        if dh1=="FAILED": continue
        h1 = int(dh1)
        
        for ik2 in range((ik1+1), len(k_l)):
            dh2 = dhash8_d[k_l[ik2]]
            if dh2=="FAILED": continue
            h2 = int(dh2)

            bitdiff = dhash.get_num_bits_different(h1, h2)
            if bitdiff <= bitdiff_target:
                bitdiff_l.append([
                    min([k_l[ik1], k_l[ik2]]),
                    max([k_l[ik1], k_l[ik2]]),
                    bitdiff]
                )
    
    return bitdiff_l

def mediafolder_map_old(media_folder):
    
    ## - list media files
    fn_l = []
    for root, subdirs, files in os.walk(media_folder):
        fn_l = fn_l + [os.path.join(root, filename) 
                for filename in files
                if media_typeid(os.path.join(root, filename)) in ["image", "video"]]
    print(len(fn_l))
    
    ## - map media files
    map_d = {os.path.basename(fn):fn for fn in fn_l}
    
    return map_d   

def media_todb(media_folder, skipfolder=["Corbeille"], media_info_d={}, outputtype="dict"):
    """
    Input
    - media_folder      folder where source media are located
    
    Output
    - DataFrame         pandas dataframe that contains information about each media file
    """    

    # - get basename from media_info_d
    basename_original_s = {v["basename"] for v in media_info_d.values()}
    print("# N_entries_existing:", len(basename_original_s))
    #basename_original_l = media_info_df["basename"].to_list()
    
    # - list media and get info
    for root, subdirs, files in os.walk(media_folder):
        
        print(">", root)
        # -  if folder contains skipfolder, skip
        for _ in skipfolder:
            if _ in root: continue
        
        # - files --> complete path
        
        basename_all_s = set(files)
        print("# N_files:", len(basename_original_s))
        fn_l = [os.path.join(root, basename) for basename in (basename_all_s - basename_original_s)]
        print("# N_entries_tocompute:", len(fn_l))
        # - collect media data
        media_info_d = media_filelist2infodict(fn_l, media_info_d)
        
    if outputtype=="dict":
        return media_info_d
    
    elif outputtype=="dataframe":
        # - create dataframe
        return pd.DataFrame\
                    .from_dict(media_info_d, orient='index')\
                    .reset_index().drop(columns=["index"])\
                    .sort_values(["dhash8", "basename", "folder"])
    else:
        return 0

def mediafolder_map(media_folder):
    
    ## - list media files
    fn_l = []
    for root, subdirs, files in os.walk(media_folder):
        fn_l = fn_l + [os.path.join(root, filename) 
                for filename in files
                if media_typeid(os.path.join(root, filename)) in ["image", "video"]]
    print(len(fn_l))
    
    ## - map media files
    map_d = {os.path.basename(fn):fn for fn in fn_l}
    
    return map_d      

## ================================================================================
##    file stuffs
## ================================================================================

def gz_Nlines(input_fn):
    with gzip.open(input_fn, "r") as fi:
        for iline, line in enumerate(fi): pass
    return iline

def folder_Nmedia(input_folder):
    Nmedia_d = {}
    for root, subdirs, files in os.walk(input_folder):
        for filename in files:
            try:
                fn = os.path.join(root, filename)
                # identify media type
                mediatype = media_typeid(fn)   
                Nmedia_d[mediatype] = Nmedia_d.get(mediatype, 0) + 1
            except TimeoutException:
                print(f"#!!! Could not read {fn}")
    return Nmedia_d

def folder_flatten_old(folder):
    all_files = []
    first_loop_pass = True
    for root, _dirs, files in os.walk(folder):
        if first_loop_pass:
            first_loop_pass = False
            continue
        for filename in files:
            all_files.append(os.path.join(root, filename))
    for filename in all_files:
        shutil.move(filename, folder)
    return 1

def folder_flatten(folder, script_fn, prefix=[]):
    # - create script
    with open(script_fn, "wt") as fo:
        
        files_tomove_l = []
        folder_torm_l = []
        
        first_loop_pass = True
        for root, _dirs, files in os.walk(folder):
            prefix_flag=0
            if first_loop_pass:
                first_loop_pass = False
                continue
            
            # - test if folder contains prefix
            if len(prefix)==0: prefix_flag=1
            else:
                rootbase=os.path.basename(root)
                for _ in prefix:
                    if rootbase.startswith(_):
                        prefix_flag=1
                        pass
            # - list files that need to be moved
            if prefix_flag:
                for filename in files:
                    files_tomove_l.append(os.path.join(root, filename))
                folder_torm_l.append(root)
                
        # - create script to move files
        for filename in files_tomove_l:
            print(f"mv '{filename}' '{folder}/'", file=fo)
        for folder in folder_torm_l:
            print(f"rm -d '{folder}'", file=fo)
            
    return script_fn

def folder_listfiles(folder):
    return [os.path.join(dp, f) for dp, dn, filenames in os.walk(folder) for f in filenames]

def folder_listnumericsubfolder(folder):
    return [o for o in os.listdir(folder) 
                    if os.path.isdir(os.path.join(folder,o)) and o.isdigit()]

## Initialize media info db from structured media repository

In [None]:
media_repo = "/Users/matthieupichaud/pCloud Drive/Photos/"

# - create media db
for year in ["2022"]: #["0000"] + [str(_) for _ in range(2000, 2022)]:
    #for month in ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]:
    for month in ["07", "08", "09", "10", "11", "12"]:
        folder_candidate = os.path.join(f"{media_repo}/{year}/{month}")
        mediainfodb_candidate = os.path.join(
                                    folder_candidate,
                                    f"media_info_{year}{month}.tsv.gz")
        flag_compte = 0
        if not os.path.exists(folder_candidate):  continue
        if os.path.exists(mediainfodb_candidate):
            Nlines  = gz_Nlines(mediainfodb_candidate)
            Nimages = folder_Nmedia(folder_candidate)["image"]
            Ncoverage = Nlines/Nimages
            ratio = (Nlines/Nimages)
            if 0: #ratio >= .95:
                print(f"# {mediainfodb_candidate} found (Nimages: {Nimages}, ratio:{ratio}) - skip.");
                continue
            else:
                print(f"# {mediainfodb_candidate} found (Nimages: {Nimages}, ratio:{ratio}) - compute again.");
                
        media_info_df = media_todb(folder_candidate, outputtype="dataframe")
        media_info_df.to_csv(
                            mediainfodb_candidate,
                            sep="\t", index=False)

## Reset image folders
To copy images and remove imagecluster folders in media folders  
==> flattens image folder structure

In [None]:
media_repo = "/Users/matthieupichaud/pCloud Drive/Photos/"

In [None]:
for year in folder_listnumericsubfolder(media_repo):
    for month in folder_listnumericsubfolder(os.path.join(media_repo, year)):
        
        media_folder = os.path.join(media_repo, year, month)
        # - flatten folder
        folder_flatten(media_folder, "flatten.sh", prefix=["imagecluster_"])
        !bash flatten.sh
        print("# - folder flattened", media_folder)
        break
    break

In [None]:
!pwd

## Reset media info db

In [None]:
media_repo = "/Users/matthieupichaud/pCloud Drive/Photos/"
year="2022"
month="11"

In [None]:
folder_candidate = os.path.join(f"{media_repo}/{year}/{month}")
mediainfodb_candidate = os.path.join(
                            folder_candidate,
                            f"media_info_{year}{month}.tsv.gz")

if os.path.exists(folder_candidate):
    
    Nimages = folder_Nmedia(folder_candidate)["image"]
    print(f"# {mediainfodb_candidate} found (Nimages: {Nimages}) - compute.");

    media_info_df = media_todb(folder_candidate, media_info_d={}, outputtype="dataframe")
    media_info_df.to_csv(
                        mediainfodb_candidate,
                        sep="\t", index=False)

In [None]:
    media_info_df.to_csv(
                        mediainfodb_candidate.replace(".gz", ""),
                        sep="\t", index=False)

## >>> Create and run backup script

In [4]:
# - arguments
media_repo      = "/Users/matthieupichaud/Work/202104_photoautosort/tmp/"
imagefolder_out = "/Users/matthieupichaud/pCloud Drive/Photos"
videofolder_out = "/Users/matthieupichaud/pCloud Drive/Videos"

In [5]:
# - convert and copy HEIC
mediafolder_heicconvert(media_repo)

1

In [11]:
# - create backup script
media2backupscript(media_repo, imagefolder_out, videofolder_out, "backup.sh", escape_space=0, force=1)

94354
7758


1

In [9]:
!pwd

/Users/matthieupichaud/Work/202104_photoautosort


In [None]:
# - run script
!bash backup.sh > /dev/null

In [None]:
query_map_d = mediafolder_map(media_repo)
imagefolder_map_d = mediafolder_map(imagefolder_out)
videofolder_map_d = mediafolder_map(videofolder_out)

set(query_map_d.keys()) - set(imagefolder_map_d.keys()) - set(videofolder_map_d.keys()) 

## >>> Update media db

In [14]:
#media_repo = "/Users/matthieupichaud/pCloud Drive/test/"
media_repo = "/Users/matthieupichaud/pCloud Drive/Photos"

# - update media db
for year in ["2018"]: #folder_listnumericsubfolder(media_repo):
    for month in folder_listnumericsubfolder(os.path.join(media_repo, year)):
        
        folder_candidate = os.path.join(f"{media_repo}/{year}/{month}")
        mediainfodb_candidate = os.path.join(
                                    folder_candidate,
                                    f"media_info_{year}{month}.tsv.gz")
        if not os.path.exists(mediainfodb_candidate):
            mediainfodb_candidate=mediainfodb_candidate.replace(".gz","")
        
        if not os.path.exists(folder_candidate):  continue
        if not os.path.exists(mediainfodb_candidate) : continue
        
        print("="*100+"\n"+folder_candidate)
        
        # - file mapping
        fn_d = mediafolder_map(folder_candidate)
        #fn_l = folter_listfiles(folder_candidate)
        #fn_d = {os.path.basename(_):_ for _ in fn_l}
        print("# - file mapping created")
        
        # - read original media info db
        mediainfo_original_df = pd.read_csv(mediainfodb_candidate, sep="\t")
        mediainfo_original_d = mediainfo_original_df.to_dict(orient="index")
        print("# - mediainfo collected")
        
        # - update media info db with new images
        basename_original_l = mediainfo_original_df["basename"].to_list()
        basename_new_l = set(fn_d.keys()) - set(basename_original_l)
        if len(basename_new_l)==0: continue # skip processing if there is no new media file in media folder
        
        print("# - missing media from media info:", set(basename_original_l) - set(fn_d.keys()))
        
        mediainfodb_new_df = media_todb(
                                        folder_candidate,
                                        skipfolder=["Corbeille"],
                                        media_info_d = mediainfo_original_d,
                                        outputtype="dataframe")
        print("# - mediainfodb dataframe updated")
        
        # - compare images from media info db
        image_similar_l = mediadb_imagecompare(
                                        mediainfodb_new_df,
                                        bitdiff_target=10)
        print("# - images compared")
        
        # - init image clusters
        cluster_d = {_:i for i,_ in enumerate(list(fn_d.keys()))}
        cluster_content_d = {}
        for fn, i in cluster_d.items():
            cluster_content_d[i] = [fn]
        print("# - cluster initialized")
        # - cluster similar images
        for fn1, fn2, bitscore in image_similar_l:
            size1 = mediainfodb_new_df[mediainfodb_new_df["basename"]==fn1].filesize.values[0]
            size2 = mediainfodb_new_df[mediainfodb_new_df["basename"]==fn2].filesize.values[0]
            cluster1 = cluster_d[fn1]
            cluster2 = cluster_d[fn2]
            if cluster_d[fn1]==cluster_d[fn2]: continue
            if size1 <= size2: cluster_ = cluster2; cluster_drop = cluster1
            else:              cluster_ = cluster1; cluster_drop = cluster2
            cluster_content = cluster_content_d[cluster1]+cluster_content_d[cluster2]
            for fn in cluster_content: cluster_d[fn] = cluster_
            cluster_content_d[cluster_] = cluster_content
            del cluster_content_d[cluster_drop]
        print("# - clusters identified")
        
        # - flatten folder
        folder_flatten(folder_candidate, "flatten.sh", prefix=["imagecluster_"])
        !bash flatten.sh
        
        # - update file mapping after flatten folder
        fn_l = folder_listfiles(folder_candidate)
        fn_d = {os.path.basename(_):_ for _ in fn_l}
        print("# - file mapping updated")
        
        # - create folders for each cluster
        script_fn = "cluster.sh"
        icluster = 0
        with open(script_fn, "wt") as fo:
            print("set +e", file=fo)
            for k in cluster_content_d.keys():
                if len(cluster_content_d[k])>1:
                    print(k, cluster_content_d[k])
                    # - create cluster folder
                    cluster_folder = os.path.join(folder_candidate, f"imagecluster_{icluster}")
                    #os.mkdir(cluster_folder)
                    print(f"mkdir '{cluster_folder}'", file=fo)
                    # - copy files
                    for fn in cluster_content_d[k]:
                        #shutil.move(fn_d[fn], cluster_folder)
                        print(f"mv '{fn_d[fn]}' '{cluster_folder}'", file=fo)
                    icluster+=1
        !bash cluster.sh
        print("# - cluster folders created")
        
        # - write media info db
        mediainfodb_new_df.to_csv(
                            mediainfodb_candidate,
                            sep="\t", index=False)
        print("# - mediainfodb dataframe written to file")
    
    

/Users/matthieupichaud/pCloud Drive/Photos/2018/03
384
# - file mapping created
# - mediainfo collected
# - missing media from media info: set()
# N_entries_existing: 378
> /Users/matthieupichaud/pCloud Drive/Photos/2018/03
# N_files: 378
# N_entries_tocompute: 7


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  3.56it/s]

# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified





# - file mapping updated
0 ['20180324_124832.jpg', 'IMG-20180324-WA0005.jpg']
2 ['20180304_134028.jpg', 'IMG-20180304-WA0000.jpg']
7 ['20180313_073158.jpg', 'IMG-20180313-WA0001.jpg']
9 ['20180313_090131.jpg', 'IMG-20180313-WA0002.jpg']
37 ['20180321_133616.jpg', 'IMG-20180321-WA0005.jpg']
40 ['20180321_162044.jpg', 'IMG-20180321-WA0006.jpg']
72 ['20180323_114115.jpg', '20180323_114116.jpg']
87 ['20180324_151811.jpg', 'IMG-20180324-WA0008.jpg']
226 ['20180323_105801_001.jpg', 'IMG-20180323-WA0000.jpg', 'IMG-20180325-WA0002.jpg', 'IMG-20180325-WA0003.jpg', 'IMG-20180325-WA0004.jpg']
288 ['DSC05144.JPG', 'DSC05144.jpg']
290 ['DSC05147.JPG', 'DSC05148.JPG', 'DSC05147.jpg']
292 ['DSC05152.JPG', 'DSC05152.jpg']
293 ['DSC05153.JPG', 'DSC05153.jpg']
294 ['DSC05154.JPG', 'DSC05154.jpg']
295 ['DSC05159.JPG', 'DSC05159.jpg']
321 ['IMG-20180324-WA0001.jpg', 'IMG-20180325-WA0001.jpg']
345 ['IMG-20180313-WA0011.jpg', 'IMG-20180313-WA0012.jpg']
382 ['20180308_152138.jpg', 'IMG-20180308-WA0006.jpg']


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:24<00:00,  1.83it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_0
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_1
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_2
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_3
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_4
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_5
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_6
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_7
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_8
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_9
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_10
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_11
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_12
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_13
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_14
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_15
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_16
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_17
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_18
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]

>




 /Users/matthieupichaud/pCloud Drive/Photos/2018/07/imagecluster_19
# N_files: 318
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
276 ['IMG-20180704-WA0002.jpg', 'IMG-20180704-WA0006.jpg', 'IMG-20180705-WA0000.jpg']
289 ['20180711_194738.jpg', '20180711_194740(0).jpg', '20180711_194740.jpg']
290 ['20180713_130802.jpg', '20180713_130800.jpg', '20180713_130803.jpg', 'IMG-20180714-WA0000.jpg']
293 ['20180715_141604.jpg', '20180715_141605.jpg']
297 ['20180716_184913_002.jpg', '20180716_184913_001.jpg', '20180716_184913_003.jpg', '20180716_184913_004.jpg']
299 ['20180717_125723.jpg', 'IMG-20180731-WA0002.jpg']
300 ['20180708_170135_001.jpg', '20180708_170135_002.jpg']
302 ['20180720_102633.jpg', '20180720_102635.jpg']
304 ['20180719_095522.jpg', '20180719_095525.jpg']
306 ['20180731_114743.jpg', '20180731_114744.jpg', 'IMG-20180731-WA0013.jpg']
308 ['20180721_092650.jpg', '20180728_074102.jpg', '20180728_074134.jpg']
327 ['20180702_151526_001.jpg', '20180702_151526_002.jpg', '20180702_151526_0

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:11<00:00,  1.87it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_0
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_1
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_2
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_3
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_4
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_5
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_6
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_7
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_8
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_9
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_10
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_11
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_12
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_13
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_14
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_15
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_16
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_17
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_18
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_19
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_20
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_21
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_22
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_23
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_24
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_25
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_26
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_27
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_28
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_29
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_30
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_31
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_32
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_33
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_34
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_35
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/12/imagecluster_36
# N_files: 346
# N_entries_tocompute: 0


0it [00:00, ?it/s]

# - mediainfodb dataframe updated





# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
306 ['20181225_083629.jpg', '20181225_083633.jpg']
308 ['20181225_104233.jpg', 'IMG-20181225-WA0008.jpg']
309 ['20181225_170849.jpg', 'IMG-20181225-WA0023.jpg']
310 ['20181202_113540.jpg', 'IMG-20181202-WA0001.jpg']
311 ['20181230_090538.jpg', 'IMG-20181230-WA0000.jpg']
312 ['20181223_123408.jpg', 'IMG-20181223-WA0007.jpg']
313 ['20181223_104935.jpg', 'IMG-20181223-WA0005.jpg']
314 ['20181224_192529.jpg', '20181224_192531.jpg', 'IMG-20181225-WA0014.jpg']
316 ['20181210_063437.jpg', '20181210_063441.jpg']
318 ['20181212_142929.jpg', '20181212_142931.jpg']
320 ['20181206_155103.jpg', '20181206_155105.jpg']
322 ['20181225_182954.jpg', 'IMG-20181225-WA0020.jpg']
323 ['20181223_112624.jpg', '20181223_112625.jpg', 'IMG-20181223-WA0006.jpg']
325 ['20181224_123514.jpg', 'IMG-20181224-WA0003.jpg']
326 ['20181223_105148.jpg', 'IMG-20181223-WA0004.jpg']
332 ['IMG_2644.converted.jpg', 'IMG_2644.jpg']
345 

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.76it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/06/imagecluster_0
# N_files: 286
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/06/imagecluster_1
# N_files: 286
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/06/imagecluster_2
# N_files: 286
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/06/imagecluster_3
# N_files: 286
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/06/imagecluster_4
# N_files: 286
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/06/imagecluster_5
# N_files: 286
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/06/imagecluster_6
# N_files: 286
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
270 ['IMG-20180626-WA0000.jpg', 'IMG-20180626-WA0004.jpg']
278 ['20180626_071413.jpg', '20180626_071414.jpg', '20180626_071416.jpg']
281 ['20180620_164215_001.jpg', 'IMG-20180620-WA0002.jpg']
282 ['20180606_112424.jpg', 'IMG-20180606-WA0000.jpg']
297 ['20180608_211737.jpg', 'IMG-20180609-WA0005.jpg', 'IMG-20180610-WA0000.jpg', 'IMG-20180616-WA0002.jpg']
298 ['20180617_170952.jpg', 'IMG-20180617-WA0004.jpg']
299 ['20180620_163920_006.jpg', 'IMG-20180620-WA0001.jpg']
300 ['2018.6.6 600.JPG', 'IMG-20180606-WA0009.jpg']
# - cluster folders created
# - mediainfodb dataframe written to file
/Users/matthieupichaud/pCloud Drive/Photos/2018/01
232
# - file mapping created
# - mediainfo collected
# - missing media from media info: {'20180111_124411.jpg'}
# N_entries_existing: 141
> /Users/matthieupichaud/pCloud Drive/Photos/2018/01
# N_files: 141
# N_entries_tocompute: 6

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:40<00:00,  1.61it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_0
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_1
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_2
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_3
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_4
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_5
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_6
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_7
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_8
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_9
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_10
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_11
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_12
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/01/imagecluster_13
# N_files: 141
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
111 ['IMG-20180121-WA0002.jpg', 'IMG-20180121-WA0011.jpg']
122 ['IMG-20180105-WA0000.jpg', 'IMG-20180105-WA0001.jpg']
124 ['IMG-20180106-WA0003.jpg', 'IMG-20180108-WA0000.jpg']
125 ['IMG-20180119-WA0000.jpg', 'IMG-20180119-WA0007.jpg']
126 ['IMG-20180105-WA0003.jpg', 'IMG-20180105-WA0004.jpg']
127 ['IMG-20180101-WA0014.jpg', 'IMG-20180101-WA0019.jpg', 'IMG-20180101-WA0021.jpg']
131 ['IMG-20180101-WA0015.jpg', 'IMG-20180101-WA0017.jpg']
142 ['20180103_095406.jpg', 'IMG-20180103-WA0003.jpg']
148 ['20180119_110403.jpg', 'IMG-20180119-WA0004.jpg']
150 ['20180111_124359.jpg', 'IMG-20180111-WA0000.jpg']
158 ['20180105_084201.jpg', 'IMG-20180105-WA0002.jpg']
166 ['20180123_150534.jpg', 'IMG-20180123-WA0000.jpg']
186 ['20180119_175051.jpg', '20180119_175054.jpg']
198 ['20180101_015011.jpg', 'IMG-20180101-WA0018.jpg']
199 ['20180121_102915.jpg', 'IMG-20180121-WA0006.jpg

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:12<00:00,  1.52it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/02/imagecluster_0
# N_files: 208
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/02/imagecluster_1
# N_files: 208
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/02/imagecluster_2
# N_files: 208
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
202 ['IMG-20180218-WA0014.jpg', 'IMG-20180220-WA0000.jpg']
205 ['IMG-20180218-WA0007.jpg', 'IMG-20180218-WA0018.jpg']
206 ['IMG-20180223-WA0003.jpg', 'IMG-20180224-WA0000.jpg']
208 ['20180225_113454.jpg', 'IMG-20180225-WA0001.jpg']
# - cluster folders created
# - mediainfodb dataframe written to file
/Users/matthieupichaud/pCloud Drive/Photos/2018/04
263
# - file mapping created
# - mediainfo collected
# - missing media from media info: set()
# N_entries_existing: 217
> /Users/matthieupichaud/pCloud Drive/Photos/2018/04
# N_files: 217
# N_entries_tocompute: 27


 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 16/27 [00:10<00:05,  1.97it/s]

# problem with /Users/matthieupichaud/pCloud Drive/Photos/2018/04/20180420_114821.jpg


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:16<00:00,  1.61it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_0
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_1
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_2
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_3
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_4
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_5
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_6
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_7
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_8
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/04/imagecluster_9
# N_files: 217
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
197 ['IMG-20180422-WA0010.jpg', 'IMG-20180422-WA0012.jpg']
199 ['IMG-20180412-WA0001.jpg', 'IMG-20180413-WA0004.jpg']
202 ['IMG-20180412-WA0000.jpg', 'IMG-20180413-WA0009.jpg']
208 ['IMG-20180427-WA0004.jpg', 'IMG-20180429-WA0005.jpg']
209 ['IMG-20180429-WA0008.jpg', 'IMG-20180429-WA0011.jpg']
210 ['IMG-20180429-WA0007.jpg', 'IMG-20180429-WA0009.jpg', 'IMG-20180429-WA0010.jpg']
211 ['IMG-20180427-WA0001.jpg', 'IMG-20180429-WA0006.jpg']
214 ['IMG-20180406-WA0001.jpg', 'IMG-20180406-WA0002.jpg']
215 ['IMG-20180426-WA0001.jpg', 'IMG-20180427-WA0000.jpg']
241 ['20180409_063126.jpg', '20180409_063128.jpg']
# - cluster folders created
# - mediainfodb dataframe written to file
/Users/matthieupichaud/pCloud Drive/Photos/2018/05
485
# - file mapping created
# - mediainfo collected
# - missing media from media info: {'.IMG_4501.jpg.4R3Dt1'}
# N_entries_existing: 393
> /U

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:41<00:00,  1.54it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_0
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_1
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_2
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_3
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_4
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_5
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_6
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_7
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_8
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_9
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_10
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_11
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_12
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_13
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/05/imagecluster_14
# N_files: 393
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
361 ['IMG-20180506-WA0003.jpg', 'IMG-20180506-WA0012.jpg']
363 ['IMG-20180528-WA0015.jpg', 'IMG-20180530-WA0009.jpg']
368 ['IMG-20180522-WA0001.jpg', 'IMG-20180522-WA0002.jpg']
371 ['IMG-20180527-WA0006.jpg', 'IMG-20180527-WA0009.jpg']
373 ['IMG-20180527-WA0005.jpg', 'IMG-20180527-WA0008.jpg']
377 ['IMG-20180502-WA0002.jpg', 'IMG-20180502-WA0003.jpg']
380 ['IMG-20180529-WA0000.jpg', 'IMG-20180530-WA0013.jpg']
381 ['IMG-20180521-WA0003.jpg', 'IMG-20180521-WA0010.jpg', 'IMG-20180521-WA0011.jpg']
382 ['IMG-20180521-WA0001.jpg', 'IMG-20180521-WA0005.jpg']
384 ['IMG-20180521-WA0004.jpg', 'IMG-20180522-WA0004.jpg', 'IMG-20180530-WA0011.jpg']
386 ['IMG-20180530-WA0002.jpg', 'IMG-20180530-WA0004.jpg']
387 ['IMG-20180528-WA0000.jpg', 'IMG-20180530-WA0010.jpg']
390 ['IMG-20180506-WA0002.jpg', 'IMG-20180506-WA0013.jpg', 'IMG-20180507-WA0025.jpg']
393 ['20180523_161841.jpg

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.12it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_0
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_1
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_2
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_3
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_4
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_5
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_6
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_7
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_8
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_9
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_10
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_11
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_12
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_13
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_14
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_15
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_16
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_17
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_18
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_19
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_20
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_21
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_22
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_23
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_24
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_25
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]

> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_26
# N_files: 456
# N_entries_tocompute: 0



0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_27
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_28
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_29
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_30
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_31
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_32
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_33
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_34
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_35
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_36
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_37
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_38
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_39
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_40
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_41
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_42
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_43
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_44
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_45
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_46
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_47
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_48
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_49
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_50
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_51
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_52
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_53
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_54
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_55
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_56
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_57
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_58
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_59
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_60
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_61
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/08/imagecluster_62
# N_files: 456
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
101 ['20180831_183309.jpg', 'IMG-20180831-WA0004.jpg', 'IMG-20180831-WA0007.jpg', 'IMG-20180831-WA0009.jpg']
160 ['20180831_101841.jpg', 'IMG-20180831-WA0008.jpg']
188 ['20180821_133620(0).jpg', '20180821_133620.jpg']
349 ['IMG-20180831-WA0005.jpg', 'IMG-20180831-WA0006.jpg']
350 ['IMG-20180826-WA0000.jpg', 'IMG-20180826-WA0002.jpg']
352 ['IMG-20180828-WA0027.jpg', 'IMG-20180828-WA0032.jpg', 'IMG-20180830-WA0005.jpg']
393 ['IMG-20180828-WA0023.jpg', 'IMG-20180828-WA0025.jpg']
394 ['IMG-20180828-WA0030.jpg', 'IMG-20180831-WA0002.jpg']
396 ['IMG-20180819-WA0001.jpg', 'IMG-20180819-WA0003.jpg']
398 ['20180805_184430.jpg', '20180805_184433.jpg', 'IMG-20180806-WA0002.jpg']
400 ['20180803_112844.jpg', 'IMG-20180803-WA0007.jpg']
401 ['20180819_121548.jpg', 'IMG-20180819-WA0006.jpg']
402 ['20180810_125340.jpg', 'IMG-20180810-WA0007.jpg']
403 ['20180819_105952.jpg', 'IM

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82/82 [00:43<00:00,  1.89it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_0
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_1
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_2
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_3
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_4
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_5
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_6
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_7
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_8
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_9
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_10
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_11
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_12
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_13
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_14
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_15
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_16
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_17
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_18
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_19
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_20
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_21
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_22
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_23
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_24
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_25
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_26
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_27
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_28
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_29
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_30
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/09/imagecluster_31
# N_files: 347
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
41 ['20180920_181738.jpg', 'IMG-20180930-WA0013.jpg']
59 ['20180923_142617.jpg', 'IMG-20180930-WA0014.jpg']
66 ['20180923_144046.jpg', 'IMG-20180930-WA0015.jpg']
70 ['20180923_150527.jpg', 'IMG-20180930-WA0016.jpg']
76 ['20180930_140824.jpg', 'IMG-20180930-WA0004.jpg']
274 ['IMG-20180915-WA0000.jpg', 'IMG-20180915-WA0001.jpg']
291 ['IMG-20180901-WA0000.jpg', 'IMG-20180901-WA0001.jpg']
299 ['IMG-20180901-WA0009.jpg', 'IMG-20180901-WA0010.jpg', 'IMG-20180930-WA0007.jpg']
315 ['IMG-20180903-WA0001.jpg', 'IMG-20180903-WA0002.jpg', 'IMG-20180905-WA0001.jpg']
317 ['IMG-20180903-WA0004.jpg', 'IMG-20180903-WA0005.jpg']
318 ['IMG-20180903-WA0000.jpg', 'IMG-20180903-WA0006.jpg']
320 ['20180902_110517.jpg', '20180902_110518.jpg', 'IMG-20180930-WA0009.jpg']
323 ['20180923_135939.jpg', '20180923_135940.jpg', 'IMG-20180923-WA0002.jpg']
325 ['20180902_102325.jpg', '20180902_1

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:16<00:00,  2.09it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_0
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_1
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_2
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_3
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_4
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_5
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_6
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_7
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_8
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_9
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_10
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_11
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_12
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_13
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_14
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_15
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_16
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_17
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_18
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_19
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_20
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_21
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_22
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_23
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_24
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_25
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_26
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_27
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_28
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_29
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_30
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_31
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_32
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_33
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_34
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_35
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_36
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_37
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_38
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_39
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/10/imagecluster_40
# N_files: 403
# N_entries_tocompute: 0


0it [00:00, ?it/s]

# - mediainfodb dataframe updated





# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
67 ['IMG-20181031-WA0013.jpg', 'IMG_3992.JPG']
138 ['20181031_180712.jpg', 'IMG-20181031-WA0008.jpg']
184 ['20181031_181229.jpg', 'IMG-20181031-WA0007.jpg']
356 ['20181013_164920.jpg', '20181013_164929.jpg', '20181013_164912.jpg', '20181013_164933.jpg', '20181013_164918.jpg', '20181013_164936.jpg']
360 ['20181027_150238.jpg', '20181027_150237.jpg', '20181027_150241.jpg', 'IMG-20181027-WA0019.jpg', 'IMG-20181027-WA0020.jpg', 'IMG-20181027-WA0021.jpg']
361 ['20181020_154527.jpg', 'IMG-20181020-WA0004.jpg']
362 ['20181022_163313.jpg', 'IMG-20181022-WA0003.jpg']
363 ['20181022_151941.jpg', '20181022_151942.jpg']
365 ['20181026_145007.jpg', '20181026_145009.jpg']
367 ['20181026_164213_001.jpg', 'IMG-20181026-WA0009.jpg']
368 ['20181014_154602.jpg', '20181014_154603.jpg']
370 ['20181014_151028.jpg', '20181014_151032.jpg', 'IMG-20181015-WA0003.jpg']
372 ['20181026_161845.jpg', '20181026_161848.jpg', 

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.86it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_0
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_1
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_2
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_3
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_4
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_5
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_6
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_7
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_8
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_9
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_10
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_11
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_12
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_13
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_14
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_15
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_16
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_17
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_18
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_19
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_20
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_21
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_22
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_23
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_24
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


> /Users/matthieupichaud/pCloud Drive/Photos/2018/11/imagecluster_25
# N_files: 220
# N_entries_tocompute: 0


0it [00:00, ?it/s]


# - mediainfodb dataframe updated
# - images compared
# - cluster initialized
# - clusters identified
# - file mapping updated
169 ['IMG-20181104-WA0000.jpg', 'IMG-20181104-WA0001.jpg']
192 ['20181111_194525.jpg', '20181111_194527.jpg']
194 ['20181124_163043.jpg', 'IMG-20181125-WA0015.jpg']
195 ['20181121_174201.jpg', '20181121_174204.jpg']
197 ['20181126_152350.jpg', 'IMG-20181126-WA0003.jpg']
198 ['20181103_144328.jpg', '20181103_144329.jpg']
206 ['20181111_194522.jpg', 'IMG-20181112-WA0000.jpg']
207 ['20181124_121923.jpg', 'IMG-20181125-WA0011.jpg']
208 ['20181124_225535.jpg', 'IMG-20181125-WA0001.jpg']
209 ['20181124_225553.jpg', 'IMG-20181125-WA0000.jpg']
210 ['20181126_161041.jpg', 'IMG-20181126-WA0000.jpg']
211 ['20181126_152302.jpg', 'IMG-20181126-WA0004.jpg']
212 ['20181124_163553.jpg', 'IMG-20181125-WA0008.jpg']
213 ['20181124_110352.jpg', 'IMG-20181125-WA0013.jpg']
214 ['20181124_164808.jpg', 'IMG-20181125-WA0006.jpg']
215 ['20181124_163010.jpg', 'IMG-20181125-WA0009.jpg']
2

## Compare content

In [None]:
imagefolder_map_d = mediafolder_map("/Users/matthieupichaud/pCloud Drive/Photos/")
videofolder_map_d = mediafolder_map("/Users/matthieupichaud/pCloud Drive/Videos/")
query_map_d = mediafolder_map("/Volumes/TV/PhotosGoogleLN/Takeout/Google Photos")

In [None]:
set(query_map_d.keys()) - set(imagefolder_map_d.keys()) - set(videofolder_map_d.keys()) 

## Identify corrupted images

In [None]:
from PIL import Image, ImageFile
from collections import Counter
import numpy as np

ImageFile.LOAD_TRUNCATED_IMAGES = True

def image_colordistrib_test(fn):
    img = Image.open(fn)
    img.thumbnail((100, 100))
    im=img.getdata()
    im2arr = np.array(im)
    unique, counts = np.unique(im2arr, return_counts=True, axis=0)
    for k,v in zip(unique.tolist(), counts.tolist()):
        if tuple(k)==(128, 128, 128):
            Nitems = counts.sum()
            ratio= v/Nitems
            return ratio, v, Nitems, k
    return 0,0,0

#image_fn="/Users/matthieupichaud/Downloads/20180203_152235.jpg"
#image_fn="/Users/matthieupichaud/pCloud Drive/Photos/0000/00/DSCN2424.JPG"
image_fn="/Users/matthieupichaud/Downloads/20180218_140908.jpg"

image_colordistrib_test(image_fn)

In [None]:
imagefolder_map_d = mediafolder_map("/Users/matthieupichaud/pCloud Drive/Photos/")

In [None]:
# - initialize
qc_d={}

In [None]:
# - iterate
for i, [bn, fn] in enumerate(imagefolder_map_d.items()):
    if bn in qc_d.keys(): continue
    try:
        qc_d[bn]=image_colordistrib_test(fn)
        if i%100==0: print(i)
    except:
        qc_d[bn]=-1

In [None]:
kdrop=[k for k,v in qc_d.items() if v==-1]

In [None]:
for k in kdrop: del qc_d[k]

In [None]:
colordistrib_df=pd.DataFrame.from_dict(qc_d, orient="index")

In [None]:
recoverylist_s=set(colordistrib_df[colordistrib_df[0]>0.1].index.tolist())

In [None]:
for i in range(1,15):
    backup_zip=f"/Volumes/TV/PhotosGoogleLN/takeout-20221229T211606Z-0{str(i).zfill(2)}.zip"
    print(backup_zip)
    wd="/Users/matthieupichaud/Downloads/tmp_PhotosGoogleLN/" 
    # - unzip folder
    !ditto -x -k "{backup_zip}" "{wd}"
    # - list media
    query_map_d = mediafolder_map(wd)
    for k,v in query_map_d.items():
        if k in recoverylist_s:
            print(v, "found", len(recoverylist_s))
            !cp "{v}" tmp/
            recoverylist_s.remove(k)

In [None]:
recoverylist_s

In [None]:
image_colordistrib_test(imagefolder_map_d["20180301_065232.jpg"])

In [None]:
colordistrib_df[0].hist()

In [None]:
Image.open(imagefolder_map_d["20180301_065232.jpg"]).show()

In [None]:
import pickle

with open('qc_d.pickle', 'wb') as handle:
    pickle.dump(qc_d, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
len(qc_d)

In [None]:
# LN
backup_folder="/Volumes/TV/PhotosGoogleLN/"

# - arguments
imagefolder_out = "/Users/matthieupichaud/pCloud Drive/Photos"
videofolder_out = "/Users/matthieupichaud/pCloud Drive/Videos"

for suffix in ['001', '002', '003', '004','005', '006','008', '009','010', '011','012']:
    backup_zip=os.path.join(backup_folder, f"takeout-20221229T211606Z-{suffix}.zip")
    print(backup_zip)
    wd="/Users/matthieupichaud/Downloads/tmp_PhotosGoogleLN/" 
    # - unzip folder
    !ditto -x -k "{backup_zip}" "{wd}"
    # - convert and copy HEIC
    #mediafolder_heicconvert(wd)
    # - create backup script
    #media2backupscript(wd, imagefolder_out, videofolder_out, "backup.sh", escape_space=0)
    # - run script
    #!bash backup.sh > /dev/null
    # - control
    query_map_d = mediafolder_map(wd)
    imagefolder_map_d = mediafolder_map(imagefolder_out)
    videofolder_map_d = mediafolder_map(videofolder_out)
    print(set(query_map_d.keys()) - set(imagefolder_map_d.keys()) - set(videofolder_map_d.keys()))
    print("="*50)
    # - remove
    !rm -rf {wd}