# Description

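This notebook provides tools to compare all files in two directories (including their subdirectories) and to find duplicated or missing files. Files are compared by file name only; the subdirectory a file lives in is not taken into account.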

Make sure that both paths exist; otherwise the corresponding file list will simply be empty.

-> The functions are defined at the end of the notebook; run those cells first.

In [132]:
#! /usr/bin/python3
import os

# work flow

In [137]:
# Directories to compare.
# dir1 is written as a raw string so that the Windows backslashes are taken
# literally; in a normal string "\M", "\P", "\T", "\d" and "\c" are invalid
# escape sequences, which Python warns about and will eventually reject.
dir1 = r"G:\My Drive\Philipp\TUWIEN_catalysis_XRay\data_SLS_2015_May_XRD\checked_willDeleteIt_measurements_static_MERGED"
dir2 = 'G:/My Drive/Philipp/TUWIEN_catalysis_XRay/data_SLS_2015_May_XRD/measurements_static_MERGED_complete_needsOnlySuryey/'

print(dir1)
print(dir2)

### define file endings that should be solely included or omitted:
### (exactly one line is active; un-comment another one to switch)
#endings = ['xye', 'xye_PAR', 'ini']
#endings = ['xye_PAR']
#endings = ['']
#endings = ['xye', 'ini']
endings = ['ini']

G:\My Drive\Philipp\TUWIEN_catalysis_XRay\data_SLS_2015_May_XRD\checked_willDeleteIt_measurements_static_MERGED
G:/My Drive/Philipp/TUWIEN_catalysis_XRay/data_SLS_2015_May_XRD/measurements_static_MERGED_complete_needsOnlySuryey/


In [143]:
### Collect only the file names that end with one of the configured endings.
##  Note: the ending '' matches every file name, so all files are returned.

print(dir1, dir2, sep='\n')

# Same collection call for both directories, dir1 first.
list1, list2 = (
    func_collectFilenamesInFolder_include(folder, endings)
    for folder in (dir1, dir2)
)
print('len list1    ', len(list1))
print('len list2    ', len(list2))
print('difference   ', abs(len(list1) - len(list2)))

G:\My Drive\Philipp\TUWIEN_catalysis_XRay\data_SLS_2015_May_XRD\checked_willDeleteIt_measurements_static_MERGED
G:/My Drive/Philipp/TUWIEN_catalysis_XRay/data_SLS_2015_May_XRD/measurements_static_MERGED_complete_needsOnlySuryey/
len list1     1
len list2     30
difference    29


In [139]:
### Collect only the file names whose extension is NOT one of the configured endings.
##  Note: with '' as the only ending practically all files are returned.

print(dir1, dir2, sep='\n')

# Same collection call for both directories, dir1 first.
list1, list2 = (
    func_collectFilenamesInFolder_exclude(folder, endings)
    for folder in (dir1, dir2)
)
print('len list1    ', len(list1))
print('len list2    ', len(list2))
print('difference   ', abs(len(list1) - len(list2)))

G:\My Drive\Philipp\TUWIEN_catalysis_XRay\data_SLS_2015_May_XRD\checked_willDeleteIt_measurements_static_MERGED
G:/My Drive/Philipp/TUWIEN_catalysis_XRay/data_SLS_2015_May_XRD/measurements_static_MERGED_complete_needsOnlySuryey/
len list1     3601
len list2     3601
difference    0


In [144]:
### Two-sided check for missing names.
# a: names of list1 that have no equal entry in list2
# b: names of list2 that have no equal entry in list1

a = func_compare_missing(list1, list2)
print(len(a), a, sep='\n')
b = func_compare_missing(list2, list1)
print(len(b), b, sep='\n')

0
[]
0
[]


In [141]:
### Two-sided check for names that occur more than once.
# a: names of list1 that occur more than once in list2
# b: names of list2 that occur more than once in list1

a = func_compare_multiple(list1, list2)
print(len(a), a, sep='\n')
b = func_compare_multiple(list2, list1)
print(len(b), b, sep='\n')

0
[]
0
[]


In [145]:
# Show the input sizes, then run the combined comparison in both directions.
print(len(list1), len(list2))
missing1, missing2 = func_compare_complex(list1, list2)
# Cell output: how many names are left over on each side.
(len(missing1), len(missing2))

1 30


(0, 0)

# define functions

## collect filenames

In [None]:
def func_collectFilenamesInFolder_include(folder, fileTypes=[]):
    """
    Collect the names of all files below `folder` that end with a given ending.

    The search covers every level, i.e. also all subdirectories. Only the
    bare file names are returned, not their paths.

    Parameters
    ----------
    folder : str
        Directory to search. A non-existing directory yields an empty list.
    fileTypes : sequence of str, or a single str
        Accepted name endings, e.g. ['xye', 'ini']. The ending '' matches
        every file name. With no endings at all nothing is returned.

    Returns
    -------
    list of str
        Each matching file name exactly once per file, even if it matches
        several of the endings.
    """
    # A bare string would otherwise be treated as a sequence of characters.
    if isinstance(fileTypes, str):
        fileTypes = [fileTypes]
    suffixes = tuple(fileTypes)
    if not suffixes:
        return []
    # str.endswith accepts a tuple and answers "any of these" in one test,
    # so a file is listed once no matter how many endings it matches.
    return [
        file_
        for _, _, files in os.walk(folder)
        for file_ in files
        if file_.endswith(suffixes)
    ]

def func_collectFilenamesInFolder_exclude(folder, exclude=[]):
    """
    Collect the names of all files below `folder`, skipping given extensions.

    The search covers every level, i.e. also all subdirectories. Only the
    bare file names are returned, not their paths.

    A file is skipped when the text after its last '.' is contained in
    `exclude`. For a name without any '.', the whole name is compared.
    """
    kept = []
    for _dirpath, _dirnames, filenames in os.walk(folder):
        for name in filenames:
            extension = name.split('.')[-1]
            if extension in exclude:
                continue
            kept.append(name)
    return kept

## examples for possible comparisons

In [146]:
def func_compare_missing(list1, list2):
    """
    Return the elements of `list1` that have no equal element in `list2`.

    The order of `list1` is kept and repeated elements of `list1` are
    reported once per occurrence. Neither input is modified.
    """
    try:
        # Set membership makes the check linear instead of len(list1)*len(list2).
        known = set(list2)
        return [item for item in list1 if item not in known]
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to pairwise equality.
        return [item for item in list1
                if not any(item == other for other in list2)]

In [147]:
def func_compare_multiple(list1, list2):
    """
    Return the elements of `list1` that occur more than once in `list2`.

    The order of `list1` is kept and repeated elements of `list1` are
    reported once per occurrence. Neither input is modified.
    """
    from collections import Counter

    try:
        # Count list2 once instead of rescanning it for every list1 element.
        occurrences = Counter(list2)
        return [item for item in list1 if occurrences[item] > 1]
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to pairwise equality.
        return [item for item in list1
                if sum(1 for other in list2 if item == other) > 1]

In [148]:
def func_compare_specific(list1, list2):
    """
    Return every 'desktop.ini' entry of `list1`.

    `list2` is only used for its length: if it is empty, nothing is
    reported at all. Its elements are never looked at.
    """
    if len(list2) == 0:
        return []
    return [entry for entry in list1 if entry == 'desktop.ini']

## another idea for comparison

Attention: this approach removes matched elements from (copies of) the lists while it runs, which reduces their effective size, but it might have no advantage over the methods above.

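If elements are marked as removed by overwriting them with a placeholder value (e.g. zero), there is no guarantee that another, genuine element does not have that same value. The empty string '' could be an alternative placeholder.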

In [149]:
def func_compare_complex(a = [1,2,3,4,5,6,0,0,0], b = [6,7,4,2,0,2,2,1]):
    """
    Compare two lists in both directions in a single call.

    Parameters
    ----------
    a, b : list
        The lists to compare. Neither is modified (the defaults are only
        a small demonstration data set).

    Returns
    -------
    (only_in_a, only_in_b) : tuple of two lists
        only_in_a : elements of `a` with no equal element in `b`. Each
                    distinct value is reported once, at the position of its
                    last occurrence in `a`.
        only_in_b : elements of `b` with no equal element in `a`, in their
                    original order; repeated values stay repeated.

    No placeholder value is written into the data, so an element that
    happens to be '' (or 0) is handled like any other element.
    """
    only_in_b = [item for item in b if item not in a]

    # Walk `a` backwards so the occurrence that is kept is the last one;
    # the result is reversed afterwards to restore the original order.
    only_in_a = []
    for item in reversed(a):
        if item not in b and item not in only_in_a:
            only_in_a.append(item)
    only_in_a.reverse()

    return only_in_a, only_in_b