# THE GREAT FILE DE-DUPLICATOR
### A solution to my fractured backups
A program of 1000 lines begins with dependency imports


In [1]:
from pathlib import Path
import multiprocess #multiprocessing if running outside of jupyter
from multiprocess import Process, Queue #multiprocessing if running outside of jupyter
import json
from collections import defaultdict
from pprint import pprint, pformat
import time
import logging
from queue import Empty

#import pandas as pd
import zlib
from collections import Counter
from IPython.display import clear_output

print('done')

done


### Enter your parameters
Enter where you want to output your analysis files, what directories you want to analyze for duplicates and which folder names you wish to exclude from the analysis.
Mind the backslashes playing nice with python on windows.

**THE ORDER THE deDupeDirs ARE IN ARE THE PREFERRED ORDER FOR RETENTION IF DRY RUN IS DISABLED! MAKE SURE YOU ARE OK WITH LATER ENTRIES IN THE LIST HAVING DUPLICATE FILES DISABLED!**


In [2]:
dedupeDirs = (r'C:\Users\Ryan\Desktop',
               #r'D:\Users\Ryan',
               'G:\\',
               'F:\\',
               #'H:\\',
               )
excludeDirs = {'$RECYCLE.BIN',
               'gdpr data',
               'steamapps',
               'Google Dump',
              }
excludeExtensions = {'.example',    
}
outputDir = Path(r'C:\Users\Ryan\Desktop')

#Assign file retention priority to dedupeDirs based on order
#Early entries in the list will keep their files over later entries with duplicate files
print(dedupeDirs)

('C:\\Users\\Ryan\\Desktop', 'G:\\', 'F:\\')


## Find duplicates based on file name
For a given directory listed in "deDupeDirs", get the filenames, last modified dates, and file size. 
End result of a list will be a dict with filename as keys and values that are lists of dicts of file paths and attributes

In [4]:
dupeSearchStart = time.time()
dedupePaths = [Path(x) for x in dedupeDirs]

nameSet = set()
files = []
dupeDict = defaultdict(list)
for wd in dedupePaths:
    print(f'running in {wd}')
    dirDupeSearchStart = time.time()
    for file in wd.glob('**/*'):
        if file.is_file() and not excludeDirs.intersection(set(file.parts)) and not excludeExtensions.intersection(set(file.suffixes)):
            fileDict = {'pathObj':file,
                        'path':file.as_posix(),
                        'size':file.stat().st_size,
                        'mtime':file.stat().st_mtime,
                        }
            try:
                dupeDict[file.name].append(fileDict)
            except:
                logging.exception(f"couldn't add this filedict, continuing \n {fileDict}")
                raise
    print('len of files parsed = ', len(dupeDict))
    print(f'processed {wd} in {time.time() - dirDupeSearchStart}s')

uniqueDict = {k:v for k,v in dupeDict.items() if len(v) == 1}
print('number of uniques = ', len(uniqueDict))
dupeDict   = {k:v for k,v in dupeDict.items() if len(v) > 1}
print('number of dupes = ', len(dupeDict))
print(f'dict load completed in {time.time() - dupeSearchStart}s')

running in C:\Users\Ryan\Desktop
len of files parsed =  41562
processed C:\Users\Ryan\Desktop in 23.478837728500366s
running in G:\
len of files parsed =  41562
processed G:\ in 0.0s
running in F:\
len of files parsed =  46694
processed F:\ in 66.96567034721375s
number of uniques =  29437
number of dupes =  17257
dict load completed in 90.46351528167725s


### Save your progress
The above steps could take a while depending on how much you've hoarded, save the output to a JSON file to noodle with at a later date

In [5]:
writingStart = time.time()
print('writing to file')    
dicts = (dupeDict, uniqueDict)
for d in dicts:
    for k,li in d.items():
        for file in li:
            del file['pathObj']
            
with open(outputDir.joinpath('duplicateFiles.json'),'w',encoding='utf8') as f:
    json.dump(dupeDict, f, indent = 1 )
    
with open(outputDir.joinpath('uniqueFiles.json'),'w',encoding='utf8') as f:
    json.dump(uniqueDict, f, indent = 1 )
print(f'done, took {time.time() - writingStart:.2f}s')


writing to file
done, took 1.00s


### Reload your progress
Reload if you saved to a JSON file.  You'll need to run imports and the parameter code blocks before this.

In [3]:
loadStart = time.time()
print('loading from file')    
with open(outputDir.joinpath('duplicateFiles.json'),'r',encoding='utf8') as f:
    dupeDict = json.load(f)
    
with open(outputDir.joinpath('uniqueFiles.json'),'r',encoding='utf8') as f:
    uniqueDict = json.load(f)
    
dicts = (dupeDict, uniqueDict)
for d in dicts:
    for k,li in d.items():
        for file in li:
            file['pathObj'] = Path(file['path'])
            
print(f'done, took {(time.time() - loadStart):.2f}s')

loading from file
done, took 0.92s


### Only run on exact duplicates first

In [4]:
def crc32(fileName, q, printQueue):
    import zlib
    try:
        with open(fileName, 'rb') as fh:
            hash = 0
            while True:
                s = fh.read(65536)
                if not s:
                    break
                hash = zlib.crc32(s, hash)
            returnVal = {'file':Path(fileName),'crc32':"%08X" % (hash & 0xFFFFFFFF)}
            #printQueue.put(returnVal)
            q.put(returnVal)
    except Exception as e:
        printQueue.put(f'error on {fileName}, {e}')

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}" 
        num /= 1024.0
    return "{num:.1f}Yi{suffix}"
        
progressTick = 20
fileIncrement = 1
maxProcesses = 8
runningProcesses = []
q = Queue()
printQ = Queue()
crcStart = time.time()
lenDupeGroups = len(dupeDict.values())
progressTick = lenDupeGroups % progressTick
print(f'starting {lenDupeGroups=}')

for dupeGroupCount, kv in enumerate(dupeDict.items()):
    filename,dupeList = kv
    #clear_output()
    #if dupeGroupCount % fileIncrement == 0:
    print(f'processing {dupeGroupCount=} of {lenDupeGroups=} for a % complete of {(dupeGroupCount/lenDupeGroups):.2f}')

    #only things in sizeGroup less than  get a CRC
    sizeGroup = defaultdict(list)
    for cnt,d in enumerate(dupeList):
        sizeGroup[d['size']].append(d['path'])
    #remove all where we don't have a duplicate  filesize
    #pprint(sizeGroup)
    #size less than 1000000000 ties to 1 gig
    sizeGroup = {k:v for k,v in sizeGroup.items() if len(v) > 1 and int(k) < 1000000000}
    #pprint(sizeGroup)
    
    for cnt,d in enumerate(dupeList):
        while len(multiprocess.active_children()) == maxProcesses:
            time.sleep(.1)
            print(multiprocess.active_children())

        print(f"processing {d['path']} {sizeof_fmt(d['size'])}")
        #print('passed to subprocess', d)
        process = multiprocess.Process(target=crc32, args =[d['path'],q, printQ])
        runningProcesses.append(process)
        process.start()
            
    #time.sleep(1)
    
        if cnt > 5:
            time.sleep(1)
            break
    if dupeGroupCount > 5:
        #time.sleep(1)
        break
            
    
while True:
    try:
        result = q.get(False)  #gotta use False otherwise get is blocking
        name      = result['path'].name
        posixPath = result['path'].as_posix
        found = False
        for d in dupeDict[name]:
            if d['path'] == posixPath:
                found = True
                d.update(result)
                print(d)
                break
        #print('q',result)
    except Empty:
        print('empty')

        #try:
        #    print(printQ.get(False))
        #except Empty:
        #    print('empty')
        #    break

        
    


starting lenDupeGroups=17257
processing dupeGroupCount=0 of lenDupeGroups=17257 for a % complete of 0.00
processing C:/Users/Ryan/Desktop/desktop.ini 282.0B
processing C:/Users/Ryan/Desktop/Files/Home and Money/Tax/2015 Tax/desktop.ini 136.0B
processing C:/Users/Ryan/Desktop/Files/Music/desktop.ini 440.0B
processing C:/Users/Ryan/Desktop/Files/photos/Old Photos/Trivia Pics/desktop.ini 136.0B
processing C:/Users/Ryan/Desktop/Files/Work/work git/SQL/desktop.ini 110.0B
processing F:/Files/Home and Money/Tax/2015 Tax/desktop.ini 136.0B
processing F:/Files/Music/desktop.ini 440.0B
processing dupeGroupCount=1 of lenDupeGroups=17257 for a % complete of 0.00
processing C:/Users/Ryan/Desktop/Files/Guide2DataMining.pdf 138.6MiB
processing F:/Files/Guide2DataMining.pdf 138.6MiB
processing dupeGroupCount=2 of lenDupeGroups=17257 for a % complete of 0.00
processing C:/Users/Ryan/Desktop/Files/rsync_backup.sh 43.0B
processing F:/Files/rsync_backup.sh 43.0B
processing dupeGroupCount=3 of lenDupeGroup

KeyboardInterrupt: 

In [None]:
startt = time.time()
time.sleep(.1)
print(time.time() - startt)

In [None]:
    input()
    process = multiprocessing.Process(target=crc32, args =(d['path']))
    runningProcesses.append(process)
    process.start()

    for p in runningProcesses:
        p.join()




    def my_func(arg):
        Q.put('Hello, ' + arg)

    p1 = Process(target=my_func, args=('John',))
    p1.start()
    print(Q.get())
    p1.join()
        
    #fileSizeMatch = Counter([d['size']])
    crcCount = Counter([d['crc32'] for d in dupeList])

    for d in dupeList:
        d['crcDupeCount'] = crcCount[d['crc32']]
    
    dupeList.sort(key=lambda x: x['crcDupeCount'], reverse=True)
    #pprint(dupeList)

    #if cnt + 1 == len(dupeList):
    #Counter
    #pprint(filename)
    #break
#exactDupeDict =
    
print(f'done, took {(time.time() - crcStart):.2f}s')

In [22]:
crcCount['9E997366']

4

In [None]:
li =[x for x in range(10)]
for cnt,x in enumerate(li):
	print(cnt,x, len(li))

### let's do some analysis by file suffixes

In [None]:
suffixesCounter = Counter()

for k,dupeList in dupeDict.items(): 
    for file in dupeList:
        #for suffix in file['pathObj'].suffixes:
        suffixesCounter[file['pathObj'].suffix] += 1
suffixesCounter = [(k,v) for k,v in suffixesCounter.items()]
suffixesCounter.sort(key = lambda k: k[1], reverse=True)

#top 20 duplicated 
pprint(suffixesCounter[:20])

### start by figuring out duplication categories
 - same name, same size, same mtime, same hash different dedupeDirs
     - can safely delete one of them based on priority
 - same name, same size, different mtime
     - hash and branch logic if different
 - same name, different size 
     - send to manual review queue, potentially prefer the newer one

In [19]:
if type(dedupeDirs[0]) == str:
    dedupeDirs = [(rootTier,Path(directory).as_posix()) for rootTier,directory in enumerate(dedupeDirs)]

for k,dupeList in dupeDict.items(): 
    for cnt, file in enumerate(dupeList):
        #assign a tier to each identified dupe based on the directory order given in the first user parameters
        for rootTier, dedupeDir in dedupeDirs:
            if file['path'].startswith(dedupeDir):
                file['rootTier'] = rootTier
    dupeList = sorted(dupeList, key=lambda k: (k['rootTier'], len('path')))
    for cnt,file in enumerate(dupeList):
        file['priority'] = cnt 
    pprint(dupeList)

    break
    

[{'mtime': 1607128670.8325891,
  'path': 'C:/Users/Ryan/Desktop/desktop.ini',
  'pathObj': WindowsPath('C:/Users/Ryan/Desktop/desktop.ini'),
  'priority': 0,
  'rootTier': 0,
  'size': 282},
 {'mtime': 1538450652.6097116,
  'path': 'C:/Users/Ryan/Desktop/Files/Home and Money/Tax/2015 Tax/desktop.ini',
  'pathObj': WindowsPath('C:/Users/Ryan/Desktop/Files/Home and Money/Tax/2015 Tax/desktop.ini'),
  'priority': 1,
  'rootTier': 0,
  'size': 136},
 {'mtime': 1577863147.0110621,
  'path': 'C:/Users/Ryan/Desktop/Files/Music/desktop.ini',
  'pathObj': WindowsPath('C:/Users/Ryan/Desktop/Files/Music/desktop.ini'),
  'priority': 2,
  'rootTier': 0,
  'size': 440},
 {'mtime': 1538450652.605711,
  'path': 'C:/Users/Ryan/Desktop/Files/photos/Old Photos/Trivia '
          'Pics/desktop.ini',
  'pathObj': WindowsPath('C:/Users/Ryan/Desktop/Files/photos/Old Photos/Trivia Pics/desktop.ini'),
  'priority': 3,
  'rootTier': 0,
  'size': 136},
 {'mtime': 1616551386.9991388,
  'path': 'C:/Users/Ryan/Desk

### Let's get deleting files
Work out the priority of each file to conslidate everything on one disk first.

Dry run is enabled unless you disable it.

In [3]:
import multiprocess

def double(a):
    return a * 2

def driver_func():
    PROCESSES = 4
    with multiprocess.Pool(PROCESSES) as pool:
        params = [(1, ), (2, ), (3, ), (4, )]
        results = [pool.apply_async(double, p) for p in params]

        for r in results:
            print('\t', r.get())


In [14]:
driver_func()

	 2
	 4
	 6
	 8


In [17]:
from multiprocess import Process, Queue

def f(q):
    q.put('hello world')

q = Queue()
p = Process(target=f, args=[q])
p.start()
print (q.get())
p.join()

hello world


dryRun = True

## TODO:
 - compare by file attributes, basically just size and name
     -  then by hash if same name/size
 - if name/size disagree, put in a conflict csv to review later



Bold 	**bold text**
*italicized text*

Blockquote 	> blockquote
Ordered List 	
1. First item
2. Second item
3. Third item
Unordered List 	
- First item
- Second item
- Third item
Code 	`code`
Horizontal Rule 	---
Link 	
[title](https://www.example.com)
Image 	
![alt text](image.jpg)

### Results 
| Stretch/Untouched | ProbDistribution | Accuracy |
| :- | -: | :-: |
| Stretched | Gaussian | .843

 ```
{
  "firstName": "John",
  "lastName": "Smith",
  "age": 25
}
``` 
footnote 	Here's a sentence with a footnote. [^1]

[^1]: This is the footnote.
Heading ID 
### My Great Heading {#custom-id}
Definition List 	
term
: definition

Task List 	
- [x] Write the press release
- [ ] Update the website

