# THE GREAT FILE DE-DUPLICATOR
### A solution to my fractured backups
A program of 1000 lines begins with dependency imports


In [2]:
from pathlib import Path
import threading
import json
from collections import defaultdict
from pprint import pprint, pformat
import time
import logging
print('done')

done


### Enter your parameters
Enter where you want to output your analysis files, what directories you want to analyze for duplicates and which folder names you wish to exclude from the analysis.
Mind the backslashes playing nice with python on windows.

**THE ORDER THE deDupeDirs ARE IN ARE THE PREFERRED ORDER FOR RETENTION IF DRY RUN IS DISABLED! MAKE SURE YOU ARE OK WITH LATER ENTRIES IN THE LIST HAVING DUPLICATE FILES DISABLED!**


In [41]:
dedupeDirs = (r'C:\Users\Ryan\Desktop',
               #r'D:\Users\Ryan',
               'G:\\',
               'F:\\',
               #'H:\\',
               )
excludeDirs = {'$RECYCLE.BIN',
               'gdpr data',
               'steamapps',
               'Google Dump',
              }
excludeExtensions = {'.example',    
}
outputDir = Path(r'C:\Users\Ryan\Desktop')

#Assign file retention priority to dedupeDirs based on order
#Early entries in the list will keep their files over later entries with duplicate files
print(dedupeDirs)

('C:\\Users\\Ryan\\Desktop', 'G:\\', 'F:\\')


## Find duplicates based on file name
For a given directory listed in "deDupeDirs", get the filenames, last modified dates, and file size. 
End result of a list will be a dict with filename as keys and values that are lists of dicts of file paths and attributes

In [29]:
dupeSearchStart = time.time()
dedupePaths = [Path(x) for x in dedupeDirs]

nameSet = set()
files = []
dupeDict = defaultdict(list)
for wd in dedupePaths:
    print(f'running in {wd}')
    dirDupeSearchStart = time.time()
    for file in wd.glob('**/*'):
        if file.is_file() and not excludeDirs.intersection(set(file.parts)) and not excludeExtensions.intersection(set(file.suffixes)):
            fileDict = {#'name':file.name,
                        'path':file.as_posix(),
                        'size':file.stat().st_size,
                        'mtime':file.stat().st_mtime,
                        }
            try:
                dupeDict[file.name].append(fileDict)
            except:
                logging.exception(f"couldn't add this filedict, continuing \n {fileDict}")
                raise
    print('len of files parsed = ', len(dupeDict))
    print(f'processed {wd} in {time.time() - dirDupeSearchStart}s')

uniqueDict = {k:v for k,v in dupeDict.items() if len(v) == 1}
print('number of uniques = ', len(uniqueDict))
dupeDict   = {k:v for k,v in dupeDict.items() if len(v) > 1}
print('number of dupes = ', len(dupeDict))
print(f'dict load completed in {time.time() - dupeSearchStart}s')

running in C:\Users\Ryan\Desktop
len of files parsed =  41526
processed C:\Users\Ryan\Desktop in 19.35436749458313s
running in G:\
len of files parsed =  43234
processed G:\ in 5.20017409324646s
running in F:\
len of files parsed =  46681
processed F:\ in 21.063753128051758s
number of uniques =  27750
number of dupes =  18931
dict load completed in 45.64630079269409s


### Save your progress
The above steps could take a while depending on how much you've hoarded, save the output to a JSON file to noodle with at a later date

In [30]:
writingStart = time.time()
print('writing to file')    
with open(outputDir.joinpath('duplicateFiles.json'),'w',encoding='utf8') as f:
    json.dump(dupeDict, f, indent = 1 )
    
with open(outputDir.joinpath('uniqueFiles.json'),'w',encoding='utf8') as f:
    json.dump(uniqueDict, f, indent = 1 )
print(f'done, took {time.time() - writingStart}s')


writing to file
done, took 1.1922683715820312s


### Reload your progress
Reload if you saved to a JSON file.

In [31]:
loadStart = time.time()
print('loading from file')    
with open(outputDir.joinpath('duplicateFiles.json'),'r',encoding='utf8') as f:
    dupeDict = json.load(f)
    
with open(outputDir.joinpath('uniqueFiles.json'),'r',encoding='utf8') as f:
    uniqueDict = json.load(f)
print(f'done, took {time.time() - loadStart}s')

loading from file
done, took 0.2160499095916748s


### start by figuring out duplication categories
 - same name, same size, same mtime, same hash different dedupeDirs
     - can safely delete one of them based on priority
 - same name, same size, different mtime
     - hash and branch logic if different
 - same name, different size 
     - send to manual review queue, potentially prefer the newer one

In [42]:
dedupeDirs = [(cnt,Path(directory).as_posix()) for cnt,directory in enumerate(dedupeDirs)]
for dupeList in dupeDict.values():
    for file in dupeList:
        for priority,directory in dedupeDirs:
            #print(priority,directory)
            if file['path'].startswith(directory):
                file['priority'] = priority
    print(dupeList)
    dupeList = sorted(dupeList, key=lambda k: k['priority']) 
    break
"""             
        deleting = dictValue[0]
        saving = dictValue[1]
        print(deleting)
        print(saving)
        if dryRun:
            print(f"dry run: would have deleted {deleting.name} as it's older than {saving.name}")
        else:
            print(f"deleting {deleting['path'].name} as it's older than {saving['path'].name}")
            #dictValue[0]['path'].unlink()
"""

[{'path': 'C:/Users/Ryan/Desktop/desktop.ini', 'size': 282, 'mtime': 1607128670.8325891, 'priority': 0}, {'path': 'C:/Users/Ryan/Desktop/Files/Home and Money/Tax/2015 Tax/desktop.ini', 'size': 136, 'mtime': 1538450652.6097116, 'priority': 0}, {'path': 'C:/Users/Ryan/Desktop/Files/Music/desktop.ini', 'size': 440, 'mtime': 1577863147.0110621, 'priority': 0}, {'path': 'C:/Users/Ryan/Desktop/Files/photos/Old Photos/Trivia Pics/desktop.ini', 'size': 136, 'mtime': 1538450652.605711, 'priority': 0}, {'path': 'C:/Users/Ryan/Desktop/Files/Work/work git/SQL/desktop.ini', 'size': 110, 'mtime': 1616551386.9991388, 'priority': 0}, {'path': 'G:/Home and Money/Tax/2015 Tax/desktop.ini', 'size': 136, 'mtime': 1538450652.6097116, 'priority': 1}, {'path': 'G:/Music/desktop.ini', 'size': 440, 'mtime': 1587357803.6098547, 'priority': 1}, {'path': 'G:/photos/Old Photos/Trivia Pics/desktop.ini', 'size': 136, 'mtime': 1538450652.605711, 'priority': 1}, {'path': 'F:/Files/Home and Money/Tax/2015 Tax/desktop.i

'             \n        deleting = dictValue[0]\n        saving = dictValue[1]\n        print(deleting)\n        print(saving)\n        if dryRun:\n            print(f"dry run: would have deleted {deleting.name} as it\'s older than {saving.name}")\n        else:\n            print(f"deleting {deleting[\'path\'].name} as it\'s older than {saving[\'path\'].name}")\n            #dictValue[0][\'path\'].unlink()\n'

### Let's get deleting files
Work out the priority of each file to conslidate everything on one disk first.

Dry run is enabled unless you disable it.

dryRun = True

## TODO:
 - compare by file attributes, basically just size and name
     -  then by hash if same name/size
 - if name/size disagree, put in a conflict csv to review later



Bold 	**bold text**
*italicized text*

Blockquote 	> blockquote
Ordered List 	
1. First item
2. Second item
3. Third item
Unordered List 	
- First item
- Second item
- Third item
Code 	`code`
Horizontal Rule 	---
Link 	
[title](https://www.example.com)
Image 	
![alt text](image.jpg)

### Results 
| Stretch/Untouched | ProbDistribution | Accuracy |
| :- | -: | :-: |
| Stretched | Gaussian | .843

 ```
{
  "firstName": "John",
  "lastName": "Smith",
  "age": 25
}
``` 
footnote 	Here's a sentence with a footnote. [^1]

[^1]: This is the footnote.
Heading ID 
### My Great Heading {#custom-id}
Definition List 	
term
: definition

Task List 	
- [x] Write the press release
- [ ] Update the website

