# THE GREAT FILE DE-DUPLICATOR
### A solution to my fractured backups
A program of 1000 lines begins with dependency imports


In [2]:
from pathlib import Path
import threading
import json
from collections import defaultdict
from pprint import pprint, pformat
import time
import logging
import pandas as pd

print('done')

done


### Enter your parameters
Enter where you want to output your analysis files, what directories you want to analyze for duplicates and which folder names you wish to exclude from the analysis.
Mind the backslashes playing nice with python on windows.

**THE ORDER THE deDupeDirs ARE IN ARE THE PREFERRED ORDER FOR RETENTION IF DRY RUN IS DISABLED! MAKE SURE YOU ARE OK WITH LATER ENTRIES IN THE LIST HAVING DUPLICATE FILES DISABLED!**


In [3]:
dedupeDirs = (r'C:\Users\Ryan\Desktop',
               #r'D:\Users\Ryan',
               'G:\\',
               'F:\\',
               #'H:\\',
               )
excludeDirs = {'$RECYCLE.BIN',
               'gdpr data',
               'steamapps',
               'Google Dump',
              }
excludeExtensions = {'.example',    
}
outputDir = Path(r'C:\Users\Ryan\Desktop')

#Assign file retention priority to dedupeDirs based on order
#Early entries in the list will keep their files over later entries with duplicate files
print(dedupeDirs)

('C:\\Users\\Ryan\\Desktop', 'G:\\', 'F:\\')


## Find duplicates based on file name
For a given directory listed in "deDupeDirs", get the filenames, last modified dates, and file size. 
End result of a list will be a dict with filename as keys and values that are lists of dicts of file paths and attributes

In [None]:
dupeSearchStart = time.time()
dedupePaths = [Path(x) for x in dedupeDirs]

nameSet = set()
files = []
dupeDict = defaultdict(list)
for wd in dedupePaths:
    print(f'running in {wd}')
    dirDupeSearchStart = time.time()
    for file in wd.glob('**/*'):
        if file.is_file() and not excludeDirs.intersection(set(file.parts)) and not excludeExtensions.intersection(set(file.suffixes)):
            fileDict = {'PathObj':file,
                        'path':file.as_posix(),
                        'size':file.stat().st_size,
                        'mtime':file.stat().st_mtime,
                        }
            try:
                dupeDict[file.name].append(fileDict)
            except:
                logging.exception(f"couldn't add this filedict, continuing \n {fileDict}")
                raise
    print('len of files parsed = ', len(dupeDict))
    print(f'processed {wd} in {time.time() - dirDupeSearchStart}s')

uniqueDict = {k:v for k,v in dupeDict.items() if len(v) == 1}
print('number of uniques = ', len(uniqueDict))
dupeDict   = {k:v for k,v in dupeDict.items() if len(v) > 1}
print('number of dupes = ', len(dupeDict))
print(f'dict load completed in {time.time() - dupeSearchStart}s')

running in C:\Users\Ryan\Desktop


### Save your progress
The above steps could take a while depending on how much you've hoarded, save the output to a JSON file to noodle with at a later date

In [8]:
writingStart = time.time()
print('writing to file')    
with open(outputDir.joinpath('duplicateFiles.json'),'w',encoding='utf8') as f:
    json.dump(dupeDict, f, indent = 1 )
    
with open(outputDir.joinpath('uniqueFiles.json'),'w',encoding='utf8') as f:
    json.dump(uniqueDict, f, indent = 1 )
print(f'done, took {time.time() - writingStart}s')


writing to file
done, took 2.026458501815796s


### Reload your progress
Reload if you saved to a JSON file.  You'll need to run imports and the parameter code blocks before this.

In [62]:
loadStart = time.time()
print('loading from file')    
with open(outputDir.joinpath('duplicateFiles.json'),'r',encoding='utf8') as f:
    dupeDict = json.load(f)
    
with open(outputDir.joinpath('uniqueFiles.json'),'r',encoding='utf8') as f:
    uniqueDict = json.load(f)
print(f'done, took {time.time() - loadStart}s')

loading from file
done, took 0.21904969215393066s


### start by figuring out duplication categories
 - same name, same size, same mtime, same hash different dedupeDirs
     - can safely delete one of them based on priority
 - same name, same size, different mtime
     - hash and branch logic if different
 - same name, different size 
     - send to manual review queue, potentially prefer the newer one

In [72]:
if type(dedupeDirs[0]) == str:
    dedupeDirs = [(rootTier,Path(directory).as_posix()) for rootTier,directory in enumerate(dedupeDirs)]

for k,dupeList in dupeDict.items(): 
    for cnt, file in enumerate(dupeList):
        #assign a tier to each identified dupe based on the directory order given in the first user parameters
        for rootTier, dedupeDir in dedupeDirs:
            if file['path'].startswith(dedupeDir):
                file['rootTier'] = rootTier
    dupeList = sorted(dupeList, key=lambda k: (k['rootTier'], len('path')))
    for cnt,file in enumerate(dupeList):
        file['priority'] = cnt 
    pprint(dupeList)

    break
    

[{'mtime': 1607128670.8325891,
  'path': 'C:/Users/Ryan/Desktop/desktop.ini',
  'priority': 0,
  'rootTier': 0,
  'size': 282},
 {'mtime': 1538450652.6097116,
  'path': 'C:/Users/Ryan/Desktop/Files/Home and Money/Tax/2015 Tax/desktop.ini',
  'priority': 1,
  'rootTier': 0,
  'size': 136},
 {'mtime': 1577863147.0110621,
  'path': 'C:/Users/Ryan/Desktop/Files/Music/desktop.ini',
  'priority': 2,
  'rootTier': 0,
  'size': 440},
 {'mtime': 1538450652.605711,
  'path': 'C:/Users/Ryan/Desktop/Files/photos/Old Photos/Trivia '
          'Pics/desktop.ini',
  'priority': 3,
  'rootTier': 0,
  'size': 136},
 {'mtime': 1616551386.9991388,
  'path': 'C:/Users/Ryan/Desktop/Files/Work/work git/SQL/desktop.ini',
  'priority': 4,
  'rootTier': 0,
  'size': 110},
 {'mtime': 1538450652.6097116,
  'path': 'G:/Home and Money/Tax/2015 Tax/desktop.ini',
  'priority': 5,
  'rootTier': 1,
  'size': 136},
 {'mtime': 1587357803.6098547,
  'path': 'G:/Music/desktop.ini',
  'priority': 6,
  'rootTier': 1,
  'siz

In [37]:
dupeList = sorted(dupeList, key=lambda k: (k['rootTier'], len('path')))
for cnt, file in enumerate(dupeList):
    file['priority'] = cnt 


### Let's get deleting files
Work out the priority of each file to conslidate everything on one disk first.

Dry run is enabled unless you disable it.

dryRun = True

## TODO:
 - compare by file attributes, basically just size and name
     -  then by hash if same name/size
 - if name/size disagree, put in a conflict csv to review later



Bold 	**bold text**
*italicized text*

Blockquote 	> blockquote
Ordered List 	
1. First item
2. Second item
3. Third item
Unordered List 	
- First item
- Second item
- Third item
Code 	`code`
Horizontal Rule 	---
Link 	
[title](https://www.example.com)
Image 	
![alt text](image.jpg)

### Results 
| Stretch/Untouched | ProbDistribution | Accuracy |
| :- | -: | :-: |
| Stretched | Gaussian | .843

 ```
{
  "firstName": "John",
  "lastName": "Smith",
  "age": 25
}
``` 
footnote 	Here's a sentence with a footnote. [^1]

[^1]: This is the footnote.
Heading ID 
### My Great Heading {#custom-id}
Definition List 	
term
: definition

Task List 	
- [x] Write the press release
- [ ] Update the website

