# THE GREAT FILE DE-DUPLICATOR
### A solution to my fractured backups
A program of 1000 lines begins with dependency imports


In [1]:
import sys
import time
from collections import Counter
from pathlib import Path
from pprint import pprint, pformat

import pandas as pd
from IPython.display import clear_output, display, HTML
from multiprocess import Pool, cpu_count

print('done')

done


### Enter your parameters
Enter where you want to output your analysis files, what directories you want to analyze for duplicates and which folder names you wish to exclude from the analysis.
Mind the backslashes playing nice with python on windows.

**THE ORDER OF THE ENTRIES IN dedupeDirs IS THE PREFERRED ORDER FOR RETENTION IF DRY RUN IS DISABLED! MAKE SURE YOU ARE OK WITH DUPLICATE FILES IN LATER ENTRIES OF THE LIST BEING DELETED!**


In [2]:
# Root directories to scan for duplicates, in retention-priority order:
# position 0 is the most preferred location, later positions lose out.
# NOTE(review): on Windows a bare drive such as 'G:' is drive-relative (the
# current directory on that drive), not the drive root; use 'G:\\' if the root
# is what is meant — confirm.
dedupeDirs = (r'C:\Users\Ryan\Desktop',
               r'D:\Users\Ryan',
               'G:',
               'F:',
               #'H:\\',
               )
# Folder names to skip: a file is ignored when any component of its path
# exactly equals one of these names.
excludeDirs = {'$RECYCLE.BIN',
               'gdpr data',
               'steamapps',
               'Google Dump',
               'Windows',
              }
# File suffixes to skip (compared against every suffix of the file name,
# leading dot included).
excludeExtensions = {'.example',    
}

# Folder intended to receive the analysis output files.
outputDir = Path(r'C:\Users\Ryan\Desktop')

#Assign file retention priority to dedupeDirs based on order
#Early entries in the list will keep their files over later entries with duplicate files
print(dedupeDirs)

('C:\\Users\\Ryan\\Desktop', 'D:\\Users\\Ryan', 'G:', 'F:')


## Index all files in directories
For all directories listed in `dedupeDirs`, get each file's path, name, size, and last-modified time.
End result is a dataframe with all files listed within those dirs.

In [3]:
%%time
# Build one record (path, name, size, mtime) per regular file found under
# each directory in dedupeDirs, then load the records into a DataFrame.
dedupePaths = [Path(x) for x in dedupeDirs]

files = []
for workDir in dedupePaths:
    # Paths whose metadata could not be read. Reset for every directory, so the
    # error count printed below is per directory while len(files) is cumulative.
    errors = []
    print(f'running in {workDir}')
    for file in workDir.glob('**/*'):
        try:
            if not file.is_file():
                continue
            # Skip files that sit under an excluded folder name or carry an
            # excluded suffix.
            if excludeDirs.intersection(file.parts):
                continue
            if excludeExtensions.intersection(file.suffixes):
                continue
            # One stat() call per file: size and mtime come from the same
            # snapshot and the disk is hit half as often.
            fileStat = file.stat()
            fileDict = {'path':file,
                        'name':file.name,
                        'size':fileStat.st_size,
                        'mtime':fileStat.st_mtime,
                       }
            files.append(fileDict)
        except OSError:
            # Unreadable entries (permissions, vanished files, ...) are recorded
            # and skipped. Only OSError is caught so that Ctrl-C / kernel
            # interrupt still stops the scan.
            errors.append(file)
    print(f'{len(files)=}')
    print(f'{len(errors)=}')

print('loading dataframe')
df = pd.DataFrame(files)
print('dupeDataframe loaded')

running in C:\Users\Ryan\Desktop
len(files)=56923
len(errors)=0
running in D:\Users\Ryan
len(files)=208187
len(errors)=9
running in G:
len(files)=231981
len(errors)=0
running in F:
len(files)=252510
len(errors)=0
loading dataframe
dupeDataframe loaded
Wall time: 3min 5s


### Save your progress
The above steps could take a while depending on how much you've hoarded, so save the output to pickle and CSV files to noodle with at a later date.

In [4]:
%%time
# Checkpoint the file index into outputDir: a pickle for reloading into pandas
# and a CSV for inspection in other tools. Both files go to the configured
# output folder instead of one hard-coded path and the current directory.
df.to_pickle(outputDir / 'filelist.pkl')
df.to_csv(outputDir / 'filelist.csv', index = False)
print('pickled and csved df')

pickled and csved df
Wall time: 1.94 s


### Reload your progress
Reload if you saved to a pickle file.  You'll need to run the imports and the parameter code blocks before this.

In [None]:
%%time
# Restore the file index from the checkpoint in outputDir.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that this notebook wrote.
df = pd.read_pickle(outputDir / 'filelist.pkl')
print('loaded from file')

### Define functions used in duplicate processing

In [5]:
def mpCrc32(fileName):
    """Return the CRC-32 of a file's contents as 8 uppercase hex digits.

    The file is read in 64 KiB chunks. Failures do not raise: the function
    returns the string '<exception text>, <fileName>' in place of a checksum.
    """
    # Imported inside the function because of multiprocessing: the name has to
    # exist in the process that runs this function.
    from zlib import crc32

    chunkSize = 65536
    try:
        checksum = 0
        with open(fileName, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (end of file).
            for chunk in iter(lambda: stream.read(chunkSize), b''):
                checksum = crc32(chunk, checksum)
        return "%08X" % (checksum & 0xFFFFFFFF)
    except Exception as err:
        return f'{err}, {fileName}'

print('done')

done


In [None]:
%%time
#shuffle your dataframe to pull semi randomly and so we don't linearly iterate over folders
df = df.sample(frac = 1)

# Use half of the reported CPUs, but never fewer than one worker:
# Pool(processes=0) raises ValueError on a single-CPU machine.
workerCount = max(1, cpu_count() // 2)
totalFiles = len(df)

crcList = []
with Pool(processes = workerCount, maxtasksperchild = 500) as p:
    # imap yields results in input order, so crcList lines up with the rows of df.
    # Counting from 1 lets the progress display reach 100% on the last file.
    for cnt, crc in enumerate(p.imap(func = mpCrc32, iterable = df['path'], chunksize = 5000), start = 1):
        crcList.append(crc)
        sys.stderr.write(f'\r{cnt/totalFiles:.0%}')
df['crc'] = crcList
df

2%

## Checkpoint your progress  after calculating CRCs
It takes a while to get this far since it's basically a full disk read, save your progress!

In [None]:
%%time
# Checkpoint the CRC-populated index into outputDir (pickle + CSV) so the full
# disk read does not have to be repeated. Both files go to the configured
# output folder instead of one hard-coded path and the current directory.
df.to_pickle(outputDir / 'filelistCRC.pkl')
df.to_csv(outputDir / 'filelistCRC.csv', index = False)
print('pickled CRC populated dataframe and csved df')

### Reload progress 

In [None]:
%%time
# Restore the CRC-populated index from the checkpoint in outputDir.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that this notebook wrote.
df = pd.read_pickle(outputDir / 'filelistCRC.pkl')
print('loaded from file')
print(f'{len(df)=}')

In [None]:
%%time
# Pair every scan root with its tier number: the position of the directory in
# dedupeDirs, so 0 is the first (most preferred) entry.
dedupeTiers = [(rootTier,Path(directory).as_posix()) for rootTier,directory in enumerate(dedupeDirs)]
print(dedupeTiers)

def getTier(file):
    """Return the tier number of the first dedupeDirs root that `file` lies under.

    `file` is a Path; it is matched by comparing its POSIX-style string against
    the POSIX-style string of each root with str.startswith, in dedupeDirs order.

    Raises:
        ValueError: if the path does not start with any configured root.
    """
    posixFile = file.as_posix()
    for rootTier, posix_path in dedupeTiers:
        if posixFile.startswith(posix_path):
            return rootTier
    raise ValueError(f'{posixFile} is not under any directory in dedupeDirs')

# Sanity check on the first row. iloc is positional, so this works whatever
# the (shuffled) index labels are.
samplePath = df['path'].iloc[0]
print(samplePath.as_posix())
print(getTier(samplePath))
df['rootTier'] = df['path'].apply(getTier)

df.dtypes

In [None]:
%%time
# Order the rows by file name, then tier, then path length (shortest first)
# ahead of the group-by. pathLen is a helper column that only exists for the
# duration of the sort.
df = (
    df.assign(pathLen = df['path'].map(lambda filePath: len(filePath.as_posix())))
      .sort_values(by = ['name', 'rootTier', 'pathLen'])
      .drop(columns = 'pathLen')
)
df

In [None]:
%%time

# Record each file's containing folder, to help find the folders most of the
# files live in.
df['parent'] = df['path'].map(lambda filePath: filePath.parent)

In [None]:
# Number of non-null values in each column of df.
df.count()

### Filter dataframe down to duplicate CRCs only

In [None]:
%%time
# Per-CRC row counts, largest first, restricted to CRCs that occur more than once.
grouped = (
    df.groupby('crc')
      .count()
      .sort_values('path', ascending = False)
      .loc[lambda counts: counts['path'] > 1]
)
duplicatedCRCs = grouped.index.tolist()
# Narrow df to the rows whose CRC is shared with at least one other row.
df = df.loc[df['crc'].isin(duplicatedCRCs)]
grouped

### Only run on exact duplicates first

In [None]:
#%time
# Step through the CRC groups in their current row order and show, for four
# seconds each, every group whose files are spread over more than one rootTier.
grouped = df.groupby('crc', sort = False)

for cnt, (crc, group) in enumerate(grouped):
    tiers = group['rootTier']
    spansSeveralTiers = tiers.min() != tiers.max()
    # Single-row groups and the all-zero CRC are passed over, as are groups
    # whose files all sit in one tier.
    if len(group) > 1 and crc != '00000000' and spansSeveralTiers:
        clear_output()
        # First the rows in the highest-numbered tier, then the whole group.
        display(group[tiers == tiers.max()])
        display(group)
        time.sleep(4)


In [None]:
# Rank the duplicate records: lowest rootTier first, then shortest path, and
# store the resulting rank on each record as 'priority'.
# NOTE(review): dupeList is not defined in any cell shown here; this cell
# presumably depends on a cell that was removed — confirm.
# The tie-breaker measures the record's own path (str() accepts both plain
# strings and Path objects); len('path') was the constant 4 and had no effect.
dupeList = sorted(dupeList, key=lambda k: (k['rootTier'], len(str(k['path']))))
for cnt,file in enumerate(dupeList):
    file['priority'] = cnt 
pprint(dupeList)



### let's do some analysis by file suffixes

In [None]:
# Tally how often each file suffix occurs across all duplicate records.
# NOTE(review): dupeDict is not defined in any cell shown here; presumably it
# maps a key to a list of records that each carry a 'pathObj' Path — confirm.
# The loop variables use their own names so the notebook-level dupeList is not
# overwritten by this cell.
suffixTally = Counter(
    entry['pathObj'].suffix
    for entries in dupeDict.values()
    for entry in entries
)
# (suffix, count) pairs ordered from most to least frequent.
suffixesCounter = suffixTally.most_common()

#top 20 duplicated 
pprint(suffixesCounter[:20])

### start by figuring out duplication categories
 - same name, same size, same mtime, same hash different dedupeDirs
     - can safely delete one of them based on priority
 - same name, same size, different mtime
     - hash and branch logic if different
 - same name, different size 
     - send to manual review queue, potentially prefer the newer one

In [1]:
For my deletion logic I need a dry run that outputs a file listing each original path and the path it would be moved to (basically the recycle bin). I also need to be able to move files to, and restore them from, that recycle bin before permanent deletion.

Make sure this code to restore stuff from the recycle bin is well tested so that I don't break any processes I can't remember.

All unique files on my non-primary drive need to be pulled over and brought into the formal file structure

Look into whatever line is tech tips use that was basically the r seeing log rotate thing where I can automate my backups I need to have snapshots as well as like I don't schedule to

SyntaxError: invalid syntax (<ipython-input-1-11aecd33e5a8>, line 1)

### Let's get deleting files
Work out the priority of each file to consolidate everything on one disk first.

Dry run is enabled unless you disable it.

In [None]:
def sizeof_fmt(num, suffix='B'):
    """Format a number as a human-readable size with binary (1024-based) prefixes.

    Args:
        num: The quantity to format; negative values keep their sign.
        suffix: Unit text appended after the prefix (default 'B').

    Returns:
        A string with one decimal place, e.g. '1.5KiB'. Values of 1024 Zi or
        more are reported in Yi.
    """
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    # The f prefix was missing here, so the literal placeholder text was returned.
    return f"{num:.1f}Yi{suffix}"


In [None]:
from IPython.display import display, HTML

# Render df as an HTML table. DataFrame.to_html() returns a plain string, so it
# has to be wrapped in HTML(...) for the notebook to draw a table instead of
# printing the markup.
# NOTE: this renders every row of df; slice it first (e.g. df.head(100)) if the
# frame is large.
display(HTML(df.to_html()))


# Dry-run switch, enabled by default.
# NOTE(review): no code in the cells shown here reads this flag yet.
dryRun = True

## TODO:
 - compare by file attributes, basically just size and name
     -  then by hash if same name/size
 - if name/size disagree, put in a conflict csv to review later



Bold 	**bold text**
*italicized text*

Blockquote 	> blockquote
Ordered List 	
1. First item
2. Second item
3. Third item
Unordered List 	
- First item
- Second item
- Third item
Code 	`code`
Horizontal Rule 	---
Link 	
[title](https://www.example.com)
Image 	
![alt text](image.jpg)

### Results 
| Stretch/Untouched | ProbDistribution | Accuracy |
| :- | -: | :-: |
| Stretched | Gaussian | .843 |

 ```
{
  "firstName": "John",
  "lastName": "Smith",
  "age": 25
}
``` 
footnote 	Here's a sentence with a footnote. [^1]

[^1]: This is the footnote.
Heading ID 
### My Great Heading {#custom-id}
Definition List 	
term
: definition

Task List 	
- [x] Write the press release
- [ ] Update the website

