dupinator-find_dupes.py
#! /usr/bin/python
# dupinator: find duplicate files under the directories given on the command
# line and delete every copy after the first in each duplicate set.
import os
import sys
import stat
import hashlib   # replaces the long-deprecated md5 module (Python 2.5+)

# Files grouped by size: only same-sized files can possibly be duplicates.
filesBySize = {}
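
# os.path.walk() callback: record every regular file of at least 100 bytes
# under dirname into filesBySize, keyed by size.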
def walker(arg, dirname, fnames):
    d = os.getcwd()
    os.chdir(dirname)
    try:
        fnames.remove('Thumbs')   # ignore 'Thumbs' entries if present
    except ValueError:
        pass
    for f in fnames:
        if not os.path.isfile(f):
            continue
        size = os.stat(f)[stat.ST_SIZE]
        if size < 100:
            continue   # ignore tiny files
        filesBySize.setdefault(size, []).append(os.path.join(dirname, f))
    os.chdir(d)
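
# Collect candidates from every directory named on the command line.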
for x in sys.argv[1:]:
    print 'Scanning directory "%s"....' % x
    os.path.walk(x, walker, filesBySize)
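
# First pass: hash only the first 1 KB of each same-sized file as a cheap
# pre-filter; files whose leading bytes differ cannot be identical.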
print 'Finding potential dupes...'
potentialDupes = []
potentialCount = 0
sizes = filesBySize.keys()
sizes.sort()
for k in sizes:
    inFiles = filesBySize[k]
    outFiles = []
    hashes = {}   # first-1 KB hash -> first path seen, or True once emitted
    if len(inFiles) == 1:
        continue   # a unique size cannot have duplicates
    print 'Testing %d files of size %d...' % (len(inFiles), k)
    for fileName in inFiles:
        if not os.path.isfile(fileName):
            continue
        aFile = open(fileName, 'rb')   # binary mode so the hash sees raw bytes
        hasher = hashlib.md5(aFile.read(1024))
        hashValue = hasher.digest()
        if hashValue in hashes:
            first = hashes[hashValue]
            if first is not True:
                # Emit the first holder of this hash exactly once.
                outFiles.append(first)
                hashes[hashValue] = True
            outFiles.append(fileName)
        else:
            hashes[hashValue] = fileName
        aFile.close()
    if len(outFiles):
        potentialDupes.append(outFiles)
        potentialCount = potentialCount + len(outFiles)
del filesBySize   # the size index is no longer needed

print 'Found %d files in %d sets of potential dupes...' % (potentialCount, len(potentialDupes))
print 'Scanning for real dupes...'
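
# Second pass: hash the complete contents of each remaining candidate and
# group the files whose full MD5 digests match.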
dupes = []
for aSet in potentialDupes:
    byHash = {}   # full-content MD5 -> paths in the order scanned
    for fileName in aSet:
        print 'Scanning file "%s"...' % fileName
        aFile = open(fileName, 'rb')
        hasher = hashlib.md5()
        while True:
            r = aFile.read(4096)
            if not len(r):
                break
            hasher.update(r)
        aFile.close()
        byHash.setdefault(hasher.digest(), []).append(fileName)
    # Every group of two or more identical files is a confirmed dupe set.
    for group in byHash.values():
        if len(group) > 1:
            dupes.append(group)
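
# Keep the first file of each confirmed dupe set and delete the rest.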
i = 0
for d in dupes:
    print 'Original is %s' % d[0]
    for f in d[1:]:
        i = i + 1
        print 'Deleting %s' % f
        os.remove(f)
    print
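
print 'Deleted %d files.' % i   # report the count accumulated by the loop above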