-
Notifications
You must be signed in to change notification settings - Fork 8
/
treecomp.py
210 lines (175 loc) · 8.81 KB
/
treecomp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# -*- encoding: utf-8 -*-
# Copyright 2010 Mats Ekberg
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from builtins import object
from common import invert_dict
class TreeComparer(object):
def __init__(self, basetree, newtree):
""" A tree is defined by a dict on the form
{filename: fingerprint} """
assert type(basetree) == dict
assert type(newtree) == dict
self.basetree = basetree
self.newtree = newtree
self.__compare()
def __compare(self):
"""
Possible cases:
basetree newtree
=================== ===================
deleted.txt del --
modified.txt mod1 -- modified.txt mod2
unchanged.txt unch -- unchanged.txt unch
-- added.txt add
old_name.txt renm --
-- new_name.txt renm
Note that both sides can have additional files with hash "renm",
not shown in the table above, in any combination of file names.
"""
# Sets of filenames except for renamed_files, which keeps (old_name, new_name) tuples.
self.deleted_files = set()
self.modified_files = set()
self.unchanged_files = set()
self.added_files = set()
self.renamed_files = set()
newtree_by_hash = invert_dict(self.newtree)
newtree_covered = set() # We need this info, but don't want to mutate newtree.
# This loop covers all possible cases except files that are truly new
# in newtree (moves/renames are covered).
for base_fn, base_hash in self.basetree.items():
if base_fn in self.newtree:
if base_hash == self.newtree[base_fn]:
self.unchanged_files.add(base_fn)
else:
self.modified_files.add(base_fn)
newtree_covered.add(base_fn)
else:
if base_hash in newtree_by_hash:
for new_fn in newtree_by_hash[base_hash]:
# new_fn must be a truly new file, i.e. have no counterpart
# in basetree, and not been used as a rename target before.
if new_fn not in self.basetree and new_fn not in newtree_covered:
self.renamed_files.add((base_fn, new_fn))
newtree_covered.add(new_fn)
break
else:
# No suitable rename target found above, base_fn is truly deleted.
self.deleted_files.add(base_fn)
else:
self.deleted_files.add(base_fn)
# Any files in newtree not covered above must be truly new files
# with no counterparts in basetree at all.
for new_fn, new_hash in self.newtree.items():
if new_fn not in newtree_covered:
self.added_files.add(new_fn)
# Alternatively implementing the second loop immediately above
# symmetrically to the first loop is almost possible, but is made very
# difficult because it also had to reconstruct the rename pairs
# *exactly* as the first loop. ==> it is much better and easier as done above.
# for new_fn, new_hash in self.newtree.iteritems():
# if new_fn not in self.basetree:
# if new_hash in basetree_by_hash:
# for base_fn in basetree_by_hash[new_hash]:
# # ...
# if base_fn not in newtree and base_fn not in xxxx:
# # ...
# pass
# else:
# self.added_files.add(new_fn)
assert(len(self.modified_files) + len(self.unchanged_files) +
len(self.renamed_files) + len(self.deleted_files) == len(self.basetree))
assert(len(self.modified_files) + len(self.unchanged_files) +
len(self.renamed_files) + len(self.added_files) == len(self.newtree))
def __get_rename_deleted(self):
""" Returns the filenames in the basetree that have been "deleted by rename". """
ren_del_files = set()
for old_name, new_name in self.renamed_files:
ren_del_files.add(old_name)
return ren_del_files
def __get_rename_added(self):
""" Returns the filenames in the newtree that have been "added by rename". """
ren_add_files = set()
for old_name, new_name in self.renamed_files:
ren_add_files.add(new_name)
return ren_add_files
def as_sets(self):
# This method might as well be removed (or the self.*_files be declared "private")...
return self.unchanged_files, self.added_files, self.modified_files, self.deleted_files, self.renamed_files
def all_filenames(self):
return set(self.basetree.keys()).union(set(self.newtree.keys()))
def all_changed_filenames(self):
all_changed = self.added_files.union(self.__get_rename_added(),
self.modified_files, self.deleted_files, self.__get_rename_deleted())
assert(all_changed == self.all_filenames().difference(self.unchanged_files)) # Performance impact?
return all_changed
def is_deleted(self, filename):
return filename in self.deleted_files or filename in self.__get_rename_deleted()
def is_modified(self, filename):
return filename in self.modified_files
def is_new(self, filename):
return filename in self.added_files or filename in self.__get_rename_added()
def is_unchanged(self, filename):
return filename in self.unchanged_files
def __selftest():
# The rename_dup.txt files must never be used as counterparts for the
# old_name.txt or new_name.txt files, even though their contents is the same.
oldlist = {"deleted.txt": "deleted content",
"modified.txt": "modified content1",
"unchanged.txt": "unchanged content",
"old_name.txt": "renamed content",
"rename_dup.txt": "renamed content",
}
newlist = {"modified.txt": "modified content2",
"unchanged.txt": "unchanged content",
"added.txt": "new content",
"new_name.txt": "renamed content",
"rename_dup.txt": "renamed content",
}
comp = TreeComparer(oldlist, newlist)
assert comp.deleted_files == set(["deleted.txt"]), comp.deleted_files
assert comp.unchanged_files == set(["unchanged.txt", "rename_dup.txt"]), comp.unchanged_files
assert comp.added_files == set(["added.txt"]), comp.added_files
assert comp.modified_files == set(["modified.txt"]), comp.modified_files
assert comp.renamed_files == set([("old_name.txt", "new_name.txt")]), comp.renamed_files
assert comp.all_filenames() == set(["deleted.txt", "modified.txt", "unchanged.txt", "added.txt", "rename_dup.txt", "old_name.txt", "new_name.txt"])
assert comp.all_changed_filenames() == set(["deleted.txt", "modified.txt", "added.txt", "old_name.txt", "new_name.txt"])
assert comp.is_modified("modified.txt")
assert not comp.is_deleted("modified.txt")
assert not comp.is_unchanged("modified.txt")
assert not comp.is_new("modified.txt")
assert not comp.is_modified("deleted.txt")
assert comp.is_deleted("deleted.txt")
assert not comp.is_unchanged("deleted.txt")
assert not comp.is_new("deleted.txt")
assert not comp.is_modified("unchanged.txt")
assert not comp.is_deleted("unchanged.txt")
assert comp.is_unchanged("unchanged.txt")
assert not comp.is_new("unchanged.txt")
assert not comp.is_modified("added.txt")
assert not comp.is_deleted("added.txt")
assert not comp.is_unchanged("added.txt")
assert comp.is_new("added.txt")
assert not comp.is_modified("rename_dup.txt")
assert not comp.is_deleted("rename_dup.txt")
assert comp.is_unchanged("rename_dup.txt")
assert not comp.is_new("rename_dup.txt")
assert not comp.is_modified("old_name.txt")
assert comp.is_deleted("old_name.txt")
assert not comp.is_unchanged("old_name.txt")
assert not comp.is_new("old_name.txt")
assert not comp.is_modified("new_name.txt")
assert not comp.is_deleted("new_name.txt")
assert not comp.is_unchanged("new_name.txt")
assert comp.is_new("new_name.txt")
__selftest()