Skip to content

Commit

Permalink
[backend] be more clever in bs_mergechanges
Browse files Browse the repository at this point in the history
Now does something like a 3-way merge.
  • Loading branch information
mlschroe committed Jun 4, 2018
1 parent 32ebf4f commit 918c361
Showing 1 changed file with 70 additions and 35 deletions.
105 changes: 70 additions & 35 deletions src/backend/bs_mergechanges
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ BEGIN {
unshift @INC, "$wd";
}

use strict;

use POSIX;
use Data::Dumper;
use Getopt::Long;
Expand All @@ -44,15 +46,16 @@ merge tool is reading the entire files and sorts the entries according
to it's date. If entries with same date do differ it fails. The classic
diff3 merge and manual conflict resolution is the only way then.
The tool takes any amount of files and is merging them into one
The tool takes any amount of files and is merging them into one. The first file
must be the common ancestor of the other files.
";
}

#
# Argument parsing
#
if ( @ARGV < 1 ){
if (@ARGV < 1) {
echo_help();
exit(1);
}
Expand All @@ -79,7 +82,7 @@ while (@ARGV) {
}
}

die("Give at least one file") if ( @files < 1 );
die("Give at least one file") if @files < 1;

# init
my $seperator = "-------------------------------------------------------------------";
Expand All @@ -94,21 +97,55 @@ sub time2mystr {
return strftime("%a %b %e %H:%M:%S UTC %Y", @lt);
}

# Find the entry of the first file (the one with a false 'fileno') in
# $allents that $ent most likely corresponds to.
#
# Parameters:
#   $ent     - hash ref with at least the keys 'text' and 'fileno'
#   $allents - array ref of entry hash refs that share $ent's timestamp
#
# Returns the matching entry hash ref, or undef if $ent itself has a false
# 'fileno' or if no candidate is similar enough.
#
# An entry with identical text always wins. Otherwise every candidate is
# scored by the fraction of its whitespace separated words that occur
# (as substrings) in $ent's text, and the best scoring candidate is
# returned if its score is above 0.75.
sub findsim {
  my ($ent, $allents) = @_;
  # entries with a false fileno are the candidates themselves, never match them
  return undef unless $ent->{'fileno'};
  my @in = grep {!$_->{'fileno'}} @$allents;
  for my $ent2 (@in) {
    return $ent2 if $ent2->{'text'} eq $ent->{'text'};
  }
  my ($best, $bestsim);
  for my $ent2 (@in) {
    my @w = split(' ', $ent2->{'text'});
    # a candidate without any words cannot be similar to anything
    next unless @w;
    my @wc = grep {$ent->{'text'} =~ /\Q$_\E/} @w;
    # matched words / all words, i.e. a value between 0 and 1
    my $sim = @wc / @w;
    # ">=" so that the last of several equally good candidates wins
    ($best, $bestsim) = ($ent2, $sim) if !defined($bestsim) || $sim >= $bestsim;
  }
  # compare the score, not the entry reference, against the threshold
  return $best if defined($bestsim) && $bestsim > .75;
  return undef;
}

# Pick the entry to use for $ent, given the entries collected in its
# 'sim' list and the total number of files $nfiles.
#
# Returns:
#   - $ent itself if it has no 'sim' list
#   - the last 'sim' entry if all 'sim' texts equal $ent's text and there
#     are $nfiles - 1 of them
#   - undef if all 'sim' texts equal $ent's text and there are $nfiles - 2
#     of them (presumably: one file dropped the entry)
#   - the last differing 'sim' entry if there are $nfiles - 1 'sim'
#     entries and all differing ones carry the same text
# Dies with a "Conflicting entries" message in every other case.
sub decide {
  my ($ent, $nfiles) = @_;
  my $matches = $ent->{'sim'};
  return $ent unless $matches;
  my @sim = @$matches;
  my $basetext = $ent->{'text'};
  my @changed = grep {$_->{'text'} ne $basetext} @sim;
  my $nsim = scalar(@sim);
  if (@changed) {
    if ($nsim == $nfiles - 1) {
      # all modified variants have to agree with each other
      my $last = $changed[-1];
      my @differing = grep {$_->{'text'} ne $last->{'text'}} @changed;
      return $last unless @differing;
    }
  } else {
    return $sim[-1] if $nsim == $nfiles - 1;
    return undef if $nsim == $nfiles - 2;
  }
  die("Conflicting entries for $ent->{time}\n");
}

sub setentry {
my ($time, $timestr, $email, $text) = @_;
if ($entries->{$time} && !$force) {
# entry exists already, is it the same?
if ($entries->{$time}->{text} ne $text) {
die("Two different entries for ".time2mystr($time));
my ($time, $timestr, $email, $text, $fileno) = @_;
my $ent = {'time' => $timestr, 'email' => $email, 'text' => $text, 'fileno' => $fileno};
if ($entries{$time}) {
my $siment = findsim($ent, $entries{$time});
if ($siment) {
push @{$siment->{'sim'}}, $ent;
return;
}
} else {
$entries->{$time}->{time} = $timestr;
$entries->{$time}->{email} = $email;
$entries->{$time}->{text} = $text;
}
push @{$entries{$time}}, $ent;
}

# read all files into a hash
my $fileno = 0;
my $nfiles = @files;
while (@files) {
my $file = shift @files;
local *F;
Expand All @@ -125,24 +162,17 @@ while (@files) {
my $cycle = 0;
foreach my $line (@lines) {
chomp($line);
unless($init) {
if($line eq $seperator) {
$init = 1;
} else {
die("no ---- seperator in first line")
}
if (!$init) {
die("no ---- seperator in first line\n") unless $line eq $seperator;
$init = 1;
next;
}

$cycle = $cycle + 1;
# old and upstream new entries where identical. So let our new version win.
$force = 1 if $cycle > 2;

unless($time) {
if (!$time) {
($timestr, $email) = split(' - ', $line, 2);
$time = str2time($timestr);
die("unable to parse time $line") unless $time;
die("unable to find email in time line") unless $email;
die("unable to parse time $line\n") unless $time;
die("unable to find email in $line\n") unless $email;
print "Read ".time2mystr($time)."($time) for $line\n" if $verbose;
next;
}
Expand All @@ -153,8 +183,8 @@ while (@files) {
# ignoring the hour due to timezone issues, but do not accept it anymore for current entries
# we take this as one blob.
# Accept this only for entries in 2006 and before with 00 minutes and 00 seconds
if ( $lt[5] > 106 || $lt[1] != 0 || $lt[0] != 0) {
setentry($time, $timestr, $email, $text);
if ($lt[5] > 106 || $lt[1] != 0 || $lt[0] != 0) {
setentry($time, $timestr, $email, $text, $fileno);
$text = "";
$time = undef;
$email = undef;
Expand All @@ -163,19 +193,24 @@ while (@files) {
}

# must be text
$text .= $line;
$text .= "\n";
};
$text .= "$line\n";
}
# last entry
setentry($time, $timestr, $email, $text);
setentry($time, $timestr, $email, $text, $fileno);
$fileno++;
}

print "Merged ouput:\n===========\n" if $verbose;

# output the hash
for my $time (sort{$b <=> $a} keys %$entries) {
print $seperator."\n";
print $entries->{$time}->{time}." - ".$entries->{$time}->{email}."\n";
print $entries->{$time}->{text};
for my $time (sort {$b <=> $a} keys %entries) {
my %seen;
for my $ent (@{$entries{$time}}) {
$ent = decide($ent, $nfiles);
next unless $ent && $ent->{'fileno'}; # ignore old stuff
next if $seen{$ent->{text}};
print "$seperator\n$ent->{time} - $ent->{email}\n$ent->{text}";
$seen{$ent->{text}} = 1;
}
}

0 comments on commit 918c361

Please sign in to comment.