[backend] be more clever in bs_mergechanges

Now does something like a 3-way merge.
openSUSE · Jun 4, 2018 · 918c361 · 918c361
1 parent 32ebf4f
commit 918c361
Showing 1 changed file with 70 additions and 35 deletions.
diff --git a/src/backend/bs_mergechanges b/src/backend/bs_mergechanges
@@ -25,6 +25,8 @@ BEGIN {
   unshift @INC,  "$wd";
 }
 
+use strict;
+
 use POSIX;
 use Data::Dumper;
 use Getopt::Long;
@@ -44,15 +46,16 @@ merge tool is reading the entire files and sorts the entries according
 to it's date. If entries with same date do differ it fails. The classic
 diff3 merge and manual conflict resolution is the only way then.
 
-The tool takes any amount of files and is merging them into one
+The tool takes any amount of files and is merging them into one. The first file
+must be the common ancestor of the other files.
 
 ";
 }
 
 #
 # Argument parsing
 #
-if ( @ARGV < 1 ){
+if (@ARGV < 1) {
   echo_help();
   exit(1);
 }
@@ -79,7 +82,7 @@ while (@ARGV) {
   }
 }
 
-die("Give at least one file") if ( @files < 1 );
+die("Give at least one file") if @files < 1;
 
 # init
 my $seperator = "-------------------------------------------------------------------";
@@ -94,21 +97,55 @@ sub time2mystr {
   return strftime("%a %b %e %H:%M:%S UTC %Y", @lt);
 }
 
+sub findsim {	# find the ancestor (file 0) entry in @$allents that $ent is a version of, or undef
+  my ($ent, $allents) = @_;	# $ent: entry hash; $allents: array ref of entries stored for the same time
+  return undef unless $ent->{'fileno'};	# entries of the ancestor file itself are never matched
+  my %sim;	# similarity score (0..1) per candidate entry
+  my @in = grep {!$_->{'fileno'}} @$allents;	# candidates: entries that came from the ancestor file
+  for my $ent2 (@in) {
+    return $ent2 if $ent2->{'text'} eq $ent->{'text'};	# identical text wins outright
+  }
+  for my $ent2 (@in) {
+    my @w = split(' ', $ent2->{'text'});
+    my @wc = grep {$ent->{'text'} =~ /\Q$_\E/} @w;	# candidate words that also occur in $ent
+    $sim{$ent2} = @w ? @wc / @w : 0;	# fraction of candidate words found; guards against empty text
+  }
+  my @sorted = sort {$sim{$a} <=> $sim{$b}} @in;
+  return $sorted[-1] if @sorted && $sim{$sorted[-1]} > .75;	# test the score, not the reference itself
+  return undef;	# nothing similar enough
+}
+
+sub decide {	# pick the version of $ent to output: returns an entry, undef to drop it, or dies on conflict
+  my ($ent, $nfiles) = @_;	# $nfiles: number of input files
+  return $ent unless $ent->{'sim'};	# no other versions were attached to this entry
+  my @sim = @{$ent->{'sim'}};	# versions attached to this entry by setentry
+  my @changed = grep {$_->{'text'} ne $ent->{'text'}} @sim;	# versions whose text differs from $ent
+  if (!@changed) {
+    return $sim[-1] if @sim == $nfiles - 1;	# as many unchanged versions as other files: keep the entry
+    return undef if @sim == $nfiles - 2;	# one version fewer than other files, rest unchanged: drop it
+  }
+  if (@changed && @sim == $nfiles - 1) {
+    return $changed[-1] unless grep {$_->{'text'} ne $changed[-1]->{'text'}} @changed;	# all changed versions agree
+  }
+  die("Conflicting entries for $ent->{time}\n");
+}
+
 sub setentry {
-  my ($time, $timestr, $email, $text) = @_;
-  if ($entries->{$time} && !$force) {
-    # entry exists already, is it the same?
-    if ($entries->{$time}->{text} ne $text) {
-      die("Two different entries for ".time2mystr($time));
+  my ($time, $timestr, $email, $text, $fileno) = @_;	# $time: parsed time used as key; $fileno: index of the input file
+  my $ent = {'time' => $timestr, 'email' => $email, 'text' => $text, 'fileno' => $fileno};
+  if ($entries{$time}) {	# entries with this time were stored already
+    my $siment = findsim($ent, $entries{$time});
+    if ($siment) {
+      push @{$siment->{'sim'}}, $ent;	# remember it as a version of the matched entry
+      return;
     }
-  } else {
-    $entries->{$time}->{time} = $timestr;
-    $entries->{$time}->{email} = $email;
-    $entries->{$time}->{text} = $text;
   }
+  push @{$entries{$time}}, $ent;	# no match: store it as an entry of its own
 }
 
 # read all files into a hash
+my $fileno = 0;
+my $nfiles = @files;
 while (@files) {
   my $file = shift @files;
   local *F;
@@ -125,24 +162,17 @@ while (@files) {
   my $cycle = 0;
   foreach my $line (@lines) {
     chomp($line);
-    unless($init) {
-      if($line eq $seperator) {
-        $init = 1;
-      } else {
-        die("no ---- seperator in first line")
-      }
+    if (!$init) {
+      die("no ---- seperator in first line\n") unless $line eq $seperator;
+      $init = 1;
       next;
     }
 
-    $cycle = $cycle + 1;
-    # old and upstream new entries where identical. So let our new version win.
-    $force = 1 if $cycle > 2;
-
-    unless($time) {
+    if (!$time) {
       ($timestr, $email) = split(' - ', $line, 2);
       $time = str2time($timestr);
-      die("unable to parse time $line") unless $time;
-      die("unable to find email in time line") unless $email;
+      die("unable to parse time $line\n") unless $time;
+      die("unable to find email in $line\n") unless $email;
       print "Read ".time2mystr($time)."($time) for $line\n" if $verbose;
       next;
     }
@@ -153,8 +183,8 @@ while (@files) {
       # ignoring the hour due to timezone issues, but do not accept it anymore for current entries
       # we take this as one blob.
       # Accept this only for entries in 2006 and before with 00 minutes and 00 seconds
-      if ( $lt[5] > 106 || $lt[1] != 0 || $lt[0] != 0) {
-        setentry($time, $timestr, $email, $text);
+      if ($lt[5] > 106 || $lt[1] != 0 || $lt[0] != 0) {
+        setentry($time, $timestr, $email, $text, $fileno);
         $text = ""; 
         $time = undef;
         $email = undef;
@@ -163,19 +193,24 @@ while (@files) {
     }
 
     # must be text
-    $text .= $line;
-    $text .= "\n";
-  };
+    $text .= "$line\n";
+  }
   # last entry
-  setentry($time, $timestr, $email, $text);
+  setentry($time, $timestr, $email, $text, $fileno);
+  $fileno++;
 }
 
 print "Merged ouput:\n===========\n" if $verbose;
 
 # output the hash
-for my $time (sort{$b <=> $a} keys %$entries) {
-  print $seperator."\n";
-  print $entries->{$time}->{time}." - ".$entries->{$time}->{email}."\n";
-  print $entries->{$time}->{text};
+for my $time (sort {$b <=> $a} keys %entries) {	# highest time value first
+  my %seen;	# entry texts already printed for this time
+  for my $ent (@{$entries{$time}}) {
+    $ent = decide($ent, $nfiles);	# may return another version, or undef to drop the entry
+    next unless $ent && $ent->{'fileno'};	# ignore old stuff
+    next if $seen{$ent->{text}};	# print each distinct text only once
+    print "$seperator\n$ent->{time} - $ent->{email}\n$ent->{text}";
+    $seen{$ent->{text}} = 1;
+  }
 }