Skip to content

Commit

Permalink
Merge pull request #613 from pascalchevrel/speed
Browse files Browse the repository at this point in the history
issue #612: Improve speed and result quality for global translation memory api
  • Loading branch information
pascalchevrel committed Feb 9, 2016
2 parents d240a8b + 58c3686 commit d9793bc
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 53 deletions.
47 changes: 25 additions & 22 deletions app/classes/Transvision/ShowResults.php
Expand Up @@ -93,32 +93,35 @@ public static function getSuggestionsResults($source_strings, $target_strings, $
* Return an array of search results from our Translation Memory API
* service with a quality index based on the levenshtein distance.
*
* @param array $source_strings The source reference strings with entities as keys
* @param array $target_strings The target strings to look into with entities as keys
* @param string $search The string to search for
* @param int $max_results Optional, default to 200, the max number of results we return
* @param int $min_quality Optional, default to 0, The minimal quality index to filter result
* @param array $strings The source and target strings to look into
* @param string $search The string to search for
* @param int $max_results Optional, defaults to 200; the maximum number of results we return
* @param int $min_quality Optional, defaults to 0; the minimum quality index a result needs to be kept
* @return array An array of results, each as [source => string, target => string, quality => Levenshtein index]
*/
public static function getTranslationMemoryResults($source_strings, $target_strings, $search, $max_results = 200, $min_quality = 0)
public static function getTranslationMemoryResults($strings, $search, $max_results = 200, $min_quality = 0)
{
$search_results = array_values(
self::getTMXResults(array_keys($source_strings), [$source_strings, $target_strings])
);
$output = [];
if (empty($strings)) {
return [];
}

foreach ($search_results as $set) {
// We only want results for which we have a translation
if ($set[1]) {
$quality = round(Strings::levenshteinQuality($search, $set[0]), 2);

if ($quality >= $min_quality) {
$output[] = [
'source' => $set[0],
'target' => $set[1],
'quality' => $quality,
];
}
/*
Here we prepare an output array with source and target strings plus
a quality index.
$set[0] is the source string (usually English) on which we
calculate a quality index based on the Levenshtein algorithm.
$set[1] is the target string, that is, the string in the language we
want translations from.
*/
foreach ($strings as $set) {
$quality = round(Strings::levenshteinQuality($search, $set[0]), 2);

if ($quality >= $min_quality) {
$output[] = [
'source' => $set[0],
'target' => $set[1],
'quality' => $quality,
];
}
}

Expand Down
40 changes: 14 additions & 26 deletions app/models/api/translation_memory.php
Expand Up @@ -13,11 +13,11 @@
};

$repositories = ($request->parameters[2] == 'global')
? Project::getRepositories()
? Project::getLocaleRepositories($request->parameters[4])
: [$request->parameters[2]];

$source_strings_merged = [];
$target_strings_merged = [];
// This is the filtered data we will send to getTranslationMemoryResults()
$output = [];

// The search
$initial_search = Utils::cleanString($request->parameters[5]);
Expand Down Expand Up @@ -49,35 +49,23 @@

/*
We are only interested in target strings with keys in common with our
source strings. Not sending noise to getTranslationMemoryResults() has
a major performance and memory impact.
source strings.
*/
$target_strings = array_intersect_key(
Utils::getRepoStrings($request->parameters[4], $repository),
$source_strings
);

/*
We are not interested in keeping duplicate strings that have
different keys because this API does not take into account the
frequency of matches but the similarity of the strings.
*/
$target_strings = array_unique($target_strings);

/*
The + operator is slightly faster than array_merge and also easier
to read. The functional difference doesn't matter in this case
(http://stackoverflow.com/questions/7059721/array-merge-versus/27717809#27717809)
*/
$source_strings_merged += $source_strings;
$target_strings_merged += $target_strings;
$target_strings = Utils::getRepoStrings($request->parameters[4], $repository);

foreach ($source_strings as $key => $value) {
if (isset($target_strings[$key]) && ! empty($target_strings[$key])) {
$output[] = [
$value,
$target_strings[$key],
];
}
}
unset($source_strings, $target_strings);
}

return $json = ShowResults::getTranslationMemoryResults(
$source_strings_merged,
$target_strings_merged,
$output,
$initial_search,
$get_option('max_results'), // Cap results with the ?max_results=number option
$get_option('min_quality') // Optional quality threshold defined by ?min_quality=50
Expand Down
16 changes: 11 additions & 5 deletions tests/units/Transvision/ShowResults.php
Expand Up @@ -44,6 +44,13 @@ public function getTranslationMemoryResultsDP()
$source = $tmx;
include TMX . 'fr/cache_fr_central.php';
$target = $tmx;

foreach ($source as $key => $value) {
if (isset($target[$key])) {
$strings[] = [$value, $target[$key]];
}
}

$results = [
[
'source' => 'Bookmark',
Expand Down Expand Up @@ -72,8 +79,7 @@ public function getTranslationMemoryResultsDP()

return [
[
$source,
$target,
$strings,
'Bookmark',
$results,
],
Expand All @@ -83,12 +89,12 @@ public function getTranslationMemoryResultsDP()
/**
* @dataProvider getTranslationMemoryResultsDP
*/
public function testGetTranslationMemoryResults($a, $b, $c, $d)
public function testGetTranslationMemoryResults($a, $b, $c)
{
$obj = new _ShowResults();
$this
->array($obj->getTranslationMemoryResults($a, $b, $c, 4))
->isEqualTo($d);
->array($obj->getTranslationMemoryResults($a, $b, 4))
->isEqualTo($c);
}

public function formatEntityDP()
Expand Down

0 comments on commit d9793bc

Please sign in to comment.