Skip to content

Commit

Permalink
issue #612: Improve speed and result quality for global translation m…
Browse files Browse the repository at this point in the history
…emory api

* For a global search, don't loop through all repositories, only those a locale supports (major perf gain for small locales)
* Simplify ShowResults::getTranslationMemoryResults(), send a prepared array without entities as keys
* Ignore empty localized strings in search results (happens when the entity is in the repo but empty)

For locales that do everything, this has no perf impact; for most locales it will give a small perf boost (most locales don't do mozilla-central); for locales that have fewer repositories (just started and only have Aurora, for example, or only translating Gaia, or not translating Firefox for iOS), the perf gain can be significant.
For Guarani for example, request times are divided by 5 on my machine compared to our current code. Same for memory use.
  • Loading branch information
pascalchevrel committed Feb 9, 2016
1 parent d240a8b commit 58c3686
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 53 deletions.
47 changes: 25 additions & 22 deletions app/classes/Transvision/ShowResults.php
Expand Up @@ -93,32 +93,35 @@ public static function getSuggestionsResults($source_strings, $target_strings, $
* Return an array of search results from our Translation Memory API
* service with a quality index based on the levenshtein distance.
*
* @param array $source_strings The source reference strings with entities as keys
* @param array $target_strings The target strings to look into with entities as keys
* @param string $search The string to search for
* @param int $max_results Optional, default to 200, the max number of results we return
* @param int $min_quality Optional, default to 0, The minimal quality index to filter result
* @param array $strings The source and target strings to look into
* @param string $search The string to search for
* @param int $max_results Optional, default to 200, the max number of results we return
* @param int $min_quality Optional, default to 0, The minimal quality index to filter result
* @return array An array of strings as [source => string, target => string, quality=> Levenshtein index]
*/
public static function getTranslationMemoryResults($source_strings, $target_strings, $search, $max_results = 200, $min_quality = 0)
public static function getTranslationMemoryResults($strings, $search, $max_results = 200, $min_quality = 0)
{
$search_results = array_values(
self::getTMXResults(array_keys($source_strings), [$source_strings, $target_strings])
);
$output = [];
if (empty($strings)) {
return [];
}

foreach ($search_results as $set) {
// We only want results for which we have a translation
if ($set[1]) {
$quality = round(Strings::levenshteinQuality($search, $set[0]), 2);

if ($quality >= $min_quality) {
$output[] = [
'source' => $set[0],
'target' => $set[1],
'quality' => $quality,
];
}
/*
Here we prepare an output array with source and target strings plus
a quality index.
$set[0] is the source string (usually English) on which we
calculate a quality index based on the Levenshtein algorithm.
$set[1] is the target string, that is the language we want
translations from.
*/
foreach ($strings as $set) {
$quality = round(Strings::levenshteinQuality($search, $set[0]), 2);

if ($quality >= $min_quality) {
$output[] = [
'source' => $set[0],
'target' => $set[1],
'quality' => $quality,
];
}
}

Expand Down
40 changes: 14 additions & 26 deletions app/models/api/translation_memory.php
Expand Up @@ -13,11 +13,11 @@
};

$repositories = ($request->parameters[2] == 'global')
? Project::getRepositories()
? Project::getLocaleRepositories($request->parameters[4])
: [$request->parameters[2]];

$source_strings_merged = [];
$target_strings_merged = [];
// This is the filtered data we will send to getTranslationMemoryResults()
$output = [];

// The search
$initial_search = Utils::cleanString($request->parameters[5]);
Expand Down Expand Up @@ -49,35 +49,23 @@

/*
We are only interested in target strings with keys in common with our
source strings. Not sending noise to getTranslationMemoryResults() has
a major performance and memory impact.
source strings.
*/
$target_strings = array_intersect_key(
Utils::getRepoStrings($request->parameters[4], $repository),
$source_strings
);

/*
We are not interested in keeping duplicate strings that have
different keys because this API does not take into account the
frequency of matches but the similarity of the strings.
*/
$target_strings = array_unique($target_strings);

/*
The + operator is slightly faster than array_merge and also easier
to read. The functional difference doesn't matter in this case
(http://stackoverflow.com/questions/7059721/array-merge-versus/27717809#27717809)
*/
$source_strings_merged += $source_strings;
$target_strings_merged += $target_strings;
$target_strings = Utils::getRepoStrings($request->parameters[4], $repository);

foreach ($source_strings as $key => $value) {
if (isset($target_strings[$key]) && ! empty($target_strings[$key])) {
$output[] = [
$value,
$target_strings[$key],
];
}
}
unset($source_strings, $target_strings);
}

return $json = ShowResults::getTranslationMemoryResults(
$source_strings_merged,
$target_strings_merged,
$output,
$initial_search,
$get_option('max_results'), // Cap results with the ?max_results=number option
$get_option('min_quality') // Optional quality threshold defined by ?min_quality=50
Expand Down
16 changes: 11 additions & 5 deletions tests/units/Transvision/ShowResults.php
Expand Up @@ -44,6 +44,13 @@ public function getTranslationMemoryResultsDP()
$source = $tmx;
include TMX . 'fr/cache_fr_central.php';
$target = $tmx;

foreach ($source as $key => $value) {
if (isset($target[$key])) {
$strings[] = [$value, $target[$key]];
}
}

$results = [
[
'source' => 'Bookmark',
Expand Down Expand Up @@ -72,8 +79,7 @@ public function getTranslationMemoryResultsDP()

return [
[
$source,
$target,
$strings,
'Bookmark',
$results,
],
Expand All @@ -83,12 +89,12 @@ public function getTranslationMemoryResultsDP()
/**
* @dataProvider getTranslationMemoryResultsDP
*/
public function testGetTranslationMemoryResults($a, $b, $c, $d)
public function testGetTranslationMemoryResults($a, $b, $c)
{
$obj = new _ShowResults();
$this
->array($obj->getTranslationMemoryResults($a, $b, $c, 4))
->isEqualTo($d);
->array($obj->getTranslationMemoryResults($a, $b, 4))
->isEqualTo($c);
}

public function formatEntityDP()
Expand Down

0 comments on commit 58c3686

Please sign in to comment.