Skip to content

Commit

Permalink
Improve handling and detection of region codes and names (#20420)
Browse files Browse the repository at this point in the history
* reformat region list

* Provide a list of current and previous region names

* update tests

* use a fuzzy match for searching for region name

* add back some revised regions and regions names

* Use db ip database to identify possible alt names

* update region mappings

* fix test

* Fix typos

* Ensure there's a new line after progress bar line

---------

Co-authored-by: Michal Kleiner <michal@innocraft.com>
  • Loading branch information
sgiehl and michalkleiner committed Jun 6, 2023
1 parent f6a668a commit 742cbe1
Show file tree
Hide file tree
Showing 9 changed files with 46,682 additions and 4,997 deletions.
176 changes: 136 additions & 40 deletions plugins/GeoIp2/Commands/UpdateRegionCodes.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,29 @@
use Piwik\Development;
use Piwik\Http;
use Piwik\Plugin\ConsoleCommand;
use Piwik\Plugins\GeoIp2\LocationProvider\GeoIp2\Php;
use Piwik\Plugins\UserCountry\LocationProvider;

/**
* This command can be used to update the list of regions and their names that Matomo knows about.
* A list of iso regions is fetched from the iso-codes project. This list will then be used to update the regions array
* located in data/isoRegionNames
* - new regions will be added as current
* - changed names will be updated (adding the previous name as alternate name)
* - removed regions will be kept, but marked as not current
*
* Additionally, this command can be used to add regions that are returned by the DB-IP GeoIP database.
* As the DB-IP Lite database only contains region names, but no region codes, we try to map the returned name to a known
* region. As DB-IP in some cases returns names that differ from the official region name, Matomo would be unable to
* store those regions. To provide a better mapping, this command accepts the --db-ip-csv option.
* This option should provide the path to the DB-IP city lite database in CSV format. In addition, the paid DB-IP city
* (mmdb) database should be configured in Matomo as location provider.
* The command will then iterate through all IP ranges defined in the CSV database and perform a look-up using the
* location provider. The returned region ISO code and region name are then compared with those included in the regions
* array. Missing regions will be added (as not current); mismatching names will be added as alternate names.
* This ensures that regions returned by the lite database can be mapped correctly.
* Attention: Using this option will take a couple of hours to process.
*/
class UpdateRegionCodes extends ConsoleCommand
{
public $source = 'https://salsa.debian.org/iso-codes-team/iso-codes/-/raw/main/data/iso_3166-2.json';
Expand All @@ -20,6 +42,7 @@ protected function configure()
{
$this->setName('usercountry:update-region-codes');
$this->setDescription("Updates the ISO region names");
$this->addOptionalValueOption('db-ip-csv', null, 'Uses the provided DB IP CSV database to iterate over all included IP ranges.');
}

public function isEnabled()
Expand All @@ -33,6 +56,7 @@ public function isEnabled()
protected function doExecute(): int
{
$output = $this->getOutput();
$input = $this->getInput();

$regionsFile = __DIR__ . '/../data/isoRegionNames.php';

Expand Down Expand Up @@ -60,63 +84,67 @@ protected function doExecute(): int

$newRegions = [];
foreach ($regionData['3166-2'] as $region) {

// some fixes of incorrect region codes
if ($region['code'] === 'SS-EE8') {
$region['code'] = 'SS-EE';
}
if ($region['code'] === 'ML-BK0') {
$region['code'] = 'ML-BKO';
}
if ($region['code'] === 'IQ-SW') {
$region['code'] = 'IQ-SU';
}
if ($region['code'] === 'MU-RP') {
$region['code'] = 'MU-RR';
}

list($countryCode, $regionCode) = explode('-', $region['code']);
$newRegions[$countryCode][$regionCode] = $region['name'];
}

$currentRegions = include $regionsFile;

// regions for Saint Lucia missing in iso-codes
if (empty($newRegions['LC']) && !empty($currentRegions['LC'])) {
$newRegions['LC'] = $currentRegions['LC'];
$newRegions[$countryCode][$regionCode] = [
'name' => $region['name'],
'altNames' => [],
'current' => true
];
}

// regions for Republic of Côte d'Ivoire still outdated in iso-codes
$newRegions['CI'] = $currentRegions['CI'];

// regions missing in iso-codes
$isoCodesMissing = [
'AR-F', 'BI-MY', 'DO-31', 'DO-32', 'DO-33', 'DO-34', 'DO-35', 'DO-36', 'DO-37', 'DO-38', 'DO-39', 'DO-40', 'DO-41', 'DO-42',
'EG-LX', 'HT-NI', 'IQ-KI', 'IR-32', 'KG-GO', 'KZ-BAY', 'LR-GP', 'LR-RG', 'MK-85', 'QA-SH', 'SD-GK', 'SI-212', 'SI-213',
'TH-38', 'TJ-DU', 'TJ-RA', 'TT-MRC', 'TT-TOB', 'YE-HU'
];
ksort($newRegions);

foreach ($isoCodesMissing as $isoCode) {
list($countryCode, $regionCode) = explode('-', $isoCode);
$currentRegions = include $regionsFile;

if (!empty($newRegions[$countryCode][$regionCode])) {
continue; // skip if it was already icnluded
foreach ($currentRegions as $countryCode => $regions) {
foreach ($regions as $regionCode => $regionData) {
if (isset($newRegions[$countryCode][$regionCode])) {
$newRegions[$countryCode][$regionCode]['altNames'] = $regionData['altNames'];

if (
$newRegions[$countryCode][$regionCode]['name'] !== $regionData['name']
&& !in_array($regionData['name'], $newRegions[$countryCode][$regionCode]['altNames'])
) {
$newRegions[$countryCode][$regionCode]['altNames'][] = $regionData['name'];
}

if (($key = array_search($newRegions[$countryCode][$regionCode]['name'], $newRegions[$countryCode][$regionCode]['altNames'])) !== false) {
unset($newRegions[$countryCode][$regionCode]['altNames'][$key]);
$newRegions[$countryCode][$regionCode]['altNames'] = array_values($newRegions[$countryCode][$regionCode]['altNames']);
}
} else {
$newRegions[$countryCode][$regionCode] = $regionData;
$newRegions[$countryCode][$regionCode]['current'] = false;
}
}

$newRegions[$countryCode][$regionCode] = $currentRegions[$countryCode][$regionCode];
ksort($newRegions[$countryCode], SORT_NATURAL);
}

ksort($newRegions);
$dbIpCsvFile = $input->getOption('db-ip-csv');

if (!empty($dbIpCsvFile)) {
$this->enrichWithDbIpRegions($dbIpCsvFile, $newRegions);
}

if (json_encode($newRegions) === json_encode($currentRegions)) {
$output->writeln('');
$output->writeln('Everything already up to date <fg=green>✓</>');
return self::SUCCESS;
}

$content = <<<CONTENT
<?php
// Generated file containing all ISO region codes and names
// The below list contains all ISO region codes and names known to Matomo
// Format:
// <CountryCode> => [
// <RegionCode> => [
// 'name' => <CurrentISOName>
// 'altNames' => [
// // list of previous names or names used by GeoIP providers like db-ip
// ],
// 'current' => <bool> indicating if the iso code is currently used
// ]
// ]
return
CONTENT;

Expand All @@ -128,4 +156,72 @@ protected function doExecute(): int

return self::SUCCESS;
}

/**
 * Adds regions and alternate region names returned by the GeoIP look-up to the given region list.
 *
 * The CSV file is read line by line. The first column of every row is used as IP address for a look-up
 * with the GeoIp2 Php location provider. Rows are skipped when the look-up yields no country code, region
 * code or region name, or when the country is not part of the region list.
 * - a region code that is unknown for the country is added with 'current' => false
 * - a region name that differs from the stored name and is not yet an alternate name is appended to 'altNames'
 *
 * Nothing is changed if the location provider reports that region codes are not supported or if the CSV
 * file cannot be opened.
 *
 * @param string $dbIpCsvFile path to the CSV file to iterate over
 * @param array $regions region list indexed by country code and region code, each entry holding the keys
 *                       'name', 'altNames' and 'current'; modified in place
 * @return void
 */
private function enrichWithDbIpRegions(string $dbIpCsvFile, array &$regions)
{
    $output = $this->getOutput();
    $output->writeln('Start looking through GeoIP database for missing region names');

    $php = new Php();

    $supportedInfo = $php->getSupportedLocationInfo();

    if (empty($supportedInfo[LocationProvider::REGION_CODE_KEY])) {
        $output->writeln(' <fg=red>X Region codes not supported by currently used GeoIP database. Skipping.</>');
        return;
    }

    // fopen() returns false when the file can't be opened. Passing that on to feof()/fgets() would either
    // throw a TypeError or loop forever, so bail out before the progress bar is started.
    $handle = fopen($dbIpCsvFile, 'r');

    if ($handle === false) {
        $output->writeln(" <fg=red>X Unable to open DB IP CSV file $dbIpCsvFile. Skipping.</>");
        return;
    }

    $output->writeln('Iterating through all IPv4 addresses...');

    // NOTE(review): presumably the approximate number of rows of the DB-IP CSV at the time of writing - confirm
    $this->initProgressBar(6396645);

    try {
        // fgets() returns false at the end of the file, so no invalid value is ever handed to str_getcsv()
        while (($line = fgets($handle)) !== false) {
            $this->advanceProgressBar();

            $csv = str_getcsv($line);
            $ip = $csv[0] ?? '';

            if (empty($ip)) {
                continue;
            }

            $location = $php->getLocation(['ip' => $ip]);

            $countryCode = $location[LocationProvider::COUNTRY_CODE_KEY] ?? null;
            $regionCode = $location[LocationProvider::REGION_CODE_KEY] ?? null;
            $regionName = $location[LocationProvider::REGION_NAME_KEY] ?? null;

            if (empty($countryCode) || empty($regionCode) || empty($regionName)) {
                continue;
            }

            // only countries already contained in the list are enriched
            if (!array_key_exists($countryCode, $regions)) {
                continue;
            }

            if (!array_key_exists($regionCode, $regions[$countryCode])) {
                // empty line first, so the message doesn't end up on the progress bar line
                $output->writeln('');
                $output->writeln("Adding missing region $regionName ($regionCode) for country $countryCode <fg=green>✓</>");
                $regions[$countryCode][$regionCode] = [
                    'name' => $regionName,
                    'altNames' => [],
                    'current' => false,
                ];
            } elseif (
                $regionName !== $regions[$countryCode][$regionCode]['name']
                && !in_array($regionName, $regions[$countryCode][$regionCode]['altNames'], true)
            ) {
                $output->writeln('');
                $output->writeln("Adding alternate region name $regionName to region {$regions[$countryCode][$regionCode]['name']} ($regionCode) for country $countryCode <fg=green>✓</>");
                $regions[$countryCode][$regionCode]['altNames'][] = $regionName;
            }
        }
    } finally {
        // release the file handle even if a look-up throws
        fclose($handle);
    }

    $this->finishProgressBar();
}
}
18 changes: 18 additions & 0 deletions plugins/GeoIp2/LocationProvider/GeoIp2.php
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,24 @@ public static function getRegionNameFromCodes($countryCode, $regionCode)
* @return array
*/
public static function getRegionNames()
{
    $regionNamesByCountry = [];

    // Build a fresh array instead of rewriting the entries through foreach references: references that are
    // not unset after the loop keep pointing at the last element and are an easy source of subtle bugs.
    foreach (self::getRegions() as $countryCode => $regions) {
        // keep countries without any region as an empty list
        $regionNamesByCountry[$countryCode] = [];

        foreach ($regions as $regionCode => $regionData) {
            // reduce the region record to its 'name' entry
            $regionNamesByCountry[$countryCode][$regionCode] = $regionData['name'];
        }
    }

    return $regionNamesByCountry;
}

/**
* Returns an array of region data (name, alternate names, current flag) mapped by country code & region code.
*
* @return array
*/
public static function getRegions()
{
if (is_null(self::$regionNames)) {
self::$regionNames = require_once __DIR__ . '/../data/isoRegionNames.php';
Expand Down
37 changes: 33 additions & 4 deletions plugins/GeoIp2/LocationProvider/GeoIp2/Php.php
Original file line number Diff line number Diff line change
Expand Up @@ -284,21 +284,43 @@ protected function setCityResults($lookupResult, &$result)
*/
protected function determineRegionIsoCodeByNameAndCountryCode($regionName, $countryCode)
{
    // fuzzyMatch() only accepts strings, and an empty name can't identify a region
    // NOTE(review): callers are not visible here - confirm that no caller relies on an error for non-string names
    if (!is_string($regionName) || $regionName === '') {
        return '';
    }

    $regions = self::getRegions();

    if (empty($regions[$countryCode])) {
        return '';
    }

    foreach ($regions[$countryCode] as $isoCode => $regionData) {
        // the region's name is checked first
        if ($this->fuzzyMatch($regionData['name'], $regionName)) {
            return $isoCode;
        }

        // afterwards all alternate names of the region, if there are any
        foreach ($regionData['altNames'] ?? [] as $altName) {
            if ($this->fuzzyMatch($altName, $regionName)) {
                return $isoCode;
            }
        }
    }

    // no region of the country matches the given name
    return '';
}

/**
 * Checks if two names are equal when ignoring case and, if iconv is available, diacritics.
 *
 * @param string $str1 first name, handled as UTF-8
 * @param string $str2 second name, handled as UTF-8
 * @return bool true if both names match case-insensitively, either directly or after both have been
 *              transliterated to ASCII
 */
private function fuzzyMatch(string $str1, string $str2): bool
{
    // strtolower() works byte-wise and leaves non-ASCII characters untouched, so use the multibyte variant
    // to also match names that only differ in the case of such characters
    if (mb_strtolower($str1, 'UTF-8') === mb_strtolower($str2, 'UTF-8')) {
        return true;
    }

    // try converting umlauts to closest ascii char if iconv is available
    if (!function_exists('iconv')) {
        return false;
    }

    $ascii1 = iconv('UTF-8', 'ASCII//TRANSLIT', $str1);
    $ascii2 = iconv('UTF-8', 'ASCII//TRANSLIT', $str2);

    // iconv() returns false if the conversion fails. Two failed conversions must not be reported as a
    // match, which would happen when comparing the lower-cased results ('' === '').
    if ($ascii1 === false || $ascii2 === false) {
        return false;
    }

    // both strings are plain ASCII at this point
    return strtolower($ascii1) === strtolower($ascii2);
}

protected function determinSubdivision($subdivisions, $countryCode)
{
if (in_array($countryCode, ['GB'])) {
Expand Down Expand Up @@ -355,7 +377,6 @@ public function getSupportedLocationInfo()
switch ($reader->metadata()->databaseType) {
case 'GeoIP2-Enterprise':
case 'GeoLite2-City':
case 'DBIP-City-Lite':
case 'DBIP-City':
case 'GeoIP2-City':
case 'GeoIP2-City-Africa':
Expand All @@ -374,6 +395,14 @@ public function getSupportedLocationInfo()
$result[self::LATITUDE_KEY] = true;
$result[self::LONGITUDE_KEY] = true;
break;
case 'DBIP-City-Lite':
$result[self::REGION_CODE_KEY] = false;
$result[self::REGION_NAME_KEY] = true;
$result[self::CITY_NAME_KEY] = true;
$result[self::POSTAL_CODE_KEY] = false;
$result[self::LATITUDE_KEY] = true;
$result[self::LONGITUDE_KEY] = true;
break;
}
}

Expand Down

0 comments on commit 742cbe1

Please sign in to comment.