Skip to content

Commit

Permalink
Merge cd96cd1 into cb8eba1
Browse files Browse the repository at this point in the history
  • Loading branch information
gfr10598 committed Jan 26, 2019
2 parents cb8eba1 + cd96cd1 commit 6870367
Show file tree
Hide file tree
Showing 19 changed files with 625 additions and 397 deletions.
42 changes: 38 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,45 @@

Annotation integration service provides geolocation for IPv4 and IPv6 MaxMind databases from Google Cloud Storage.

If an annotation request is dated prior to August 2017, location data will be derived from
MaxMind GeoLiteLatest databases. Otherwise, data will be provided by
MaxMind GeoLite2 databases. The discrepancies between the provided databases are
provided below.

## API
### v1
This is the original, deprecated API, which includes an unwrapped array of RequestData objects. Its use is discouraged. Please use the v2 API, which has better support for batch requests, including returning the date of the Annotator used to provide the annotations.

### v2
The v2 API introduces a standard wrapper struct, beginning with a string specifying the version identifier, and an Info string that may include arbitrary request info, e.g. for use in tracing or debugging.

The v2 API is described in the api/v2 package in the api/v2 directory. The recommended GetAnnotations function is only available in the v2 package.

### Response contents
The annotation service will respond with the following data:
- IP Address range
- Postal Code
- Latitude
- Longitude
- Continent Code
- Country Code
- Country Name
- Metro Code
- City Name

---
# Code structure
The code is divided into the following packages:

* api - handles external API, including GetAnnotations() call which handles composing and sending requests, with retries.
* handler - receives incoming requests, handles marshalling, unmarshalling, interpretation of requests.
* loader - handles file downloads and decompression
* geoloader - maintains directory of available MaxMind files, and selects which file(s) to use for a given request. (Needs a lot of renaming)
* geolite2 and legacy - handles details of interpreting MaxMind files and creating annotators.
Currently this is divided into two packages, but should be merged.
* manager - handles caching of Annotators




---
## Maxmind Dataset details
MaxMind GeoLiteLatest databases include:
1. GeoLiteCity-Blocks.csv
- StartIPNum IPv4
Expand Down
6 changes: 3 additions & 3 deletions annotator-ss.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ resources:

automatic_scaling:
# We expect negligible load, so this is unlikely to trigger.
min_num_instances: 5
max_num_instances: 20
min_num_instances: 2
max_num_instances: 10
cool_down_period_sec: 1800
cpu_utilization:
target_utilization: 0.60
target_utilization: 0.70

# Note: add a public port for GCE auto discovery by prometheus.
# TODO(dev): are any values redundant or irrelevant?
Expand Down
10 changes: 5 additions & 5 deletions annotator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@ service: annotator

# TODO(dev): adjust CPU and memory based on actual requirements.
resources:
cpu: 10
cpu: 20
# Instances support between [(cpu * 0.9) - 0.4, (cpu * 6.5) - 0.4]
# Actual memory available is exposed via GAE_MEMORY_MB environment variable.
memory_gb: 60
memory_gb: 129

# TODO - Do we need any disk? Adjust once we understand requirements.
disk_size_gb: 100

automatic_scaling:
# We expect negligible load, so this is unlikely to trigger.
min_num_instances: 5
max_num_instances: 20
min_num_instances: 2
max_num_instances: 2
cool_down_period_sec: 1800
cpu_utilization:
target_utilization: 0.60
target_utilization: 0.70

# Note: add a public port for GCE auto discovery by prometheus.
# TODO(dev): are any values redundant or irrelevant?
Expand Down
3 changes: 3 additions & 0 deletions api/v2/api-v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,13 @@ func postWithRetry(ctx context.Context, url string, encodedData []byte) (*http.R
RequestTimeHistogram.WithLabelValues("timeout").Observe(float64(time.Since(start).Nanoseconds()) / 1e6)
return nil, ctx.Err()
}

// This is a recoverable error, so we should retry.
RequestTimeHistogram.WithLabelValues("retry").Observe(float64(time.Since(start).Nanoseconds()) / 1e6)

err = waitOneSecond(ctx)
if err != nil {
RequestTimeHistogram.WithLabelValues("context").Observe(float64(time.Since(start).Nanoseconds()) / 1e6)
return nil, err
}
}
Expand Down
44 changes: 5 additions & 39 deletions geolite2/geo-g2.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@ import (
"regexp"
"strconv"

"cloud.google.com/go/storage"
"github.com/m-lab/annotation-service/api"
"github.com/m-lab/annotation-service/loader"
"google.golang.org/api/iterator"
)

const (
Expand Down Expand Up @@ -236,12 +234,13 @@ func LoadIPListGLite2(reader io.Reader, idMap map[int]int) ([]IPNode, error) {
if backupIndex, err := lookupGeoID(record[2], idMap); err == nil {
index = backupIndex
} else {
// TODO There are an enormous number of these in the log. Why? What does it mean?
log.Println("Couldn't get a valid Geoname id!", record)
// TODO There are a lot of these in the log. Why? What does it mean?
// "193.200.150.0/24", "80.231.5.0/24", "86.62.30.0/24", "86.62.5.0/24"
log.Printf("Couldn't get a valid Geoname id! %#v\n", record)
//TODO: Add a prometheus metric here
}

}
// TODO - if error above, this might default to zero!!
newNode.LocationIndex = index
newNode.PostalCode = record[6]
newNode.Latitude, err = stringToFloat(record[7], "Latitude")
Expand Down Expand Up @@ -274,29 +273,6 @@ func LoadIPListGLite2(reader io.Reader, idMap map[int]int) ([]IPNode, error) {
return list, nil
}

// determineFilenameOfLatestGeolite2File will get a list of filenames
// from GCS and search through them, eventually returning either the
// latest filename or an error.
func determineFilenameOfLatestGeolite2File() (string, error) {
ctx := context.Background()
client, err := storage.NewClient(ctx)
if err != nil {
return "", err
}
prospectiveFiles := client.Bucket(api.MaxmindBucketName).Objects(ctx, &storage.Query{Prefix: api.MaxmindPrefix})
filename := ""
for file, err := prospectiveFiles.Next(); err != iterator.Done; file, err = prospectiveFiles.Next() {
if err != nil {
return "", err
}
if file.Name > filename && geoLite2Regex.MatchString(file.Name) {
filename = file.Name
}

}
return filename, nil
}

// LoadGeoLite2Dataset load the Geolite2 dataset with filename from bucket.
func LoadGeoLite2Dataset(filename string, bucketname string) (*GeoDataset, error) {
zip, err := loader.CreateZipReader(context.Background(), bucketname, filename)
Expand All @@ -313,17 +289,7 @@ func LoadGeoLite2Dataset(filename string, bucketname string) (*GeoDataset, error
log.Println("Error extracting date:", filename)
} else {
dataset.start = date
log.Println("Loaded", date.Format("20060102"), filename)
}
return dataset, nil
}

// LoadLatestGeolite2File will check GCS for the latest file, download
// it, process it, and load it into memory so that it can be easily
// searched, then it will return a pointer to that GeoDataset or an error.
func LoadLatestGeolite2File() (*GeoDataset, error) {
filename, err := determineFilenameOfLatestGeolite2File()
if err != nil {
return nil, err
}
return LoadGeoLite2Dataset(filename, api.MaxmindBucketName)
}
48 changes: 34 additions & 14 deletions geoloader/filename.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package geoloader

import (
"context"
"errors"
"flag"
"log"
"regexp"
"sort"
Expand All @@ -27,7 +29,7 @@ var earliestArchiveDate = time.Unix(1377648000, 0) // "August 28, 2013")
// provide the LatestDate() function.
// The current directory is regarded as immutable, but the pointer is dynamically updated, so accesses
// should only be done through getDirectory() and setDirectory().
var datasetDir = &directory{}
var datasetDir *directory
var datasetDirLock sync.RWMutex // lock to be held when accessing or updating datasetDir pointer.

func getDirectory() *directory {
Expand All @@ -42,8 +44,35 @@ func setDirectory(dir *directory) {
datasetDir = dir
}

var (
// ErrAnnotatorLoading is returned (externally) when an annotator is being loaded.
ErrAnnotatorLoading = errors.New("annotator is being loaded")

// These are UNEXPECTED errors!!
// ErrGoroutineNotOwner is returned when goroutine attempts to set annotator entry, but is not the owner.
ErrGoroutineNotOwner = errors.New("goroutine does not own annotator slot")
// ErrMapEntryAlreadySet is returned when goroutine attempts to set annotator, but entry is non-null.
ErrMapEntryAlreadySet = errors.New("annotator already set")
// ErrNilEntry is returned when map has a nil entry, which should never happen.
ErrNilEntry = errors.New("map entry is nil")
)

// This sets up a default directory for testing purposes.
func init() {
dir := newDirectory(10)
// Hack
if flag.Lookup("test.v") != nil {
date, _ := time.Parse("20060102", "20130828")
dir.Insert(date, "Maxmind/2013/08/28/20130828T184800Z-GeoLiteCity.dat.gz")
}
setDirectory(&dir)
}

type dateEntry struct {
date time.Time
// date and filenames are immutable.
date time.Time
// All filenames associated with this date/annotator.
// Only the first filename is currently required or used.
filenames []string
}

Expand Down Expand Up @@ -75,6 +104,7 @@ func (dir *directory) Insert(date time.Time, fn string) {
dir.entries[dateString] = entry
}

log.Println("Adding", dateString, fn)
entry.filenames = append(entry.filenames, fn)
}

Expand All @@ -92,6 +122,7 @@ func (dir *directory) latestDate() time.Time {
// Returns empty string if the directory is empty.
func (dir *directory) LastFilenameEarlierThan(date time.Time) string {
if len(dir.dates) == 0 {
log.Println("ERROR - no filenames")
return ""
}

Expand All @@ -111,7 +142,7 @@ var GeoLite2Regex = regexp.MustCompile(`Maxmind/\d{4}/\d{2}/\d{2}/\d{8}T\d{6}Z-G
var GeoLegacyRegex = regexp.MustCompile(`.*-GeoLiteCity.dat.*`)
var GeoLegacyv6Regex = regexp.MustCompile(`.*-GeoLiteCityv6.dat.*`)

// UpdateArchivedFilenames extracts the dataset filenames from downloader bucket
// UpdateArchivedFilenames updates the list of dataset filenames from GCS.
// This job is run at the beginning of deployment and daily cron job.
func UpdateArchivedFilenames() error {
old := getDirectory()
Expand All @@ -136,10 +167,6 @@ func UpdateArchivedFilenames() error {
if err != nil {
continue
}
if fileDate.Before(GeoLite2StartDate) {
// temporary hack to avoid legacy
continue
}

if !fileDate.Before(GeoLite2StartDate) && !GeoLite2Regex.MatchString(file.Name) {
continue
Expand All @@ -156,13 +183,6 @@ func UpdateArchivedFilenames() error {
return nil
}

// Latest returns the date of the latest dataset.
// May return time.Time{} if no dates have been loaded.
func LatestDatasetDate() time.Time {
dd := getDirectory()
return dd.latestDate()
}

// BestAnnotatorName returns the dataset filename for annotating the requested date.
func BestAnnotatorName(date time.Time) string {
dd := getDirectory()
Expand Down
30 changes: 30 additions & 0 deletions geoloader/filename_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,33 @@ func TestDir(t *testing.T) {
t.Error("wrong date", dir.LastFilenameEarlierThan(date("20100101", t)))
}
}

func TestBestAnnotatorName(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode")
}

err := geoloader.UpdateArchivedFilenames()
if err != nil {
t.Fatal(err)
}

tests := []struct {
date string
want string
}{
{"20170102", "Maxmind/2016/12/08/20161208T080000Z-GeoLiteCity.dat.gz"},
{"20180809", "Maxmind/2018/08/08/20180808T050355Z-GeoLite2-City-CSV.zip"},
{"20170814", "Maxmind/2017/08/08/20170808T080000Z-GeoLiteCity.dat.gz"},
{"20170902", "Maxmind/2017/09/01/20170901T004438Z-GeoLite2-City-CSV.zip"},
{"20170906", "Maxmind/2017/09/01/20170901T004438Z-GeoLite2-City-CSV.zip"},
}
for _, tt := range tests {
t.Run(tt.date, func(t *testing.T) {
d := date(tt.date, t)
if got := geoloader.BestAnnotatorName(d); got != tt.want {
t.Errorf("%s -> %v, want %v", tt.date, got, tt.want)
}
})
}
}
15 changes: 0 additions & 15 deletions geoloader/geoloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,11 @@
package geoloader

import (
"log"

"github.com/m-lab/annotation-service/api"
"github.com/m-lab/annotation-service/geolite2"
"github.com/m-lab/annotation-service/legacy"
)

// PopulateLatestData will search to the latest Geolite2 files
// available in GCS and will use them to create a new GeoDataset which
// it will place into the global scope as the latest version. It will
// do so safely with use of the currentDataMutex RW mutex. It it
// encounters an error, it will halt the program.
func GetLatestData() api.Annotator {
data, err := geolite2.LoadLatestGeolite2File()
if err != nil {
log.Fatal(err)
}
return data
}

func ArchivedLoader(filename string) (api.Annotator, error) {
if GeoLite2Regex.MatchString(filename) {
return geolite2.LoadGeoLite2Dataset(filename, api.MaxmindBucketName)
Expand Down

0 comments on commit 6870367

Please sign in to comment.