extra/_largecrawl/xmlstream.go

// Command xmlstream converts a stream of OAI records (XML, read from stdin) into
// newline-delimited JSON (written to stdout).
//
// It is built around a lightweight XML scanner on top of encoding/xml, which keeps the
// flexibility of xml.Unmarshal while allowing the parsing of huge XML files.
//
// TODO: extract more explicit info from the XML, like DOI, other identifiers, etc.
package main

import (
	"bufio"
	"encoding/json"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"reflect"
	"regexp"
	"strings"
	"time"
)

// Command-line flags.
var (
	// debug (-d) makes the program print the identifier of each record it finds
	// instead of encoding the record as JSON.
	debug = flag.Bool("d", false, "debug output")

	// skipDeleted (-D) drops records whose header status is "deleted".
	skipDeleted = flag.Bool("D", false, "skip delete records")
)

// Patterns used to pick structured identifiers out of free-form strings.
var (
	// doiRe matches "10.", one to eight digits, a slash and then everything up to
	// the next space character.
	doiRe = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*`)

	// issnRe matches four digits, an optional hyphen, three digits and a final
	// digit or x/X.
	issnRe = regexp.MustCompile(`[0-9]{4,4}-?[0-9]{3,3}[0-9xX]`)
)

// Scanner provides a way to read a stream of XML data. It uses an xml.Decoder internally to step
// through the XML elements of the stream.
//
// Decoder is the token source Scan reads from. element holds the value decoded by the most
// recent call to Scan (nil once reading a token fails). nameToType is the lookup table filled
// in by NewScanner. err records the error that stopped scanning; it may be io.EOF, which Err
// filters out.
type Scanner struct {
	Decoder    *xml.Decoder
	element    interface{}
	nameToType map[string]reflect.Type // map xml local name to element's type
	err        error
}

// NewScanner returns a new Scanner to read from r.
//
// Tags must be struct objects or pointers to struct objects, as defined by encoding/xml:
// https://pkg.go.dev/encoding/xml#Unmarshal
//
// Only the type of each tag is used, so a typed nil pointer such as (*T)(nil) is
// accepted as well. Every tag type is registered under the XML local name reported by
// elementName; Scan later decodes elements with that name into a fresh value of the
// type. The decoder runs in non-strict mode.
//
// NewScanner panics if a tag is an untyped nil or is not (a pointer to) a struct.
func NewScanner(r io.Reader, tags ...interface{}) *Scanner {
	s := &Scanner{
		Decoder:    xml.NewDecoder(r),
		nameToType: make(map[string]reflect.Type, len(tags)),
	}
	s.Decoder.Strict = false
	// Map the xml local name of an element to its underlying type.
	for _, tag := range tags {
		t := reflect.TypeOf(tag)
		if t == nil {
			// An untyped nil carries no type information at all.
			panic(fmt.Errorf("Tags must be of kind Struct but got %v", tag))
		}
		if t.Kind() == reflect.Ptr {
			t = t.Elem()
		}
		// Work from the type rather than the value: dereferencing a nil pointer
		// value would yield an invalid reflect.Value. A zero value is enough for
		// elementName, which only inspects the type and its struct tags.
		s.nameToType[elementName(reflect.Zero(t))] = t
	}
	return s
}

// elementName returns the XML local name used to recognise elements of v's type.
//
// The default is the Go type name. If the struct has a field named XMLName (or a
// field whose type prints as "xml.Name") carrying a non-empty name in its `xml` tag,
// that name is used instead. The tag is interpreted the way encoding/xml does: an
// optional namespace before the first space and any comma-separated options are
// dropped, so `xml:"http://example.org/ns record,omitempty"` yields "record".
//
// elementName panics if v is not of kind Struct.
func elementName(v reflect.Value) string {
	t := v.Type()
	if t.Kind() != reflect.Struct {
		panic(fmt.Errorf("Tags must be of kind Struct but got %s", t.Kind()))
	}
	name := t.Name()
	for i := 0; i < t.NumField(); i++ {
		field := t.Field(i)
		if field.Name != "XMLName" && field.Type.String() != "xml.Name" {
			continue
		}
		if local := xmlLocalName(field.Tag.Get("xml")); local != "" {
			name = local
		}
	}
	return name
}

// xmlLocalName extracts the local element name from the value of an `xml` struct tag,
// stripping a leading "namespace " prefix and trailing ",option" flags.
func xmlLocalName(tag string) string {
	if i := strings.IndexByte(tag, ' '); i >= 0 {
		tag = tag[i+1:]
	}
	if i := strings.IndexByte(tag, ','); i >= 0 {
		tag = tag[:i]
	}
	return tag
}

// Scan moves the Scanner forward to the next XML element whose local name belongs to
// one of the struct types registered with NewScanner and decodes it. The decoded value
// can then be fetched with Element.
//
// Scan returns false once scanning stops, which happens at the end of the input or on
// the first error. Afterwards Err reports the error, with io.EOF mapped to nil. Once
// an error has been recorded, every further call returns false immediately.
func (s *Scanner) Scan() bool {
	if s.err != nil {
		return false
	}
	for {
		tok, err := s.Decoder.Token()
		if err != nil {
			s.element, s.err = nil, err
			return false
		}
		// Only start tags can open an element we care about.
		start, isStart := tok.(xml.StartElement)
		if !isStart {
			continue
		}
		typ, known := s.nameToType[start.Name.Local]
		if !known {
			continue
		}
		// Decode the whole subtree rooted at this start tag into a fresh value.
		target := reflect.New(typ).Interface()
		s.err = s.Decoder.DecodeElement(target, &start)
		s.element = target
		return s.err == nil
	}
}

// Element returns the value produced by the latest call to Scan: a pointer to a struct
// of one of the types registered with NewScanner. It is nil after reading a token has
// failed, which includes reaching the end of the input.
func (s *Scanner) Element() interface{} {
	return s.element
}

// Err reports the error that stopped the Scanner, if any. Reaching the end of the
// input (io.EOF) is not considered an error and yields nil.
func (s *Scanner) Err() error {
	if s.err == io.EOF {
		return nil
	}
	return s.err
}

// Record was generated 2020-03-17 16:11:30 by tir on trieste.
//
// It maps an XML <record> element: a <header> (status attribute, identifier, datestamp
// and setSpec values), a <metadata> element wrapping a <dc> element with the fields
// declared below, and an <about> element. Fields tagged ",chardata" receive the
// character data of their element; fields tagged ",attr" receive attribute values.
type Record struct {
	XMLName xml.Name `xml:"record"`
	Text    string   `xml:",chardata"`
	Header  struct {
		Text       string   `xml:",chardata"`
		Status     string   `xml:"status,attr"`
		Identifier string   `xml:"identifier"`
		Datestamp  string   `xml:"datestamp"`
		SetSpec    []string `xml:"setSpec"`
	} `xml:"header"`
	Metadata struct {
		Text string `xml:",chardata"`
		Dc   struct {
			Text           string `xml:",chardata"`
			OaiDc          string `xml:"oai_dc,attr"`
			Dc             string `xml:"dc,attr"`
			Xsi            string `xml:"xsi,attr"`
			SchemaLocation string `xml:"schemaLocation,attr"`
			Doc            string `xml:"doc,attr"`
			Xmlns          string `xml:"xmlns,attr"`
			Cm             string `xml:"cm,attr"`
			Cs             string `xml:"cs,attr"`
			Spectrum       string `xml:"spectrum,attr"`
			Ns2            string `xml:"ns2,attr"`
			Title          []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
				Sub  string `xml:"sub"`
			} `xml:"title"`
			Creator []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
				ID   string `xml:"id,attr"`
			} `xml:"creator"`
			Description []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
			} `xml:"description"`
			Publisher []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
			} `xml:"publisher"`
			Date []struct {
				Text string `xml:",chardata"`
				Dc   string `xml:"dc,attr"`
				Lang string `xml:"lang,attr"`
			} `xml:"date"`
			Type []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
			} `xml:"type"`
			Format []struct {
				Text string `xml:",chardata"`
				Dc   string `xml:"dc,attr"`
				Lang string `xml:"lang,attr"`
			} `xml:"format"`
			Identifier []struct {
				Text   string `xml:",chardata"`
				Dc     string `xml:"dc,attr"`
				Jtitle string `xml:"jtitle"`
			} `xml:"identifier"`
			Source []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
			} `xml:"source"`
			Language []struct {
				Text string `xml:",chardata"`
				Dc   string `xml:"dc,attr"`
				Lang string `xml:"lang,attr"`
			} `xml:"language"`
			Relation []struct {
				Text string `xml:",chardata"`
				Dc   string `xml:"dc,attr"`
				Lang string `xml:"lang,attr"`
			} `xml:"relation"`
			Rights []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
			} `xml:"rights"`
			Contributor []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
			} `xml:"contributor"`
			Subject []struct {
				Text string `xml:",chardata"`
				Lang string `xml:"lang,attr"`
				Dc   string `xml:"dc,attr"`
			} `xml:"subject"`
			Coverage []struct {
				Text     string `xml:",chardata"`
				Lang     string `xml:"lang,attr"`
				Resource string `xml:"resource,attr"`
			} `xml:"coverage"`
			IdentifierURIFulltext []string `xml:"identifier.uri.fulltext"`
			Audience              struct {
				Text string `xml:",chardata"`
				Dc   string `xml:"dc,attr"`
			} `xml:"audience"`
			Doi    string `xml:"doi"`
			Extent string `xml:"extent"`
		} `xml:"dc"`
	} `xml:"metadata"`
	About string `xml:"about"`
}

// Info is some information out of OAI raw XML.
//
// It is the flat, JSON-serializable summary of a Record: the first group of fields
// comes from the record header, the second group holds string lists taken from the
// record's metadata. Every field is tagged omitempty, so empty values are left out of
// the encoded JSON.
type Info struct {
	OAI       string   `json:"oai,omitempty"`
	Status    string   `json:"status,omitempty"`
	Datestamp string   `json:"datestamp,omitempty"`
	Sets      []string `json:"sets,omitempty"`

	Contributors          []string `json:"contributors,omitempty"`
	Coverage              []string `json:"coverage,omitempty"`
	Creators              []string `json:"creators,omitempty"`
	Descriptions          []string `json:"descriptions,omitempty"`
	DOI                   []string `json:"doi,omitempty"`
	Dates                 []string `json:"dates,omitempty"`
	Formats               []string `json:"formats,omitempty"`
	ISSN                  []string `json:"issn,omitempty"`
	IdentifierURIFulltext []string `json:"fulltext_uri,omitempty"`
	Identifiers           []string `json:"ids,omitempty"` // TODO: make this more granular
	Languages             []string `json:"languages,omitempty"`
	Links                 []string `json:"urls,omitempty"`
	Publishers            []string `json:"publishers,omitempty"`
	Relations             []string `json:"relations,omitempty"`
	Rights                []string `json:"rights,omitempty"`
	Sources               []string `json:"sources,omitempty"`
	Subjects              []string `json:"subjects,omitempty"`
	Titles                []string `json:"titles,omitempty"`
	Types                 []string `json:"types,omitempty"`
}

// extractInfo flattens the record into an Info value.
//
// Header fields are copied as they are. For every repeatable metadata element the
// non-empty text values are collected in document order with duplicates removed.
// Identifier and source values are additionally classified: values starting with
// "http" are recorded as links, otherwise the first DOI-like or, failing that, the
// first ISSN-like substring is recorded. A dedicated <doi> element, if present, is
// listed before any DOI found that way.
//
// The returned error is currently always nil; it is kept so callers do not need to
// change when extraction gains failure modes.
func (record *Record) extractInfo() (*Info, error) {
	// Take a pointer: the metadata struct is large and is only read here.
	dc := &record.Metadata.Dc
	info := &Info{
		OAI:                   record.Header.Identifier,
		Status:                record.Header.Status,
		Datestamp:             record.Header.Datestamp,
		Sets:                  record.Header.SetSpec,
		IdentifierURIFulltext: dc.IdentifierURIFulltext,
	}
	for _, v := range dc.Contributor {
		info.Contributors = appendNonEmpty(info.Contributors, v.Text)
	}
	for _, v := range dc.Coverage {
		info.Coverage = appendNonEmpty(info.Coverage, v.Text)
	}
	for _, v := range dc.Creator {
		info.Creators = appendNonEmpty(info.Creators, v.Text)
	}
	for _, v := range dc.Date {
		info.Dates = appendNonEmpty(info.Dates, v.Text)
	}
	for _, v := range dc.Description {
		info.Descriptions = appendNonEmpty(info.Descriptions, v.Text)
	}
	for _, v := range dc.Format {
		info.Formats = appendNonEmpty(info.Formats, v.Text)
	}
	for _, v := range dc.Identifier {
		info.Identifiers = appendNonEmpty(info.Identifiers, v.Text)
	}
	for _, v := range dc.Language {
		info.Languages = appendNonEmpty(info.Languages, v.Text)
	}
	for _, v := range dc.Publisher {
		info.Publishers = appendNonEmpty(info.Publishers, v.Text)
	}
	for _, v := range dc.Relation {
		info.Relations = appendNonEmpty(info.Relations, v.Text)
	}
	for _, v := range dc.Rights {
		info.Rights = appendNonEmpty(info.Rights, v.Text)
	}
	for _, v := range dc.Source {
		info.Sources = appendNonEmpty(info.Sources, v.Text)
	}
	for _, v := range dc.Subject {
		info.Subjects = appendNonEmpty(info.Subjects, v.Text)
	}
	for _, v := range dc.Title {
		info.Titles = appendNonEmpty(info.Titles, v.Text)
	}
	for _, v := range dc.Type {
		info.Types = appendNonEmpty(info.Types, v.Text)
	}
	// The explicit <doi> element goes first, before DOIs found in other fields.
	info.DOI = appendNonEmpty(info.DOI, dc.Doi)
	// Find URL, DOI, ISSN, and other structured data.
	info.addStructured(info.Identifiers)
	info.addStructured(info.Sources)
	return info, nil
}

// appendNonEmpty adds v to ss unless v is empty or already present.
func appendNonEmpty(ss []string, v string) []string {
	if v == "" {
		return ss
	}
	return appendUnique(ss, v)
}

// addStructured inspects free-form values and records each one as a link, a DOI or an
// ISSN, in that order of precedence; values matching none of the three are ignored.
func (info *Info) addStructured(values []string) {
	for _, v := range values {
		if strings.HasPrefix(v, "http") {
			info.Links = appendUnique(info.Links, v)
			continue
		}
		// Neither pattern can match the empty string, so "" means "no match".
		if doi := doiRe.FindString(v); doi != "" {
			info.DOI = appendUnique(info.DOI, doi)
			continue
		}
		if issn := issnRe.FindString(v); issn != "" {
			info.ISSN = appendUnique(info.ISSN, issn)
		}
	}
}

// appendUnique returns ss with v appended, unless ss already contains v, in which
// case ss is returned unchanged. The order of existing elements is preserved.
func appendUnique(ss []string, v string) []string {
	for i := range ss {
		if ss[i] == v {
			return ss
		}
	}
	return append(ss, v)
}

// main reads XML from stdin, writes one JSON document per record to stdout and
// finally logs processing statistics (counts, elapsed seconds, records per second)
// as a JSON object.
func main() {
	flag.Parse()
	started := time.Now()
	stats, err := processRecords(os.Stdin, os.Stdout)
	if err != nil {
		log.Fatal(err)
	}
	stats["elapsed_s"] = int(time.Since(started).Seconds())
	if stats["elapsed_s"] > 0 {
		stats["rps"] = stats["total"] / stats["elapsed_s"]
	}
	b, err := json.Marshal(stats)
	if err != nil {
		log.Fatal(err)
	}
	log.Println(string(b))
}

// processRecords scans r for record elements and writes the result to w.
//
// In debug mode (-d) only the identifier of each record is printed. Otherwise each
// record is converted with extractInfo and encoded as one line of JSON; records
// flagged as deleted are dropped when -D is set, and records having neither a title
// nor a header identifier are skipped.
//
// The returned map holds the counters "total", "skipped", "deleted" and "encoded"
// (plus zeroed "elapsed_s" and "rps" entries for the caller to fill in). Output is
// buffered; the buffer is flushed on every return path, so records encoded before a
// failure still reach w.
func processRecords(r io.Reader, w io.Writer) (stats map[string]int, err error) {
	bw := bufio.NewWriter(w)
	defer func() {
		// Report a failed flush unless an earlier error is already being returned.
		if ferr := bw.Flush(); ferr != nil && err == nil {
			err = ferr
		}
	}()
	enc := json.NewEncoder(bw)
	stats = map[string]int{
		"total":     0,
		"skipped":   0,
		"deleted":   0,
		"encoded":   0,
		"elapsed_s": 0,
		"rps":       0,
	}
	scanner := NewScanner(r, new(Record))
	for scanner.Scan() {
		e, ok := scanner.Element().(*Record)
		if !ok {
			continue
		}
		stats["total"]++
		if *debug {
			fmt.Fprintf(bw, "found record: %v\n", e.Header.Identifier)
			continue
		}
		if *skipDeleted && e.Header.Status == "deleted" {
			stats["deleted"]++
			continue
		}
		info, err := e.extractInfo()
		if err != nil {
			// Check the error before touching info, which may be nil here.
			return stats, err
		}
		if len(info.Titles) == 0 && len(e.Header.Identifier) == 0 {
			stats["skipped"]++
			continue
		}
		if err := enc.Encode(info); err != nil {
			return stats, err
		}
		stats["encoded"]++
	}
	if err := scanner.Err(); err != nil {
		return stats, fmt.Errorf("error while scanning XML: %v", err)
	}
	return stats, nil
}