-
Notifications
You must be signed in to change notification settings - Fork 7
/
record.go
321 lines (296 loc) · 10.5 KB
/
record.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
package thieme
import (
"encoding/base64"
"encoding/xml"
"fmt"
"strings"
"time"
"github.com/kennygrant/sanitize"
"github.com/miku/span"
"github.com/miku/span/assetutil"
"github.com/miku/span/formats/finc"
)
// Fixed values that Record.ToIntermediateSchema copies onto every converted
// record.
const (
// SourceID is the value assigned to the SourceID field of converted records.
SourceID = "60"
// Format is the value assigned to the Format field of converted records.
Format = "ElectronicArticle"
// Collection is the first entry of the MegaCollections field of converted records.
Collection = "Thieme Journals"
// Genre is the value assigned to the Genre field of converted records.
Genre = "article"
// DefaultRefType is the value assigned to the RefType field of converted records.
DefaultRefType = "EJOUR"
)
// leftPad returns s brought to exactly overallLen bytes by prepending padStr.
//
// If s is shorter than overallLen, padStr is repeated in front of s and the
// padding is cut on its left side so that the result has the requested
// length (for a multi-byte padStr the padding therefore ends on a complete
// padStr). If s is already longer than overallLen, only the rightmost
// overallLen bytes of s are kept. Lengths are counted in bytes, not runes.
//
// Degenerate arguments do not panic: a non-positive overallLen yields the
// empty string, and an empty padStr leaves a too-short s unchanged because
// there is nothing to pad with.
func leftPad(s string, padStr string, overallLen int) string {
	if overallLen <= 0 {
		return ""
	}
	if len(s) >= overallLen {
		// Nothing to pad; keep the historical behaviour of trimming from the left.
		return s[len(s)-overallLen:]
	}
	if padStr == "" {
		return s
	}
	missing := overallLen - len(s)
	// Round up so the repeated padding always covers the missing bytes.
	reps := (missing + len(padStr) - 1) / len(padStr)
	padding := strings.Repeat(padStr, reps)
	return padding[len(padding)-missing:] + s
}
// LanguageMap maps two letter codes and full names to three letter codes.
// It is populated at package initialization from the asset
// "assets/doaj/language-iso-639-3.json". Record.ToIntermediateSchema queries
// it with the upper-cased article language and "und" as second argument.
// NOTE(review): the Must prefix suggests a panic if the asset cannot be
// loaded, and "und" presumably acts as the fallback value — confirm in assetutil.
var LanguageMap = assetutil.MustLoadStringMap("assets/doaj/language-iso-639-3.json")
// Record was generated 2018-02-15 13:37:41 by tir on hayiti.
//
// It mirrors one <record> XML element consisting of a header, a metadata
// section holding article elements and an about section. The trailing
// comments on leaf fields are sample values taken from the data the struct
// was generated from. Every element carries a Text field for its character
// data.
type Record struct {
XMLName xml.Name `xml:"record"`
Text string `xml:",chardata"`
// Header carries the status attribute plus identifier, datestamp and setSpec.
Header struct {
Text string `xml:",chardata"`
Status string `xml:"status,attr"`
Identifier struct {
Text string `xml:",chardata"` // 10.1055-s-0029-1195170, 1...
} `xml:"identifier"`
Datestamp struct {
Text string `xml:",chardata"` // 2013-03-13T06:02:46Z, 201...
} `xml:"datestamp"`
SetSpec struct {
Text string `xml:",chardata"` // journalarticles, journala...
} `xml:"setSpec"`
} `xml:"header"`
// Metadata wraps the article elements. Date and ToIntermediateSchema only
// look at the first entry of Article.
Metadata struct {
Text string `xml:",chardata"`
Article []struct {
Text string `xml:",chardata"`
Xsi string `xml:"xsi,attr"`
NoNamespaceSchemaLocation string `xml:"noNamespaceSchemaLocation,attr"`
// Lang is looked up in LanguageMap by ToIntermediateSchema.
Lang string `xml:"lang,attr"`
ArticleType string `xml:"article-type,attr"`
Front struct {
Text string `xml:",chardata"`
// JournalMeta describes the journal: id, title, ISSNs and publisher.
JournalMeta struct {
Text string `xml:",chardata"`
JournalID struct {
Text string `xml:",chardata"`
} `xml:"journal-id"`
JournalTitleGroup struct {
Text string `xml:",chardata"`
JournalTitle struct {
Text string `xml:",chardata"` // Dtsch med Wochenschr, Dts...
} `xml:"journal-title"`
} `xml:"journal-title-group"`
// ISSN entries are told apart by their pub-type attribute;
// ToIntermediateSchema handles "print" and "e-issn".
ISSN []struct {
Text string `xml:",chardata"` // 0012-0472, 1439-4413, 001...
PubType string `xml:"pub-type,attr"`
} `xml:"issn"`
Publisher struct {
Text string `xml:",chardata"`
PublisherName struct {
Text string `xml:",chardata"` // Georg Thieme Verlag Stutt...
} `xml:"publisher-name"`
} `xml:"publisher"`
} `xml:"journal-meta"`
// ArticleMeta describes the article itself.
ArticleMeta struct {
Text string `xml:",chardata"`
// ArticleID is only accepted by ToIntermediateSchema when its
// pub-id-type attribute is "doi".
ArticleID struct {
Text string `xml:",chardata"` // 10.1055/s-0029-1195170, 1...
PubIDType string `xml:"pub-id-type,attr"`
} `xml:"article-id"`
ArticleCategories struct {
Text string `xml:",chardata"`
SubjGroup struct {
Text string `xml:",chardata"`
Subject struct {
Text string `xml:",chardata"` // Feuilleton, Medicinal - B...
} `xml:"subject"`
} `xml:"subj-group"`
} `xml:"article-categories"`
TitleGroup struct {
Text string `xml:",chardata"`
ArticleTitle struct {
Text string `xml:",chardata"` // Weitere Beobachtungen üb...
Lang string `xml:"lang,attr"`
} `xml:"article-title"`
TransTitleGroup struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
TransTitle struct {
Text string `xml:",chardata"` // Teonanacatl and Psilocybi...
Lang string `xml:"lang,attr"`
} `xml:"trans-title"`
} `xml:"trans-title-group"`
} `xml:"title-group"`
// ContribGroup lists contributors; ToIntermediateSchema turns each
// entry's given names and surname into an author.
ContribGroup struct {
Text string `xml:",chardata"`
Contrib []struct {
Text string `xml:",chardata"`
Name struct {
Text string `xml:",chardata"`
Surname struct {
Text string `xml:",chardata"` // Riess, Freyer, Riess, Auf...
} `xml:"surname"`
GivenNames struct {
Text string `xml:",chardata"` // L., T., L., E., E., v., C...
} `xml:"given-names"`
Suffix struct {
Text string `xml:",chardata"` // Sir, Sir, Sir, Sir, Sir, ...
} `xml:"suffix"`
} `xml:"name"`
Aff struct {
Text string `xml:",chardata"`
Institution struct {
Text string `xml:",chardata"` // I. Aus der inneren Abthei...
} `xml:"institution"`
} `xml:"aff"`
Collab struct {
Text string `xml:",chardata"` // for the Eunice Kennedy Sh...
} `xml:"collab"`
} `xml:"contrib"`
} `xml:"contrib-group"`
// PubDate keeps year, month and day as raw strings; Date pads and
// parses them.
PubDate struct {
Text string `xml:",chardata"`
PubType string `xml:"pub-type,attr"`
Month struct {
Text string `xml:",chardata"` // 12, 12, 12, 12, 12, 12, 1...
} `xml:"month"`
Year struct {
Text string `xml:",chardata"` // 1879, 1879, 1879, 1879, 1...
} `xml:"year"`
Day struct {
Text string `xml:",chardata"` // 31, 31, 31, 31, 31, 31, 3...
} `xml:"day"`
} `xml:"pub-date"`
Volume struct {
Text string `xml:",chardata"` // 5, 5, 5, 5, 5, 5, 5, 5, 5...
} `xml:"volume"`
Issue struct {
Text string `xml:",chardata"` // 52, 52, 52, 52, 52, 52, 5...
} `xml:"issue"`
Fpage struct {
Text string `xml:",chardata"` // 663, 667, 667, 669, 674, ...
} `xml:"fpage"`
Lpage struct {
Text string `xml:",chardata"` // 667, 667, 669, 669, 674, ...
} `xml:"lpage"`
// Abstract captures the raw inner XML of the element (not just its
// character data); ToIntermediateSchema passes it through sanitize.HTML.
Abstract struct {
Text string `xml:",innerxml"`
Lang string `xml:"lang,attr"`
} `xml:"abstract"`
TransAbstract struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
P struct {
Text string `xml:",chardata"` // Die zweite von Tschernogu...
} `xml:"p"`
} `xml:"trans-abstract"`
KwdGroup []struct {
Text string `xml:",chardata"`
Lang string `xml:"lang,attr"`
Kwd []struct {
Text string `xml:",chardata"` // Intra-operative vascular ...
} `xml:"kwd"`
} `xml:"kwd-group"`
Supplement struct {
Text string `xml:",chardata"` // S 01, S 01, S 01, S 01, S...
} `xml:"supplement"`
} `xml:"article-meta"`
} `xml:"front"`
} `xml:"article"`
} `xml:"metadata"`
About struct {
Text string `xml:",chardata"`
} `xml:"about"`
}
// Date derives the publication date from the pub-date of the record's first
// article.
//
// A month or day given as "0" is treated as "01". Missing parts default to
// the first month and first day respectively: a year and month yield the
// first of that month, a year alone (or a year with only a day) yields
// January 1st. All parts are zero-padded before parsing. An error is
// returned if the record has no article, if no year is present, or if the
// assembled string does not parse as a date.
func (record Record) Date() (time.Time, error) {
	if len(record.Metadata.Article) == 0 {
		return time.Time{}, fmt.Errorf("empty record")
	}
	const layout = "2006-01-02"

	pub := record.Metadata.Article[0].Front.ArticleMeta.PubDate
	year, month, day := pub.Year.Text, pub.Month.Text, pub.Day.Text
	if month == "0" {
		month = "01"
	}
	if day == "0" {
		day = "01"
	}

	switch {
	case year == "":
		return time.Time{}, fmt.Errorf("invalid date")
	case month == "":
		// Without a month the day is ignored as well.
		return time.Parse(layout, fmt.Sprintf("%s-01-01", leftPad(year, "0", 4)))
	case day == "":
		return time.Parse(layout, fmt.Sprintf("%s-%s-01", leftPad(year, "0", 4), leftPad(month, "0", 2)))
	default:
		return time.Parse(layout, fmt.Sprintf("%s-%s-%s",
			leftPad(year, "0", 4),
			leftPad(month, "0", 2),
			leftPad(day, "0", 2)))
	}
}
// ToIntermediateSchema converts a single record into an intermediate schema
// document.
//
// Only the first article of the record is used. A span.Skip error is
// returned when the record has no article, when no publication date can be
// derived, or when the publisher name is empty. A plain error is returned
// when an ISSN has a pub-type other than "print" or "e-issn", or when the
// article id is not of type "doi". Except for the "no article" case, the
// partially filled document is returned together with the error.
func (record Record) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	// Name that the known Thieme publisher spellings are consolidated into.
	const canonicalPublisher = "Georg Thieme Verlag Stuttgart, New York"

	output := finc.NewIntermediateSchema()
	if len(record.Metadata.Article) == 0 {
		return nil, span.Skip{Reason: "no article found"}
	}
	article := &record.Metadata.Article[0]
	journalMeta := &article.Front.JournalMeta
	articleMeta := &article.Front.ArticleMeta

	date, err := record.Date()
	if err != nil {
		return output, span.Skip{Reason: err.Error()}
	}
	output.Date = date
	output.RawDate = date.Format("2006-01-02")

	output.SourceID = SourceID
	output.Format = Format
	output.MegaCollections = []string{Collection, "sid-60-col-thiemejournals"}
	output.Genre = Genre
	output.RefType = DefaultRefType

	output.JournalTitle = journalMeta.JournalTitleGroup.JournalTitle.Text
	output.ArticleTitle = articleMeta.TitleGroup.ArticleTitle.Text
	output.StartPage = articleMeta.Fpage.Text
	output.EndPage = articleMeta.Lpage.Text
	output.Volume = articleMeta.Volume.Text
	output.Issue = articleMeta.Issue.Text
	// The abstract is captured as raw inner XML, so markup is stripped here.
	output.Abstract = sanitize.HTML(articleMeta.Abstract.Text)

	// Partial publisher name consolidation, https://git.io/vxSFX.
	//
	// Matching is done on a whitespace-normalized copy (runs of any Unicode
	// whitespace, including non-breaking spaces, collapsed to one space), so
	// spellings that differ only in invisible whitespace do not need their
	// own, visually identical entries. A name that does not match is passed
	// on unchanged apart from the newline replacement.
	publisher := strings.Replace(journalMeta.Publisher.PublisherName.Text, "\n", " ", -1)
	switch strings.Join(strings.Fields(publisher), " ") {
	case "© Georg Thieme Verlag KG",
		"© Georg Thieme Verlag",
		"Georg Thieme Verlag KG",
		"Georg Thieme Verlag, Stuttgart",
		"Thieme Publicações Ltda":
		publisher = canonicalPublisher
	}
	if publisher == "" {
		return output, span.Skip{Reason: "empty publisher string"}
	}
	output.Publishers = append(output.Publishers, publisher)

	for i := range journalMeta.ISSN {
		issn := &journalMeta.ISSN[i]
		switch issn.PubType {
		case "print":
			output.ISSN = append(output.ISSN, issn.Text)
		case "e-issn":
			output.EISSN = append(output.EISSN, issn.Text)
		default:
			return output, fmt.Errorf("unhandled issn type: %s", issn.PubType)
		}
	}

	// The DOI doubles as record id and is the basis of the document id.
	if articleMeta.ArticleID.PubIDType != "doi" {
		return output, fmt.Errorf("unknown id type: %s", articleMeta.ArticleID.PubIDType)
	}
	output.DOI = articleMeta.ArticleID.Text
	output.RecordID = output.DOI
	output.ID = fmt.Sprintf("ai-%s-%s", SourceID, base64.RawURLEncoding.EncodeToString([]byte(output.DOI)))

	// Left nil when there are no contributors, so Authors stays nil as well.
	var authors []finc.Author
	for i := range articleMeta.ContribGroup.Contrib {
		name := &articleMeta.ContribGroup.Contrib[i].Name
		authors = append(authors, finc.Author{
			FirstName: name.GivenNames.Text,
			LastName:  name.Surname.Text,
		})
	}
	output.Authors = authors

	if subject := strings.TrimSpace(articleMeta.ArticleCategories.SubjGroup.Subject.Text); subject != "" {
		output.Subjects = append(output.Subjects, subject)
	}

	// refs #12965
	if article.Lang != "" {
		output.Languages = append(output.Languages, LanguageMap.Lookup(strings.ToUpper(article.Lang), "und"))
	}
	return output, nil
}