/
degruyter.py
77 lines (61 loc) · 2.19 KB
/
degruyter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# coding: utf-8
"""
DeGruyter task.
[degruyter]
ftp-host = host.name
ftp-username = username
ftp-password = password
ftp-path = /
ftp-pattern = some*glob*pattern.zip
"""
from gluish.benchmark import timed
from gluish.common import FTPMirror
from gluish.format import TSV
from gluish.intervals import daily
from gluish.parameter import ClosestDateParameter
from gluish.path import iterfiles
from gluish.utils import shellout
from siskin.configuration import Config
from siskin.task import DefaultTask
import datetime
import luigi
import re
import shutil
import tempfile
# Shared configuration handle; tasks in this module read the [degruyter]
# section (see module docstring) from it.
config = Config.instance()
class DegruyterTask(DefaultTask):
    """ Common base class for all Degruyter tasks. """
    TAG = 'degruyter'

    def closest(self):
        """
        Return the fixed date 2015-04-01.

        NOTE(review): presumably consulted in place of the `date` parameter
        when task paths are built -- confirm against DefaultTask.
        """
        return datetime.date(year=2015, month=4, day=1)
class DegruyterPaths(DegruyterTask):
    """ A list of Degruyter file paths (via FTP). """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """ Require an FTP mirror configured from the [degruyter] section. """
        section = 'degruyter'
        return FTPMirror(
            host=config.get(section, 'ftp-host'),
            username=config.get(section, 'ftp-username'),
            password=config.get(section, 'ftp-password'),
            base=config.get(section, 'ftp-path'),
            pattern=config.get(section, 'ftp-pattern'),
        )

    @timed
    def run(self):
        """ Move the input target to this task's output path. """
        mirrored = self.input()
        destination = self.output().path
        mirrored.move(destination)

    def output(self):
        """ Return a local target with extension 'filelist' in TSV format. """
        target_path = self.path(ext="filelist")
        return luigi.LocalTarget(path=target_path, format=TSV)
class DegruyterXML(DegruyterTask):
    """
    Concatenate the XML members of the mirrored Degruyter zip files.

    Only entries of the DegruyterPaths file list whose path contains
    '/SSH/' are processed.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """ Require the file list for the same date. """
        return DegruyterPaths(date=self.date)

    @timed
    def run(self):
        """
        Append all `*.xml` members of every selected archive to a temporary
        file, then move that file to the output path.
        """
        # Reserve a temporary file name. The context manager closes the
        # handle immediately (a bare mkstemp() would leave its descriptor
        # open), while delete=False keeps the file on disk for the shell
        # commands below.
        with tempfile.NamedTemporaryFile(prefix='siskin-', delete=False) as tmp:
            stopover = tmp.name

        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path',)):
                if '/SSH/' not in row.path:
                    continue
                # Raw string: the backslash has to reach the shell so that
                # unzip, not the shell, expands the *.xml pattern.
                # ignoremap presumably makes shellout tolerate exit code 1
                # (unzip: warnings only, processing completed) -- confirm
                # against gluish.utils.shellout.
                shellout(r"unzip -p {path} \*.xml 2> /dev/null >> {output}",
                         output=stopover, path=row.path, ignoremap={1: 'OK'})

        # LocalTarget instead of the deprecated luigi.File alias, matching
        # the targets used elsewhere in this module.
        luigi.LocalTarget(stopover).move(self.output().path)

    def output(self):
        """
        Return a local target with extension 'xml'.

        NOTE(review): format=TSV on an XML target looks carried over from
        the file list task -- confirm it is intended.
        """
        return luigi.LocalTarget(path=self.path(ext='xml'), format=TSV)