/
archivingfs.py
117 lines (98 loc) · 4.17 KB
/
archivingfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Datastore based on files written to the local filesystem, archived in gzipped
tar files, then retrieved from the tar files.
:copyright: Copyright 2006-2015 by the Sumatra team, see doc/authors.txt
:license: BSD 2-clause, see LICENSE for details.
"""
from __future__ import with_statement
from __future__ import unicode_literals
import os
import tarfile
import shutil
import logging
import mimetypes
import datetime
from contextlib import closing # needed for Python 2.6
from sumatra.core import TIMESTAMP_FORMAT, component
from .base import DataItem
from .filesystem import FileSystemDataStore
class ArchivedDataFile(DataItem):
    """A file-like object, that represents a file inside a tar archive"""
    # current implementation just for real files

    def __init__(self, path, store, creation=None):
        """
        path -- path of the file inside the archive; its first component is
                the archive label, i.e. "<label>/<relative path>".
        store -- a data store providing `archive_store`, the directory in
                 which the "<label>.tar.gz" archives live.
        creation -- optional datetime; if not given, the member's mtime from
                    the tar header is used, truncated to whole seconds.
        """
        self.path = path
        # the first path component names the archive this file lives in
        archive_label = self.path.split(os.path.sep)[0]
        self.tarfile_path = os.path.join(store.archive_store, archive_label + ".tar.gz")
        info = self._get_info()
        self.size = info.size
        self.creation = creation or datetime.datetime.fromtimestamp(info.mtime).replace(microsecond=0)
        self.name = os.path.basename(self.path)
        # NOTE(review): os.path.splitext returns a (root, ext) tuple, so
        # self.extension holds a tuple rather than the extension string.
        # Kept as-is for consistency with the base DataItem classes -- confirm
        # against .base/.filesystem before changing.
        self.extension = os.path.splitext(self.name)
        self.mimetype, self.encoding = mimetypes.guess_type(self.path)

    def _get_info(self):
        """Return the TarInfo header describing this file within its archive."""
        with closing(tarfile.open(self.tarfile_path, 'r')) as data_archive:
            return data_archive.getmember(self.path)

    def get_content(self, max_length=None):
        """
        Return the file's content read from the archive.

        max_length -- if not None, read at most this many bytes
                      (0 now correctly returns an empty string/bytes,
                      where previously it read the whole file).
        """
        with closing(tarfile.open(self.tarfile_path, 'r')) as data_archive:
            # closing() guarantees the extracted member handle is released
            # even if read() raises (the original leaked it on error)
            with closing(data_archive.extractfile(self.path)) as f:
                if max_length is not None:
                    return f.read(max_length)
                else:
                    return f.read()
    content = property(fget=get_content)

    @property
    def sorted_content(self):
        raise NotImplementedError
@component
class ArchivingFileSystemDataStore(FileSystemDataStore):
    """
    Represents a locally-mounted filesystem that archives any new files created
    in it. The root of the data store will generally be a subdirectory of the
    real filesystem.
    """
    data_item_class = ArchivedDataFile

    def __init__(self, root, archive=".smt/archive"):
        """
        root -- directory in which new data files appear.
        archive -- directory in which the gzipped tar archives are kept.
        """
        super(ArchivingFileSystemDataStore, self).__init__(root)
        self.archive_store = archive
        # should allow specification of archive format, e.g. tar.gz or zip

    def __getstate__(self):
        return {'root': self.root, 'archive': self.archive_store}

    def find_new_data(self, timestamp):
        """Finds newly created/changed data items"""
        new_files = self._find_new_data_files(timestamp)
        label = timestamp.strftime(TIMESTAMP_FORMAT)
        archive_paths = self._archive(label, new_files)
        return [ArchivedDataFile(path, self).generate_key()
                for path in archive_paths]

    def _archive(self, label, files, delete_originals=True):
        """
        Archives files and, by default, deletes the originals.

        label -- basename (without extension) of the archive to create.
        files -- paths of the files to archive, relative to self.root.
        delete_originals -- if true, remove the originals once archived.

        Returns the paths of the files inside the archive
        ("<label>/<relative path>").
        """
        if not os.path.exists(self.archive_store):
            # makedirs rather than mkdir, so the nested default ".smt/archive"
            # can be created even when ".smt" does not exist yet
            os.makedirs(self.archive_store)
        # Create the archive directly at its final location. The original
        # implementation wrote to the working directory, then copied and
        # removed, because shutil.move() misbehaves when dataroot is a
        # symbolic link; writing in place avoids the move entirely.
        tarfile_path = os.path.join(self.archive_store, label + ".tar.gz")
        archive_paths = []
        # closing() ensures the tar is finalized even if tf.add() raises
        with closing(tarfile.open(tarfile_path, 'w:gz')) as tf:
            logging.info("Archiving data to file %s", tf.name)
            # Add data files
            for file_path in files:
                archive_path = os.path.join(label, file_path)
                tf.add(os.path.join(self.root, file_path), archive_path)
                archive_paths.append(archive_path)
        # Delete original files.
        if delete_originals:
            for file_path in files:
                os.remove(os.path.join(self.root, file_path))
        self._last_label = label  # useful for testing
        return archive_paths

    def delete(self, *keys):
        """Delete the files corresponding to the given keys."""
        raise NotImplementedError("Deletion of individual files not supported.")

    def contains_path(self, path):
        raise NotImplementedError