Skip to content

Commit

Permalink
Compare the hash of files with identical mtime/size #5589
Browse files Browse the repository at this point in the history
* For conflicts where mtime and size are identical:

  a) If there's no remote checksum, skip (unchanged)
  b) If there's a remote checksum that's a useful hash, create a
     PropagateDownload job and compute the local hash. If the hashes
     are identical, don't download the file and just update metadata.

* Avoid exposing the existence of checksumTypeId beyond the database
  layer. This makes handling checksums easier in general because they
  can usually be treated as a single blob.

  This change was prompted by the difficulty of producing file_stat_t
  entries uniformly from PROPFINDs and the database.
  • Loading branch information
ckamm committed Jun 15, 2017
1 parent d50d8b8 commit 8160963
Show file tree
Hide file tree
Showing 25 changed files with 345 additions and 128 deletions.
5 changes: 2 additions & 3 deletions csync/src/csync.c
Original file line number Diff line number Diff line change
Expand Up @@ -376,8 +376,7 @@ static int _csync_treewalk_visitor(void *obj, void *data) {

trav.error_status = cur->error_status;
trav.has_ignored_files = cur->has_ignored_files;
trav.checksum = cur->checksum;
trav.checksumTypeId = cur->checksumTypeId;
trav.checksumHeader = cur->checksumHeader;

if( other_node ) {
csync_file_stat_t *other_stat = (csync_file_stat_t*)other_node->data;
Expand Down Expand Up @@ -670,7 +669,7 @@ void csync_file_stat_free(csync_file_stat_t *st)
SAFE_FREE(st->directDownloadCookies);
SAFE_FREE(st->etag);
SAFE_FREE(st->destpath);
SAFE_FREE(st->checksum);
SAFE_FREE(st->checksumHeader);
SAFE_FREE(st);
}
}
11 changes: 7 additions & 4 deletions csync/src/csync.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,10 @@ struct csync_vio_file_stat_s {
enum csync_vio_file_flags_e flags;

char *original_name; // only set if locale conversion fails

// For remote file stats: the highest quality checksum the server provided
// in the "SHA1:324315da2143" form.
char *checksumHeader;
};

csync_vio_file_stat_t OCSYNC_EXPORT *csync_vio_file_stat_new(void);
Expand Down Expand Up @@ -262,8 +266,7 @@ struct csync_tree_walk_file_s {
char *directDownloadUrl;
char *directDownloadCookies;

const char *checksum;
uint32_t checksumTypeId;
const char *checksumHeader;

struct {
int64_t size;
Expand Down Expand Up @@ -304,8 +307,8 @@ typedef int (*csync_vio_stat_hook) (csync_vio_handle_t *dhhandle,
void *userdata);

/* Compute the checksum of the given \a checksumTypeId for \a path. */
typedef const char* (*csync_checksum_hook) (
const char *path, uint32_t checksumTypeId, void *userdata);
typedef const char *(*csync_checksum_hook)(
const char *path, const char *otherChecksumHeader, void *userdata);

/**
* @brief Allocate a csync context.
Expand Down
7 changes: 5 additions & 2 deletions csync/src/csync_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,11 @@ struct csync_file_stat_s {
char *directDownloadCookies;
char remotePerm[REMOTE_PERM_BUF_SIZE+1];

const char *checksum;
uint32_t checksumTypeId;
// In the local tree, this can hold a checksum and its type if it is
// computed during discovery for some reason.
// In the remote tree, this will have the server checksum, if available.
// In both cases, the format is "SHA1:baff".
const char *checksumHeader;

CSYNC_STATUS error_status;

Expand Down
41 changes: 37 additions & 4 deletions csync/src/csync_reconcile.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,19 @@ static c_rbnode_t *_csync_check_ignored(c_rbtree_t *tree, const char *path, int
}
}

/* Returns true if we're reasonably certain that hash equality
* for the header means content equality.
*
* Cryptographic safety is not required - this is mainly
* intended to rule out checksums like Adler32 that are not intended for
* hashing and have high likelihood of collision with particular inputs.
*/
static bool _csync_is_collision_safe_hash(const char *checksum_header)
{
return strncmp(checksum_header, "SHA1:", 5) == 0
|| strncmp(checksum_header, "MD5:", 4) == 0;
}

/**
* The main function in the reconcile pass.
*
Expand Down Expand Up @@ -272,11 +285,31 @@ static int _csync_merge_algorithm_visitor(void *obj, void *data) {
// Folders of the same path are always considered equals
is_conflict = false;
} else {
// If the size or mtime is different, it's definitely a conflict.
is_conflict = ((other->size != cur->size) || (other->modtime != cur->modtime));
// FIXME: do a binary comparision of the file here because of the following
// edge case:
// The files could still have different content, even though the mtime
// and size are the same.

// It could be a conflict even if size and mtime match!
//
// In older client versions we always treated these cases as a
// non-conflict. This behavior is preserved in case the server
// doesn't provide a suitable content hash.
//
// When it does have one, however, we do create a job, but the job
// will compare hashes and avoid the download if they are equal.
const char *remoteChecksumHeader =
(ctx->current == REMOTE_REPLICA ? cur->checksumHeader : other->checksumHeader);
if (remoteChecksumHeader) {
is_conflict |= _csync_is_collision_safe_hash(remoteChecksumHeader);
}

// SO: If there is no checksum, we can have !is_conflict here
// even though the files have different content! This is an
// intentional tradeoff. Downloading and comparing files would
// be technically correct in this situation but leads to too
// much waste.
// In particular this kind of NEW/NEW situation with identical
// sizes and mtimes pops up when the local database is lost for
// whatever reason.
}
if (ctx->current == REMOTE_REPLICA) {
// If the files are considered equal, only update the DB with the etag from remote
Expand Down
20 changes: 12 additions & 8 deletions csync/src/csync_statedb.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,12 @@ int csync_statedb_close(CSYNC *ctx) {
return rc;
}

#define METADATA_COLUMNS "phash, pathlen, path, inode, uid, gid, mode, modtime, type, md5, fileid, remotePerm, filesize, ignoredChildrenRemote, contentChecksum, contentChecksumTypeId"
#define METADATA_QUERY \
"phash, pathlen, path, inode, uid, gid, mode, modtime, type, md5, fileid, remotePerm, " \
"filesize, ignoredChildrenRemote, " \
"contentchecksumtype.name || ':' || contentChecksum " \
"FROM metadata " \
"LEFT JOIN checksumtype as contentchecksumtype ON metadata.contentChecksumTypeId == contentchecksumtype.id"

// This funciton parses a line from the metadata table into the given csync_file_stat
// structure which it is also allocating.
Expand Down Expand Up @@ -287,9 +292,8 @@ static int _csync_file_stat_from_metadata_table( csync_file_stat_t **st, sqlite3
if(column_count > 13) {
(*st)->has_ignored_files = sqlite3_column_int(stmt, 13);
}
if(column_count > 15 && sqlite3_column_int(stmt, 15)) {
(*st)->checksum = c_strdup( (char*) sqlite3_column_text(stmt, 14));
(*st)->checksumTypeId = sqlite3_column_int(stmt, 15);
if (column_count > 14 && sqlite3_column_text(stmt, 14)) {
(*st)->checksumHeader = c_strdup((char *)sqlite3_column_text(stmt, 14));
}

}
Expand All @@ -313,7 +317,7 @@ csync_file_stat_t *csync_statedb_get_stat_by_hash(CSYNC *ctx,
}

if( ctx->statedb.by_hash_stmt == NULL ) {
const char *hash_query = "SELECT " METADATA_COLUMNS " FROM metadata WHERE phash=?1";
const char *hash_query = "SELECT " METADATA_QUERY " WHERE phash=?1";

SQLITE_BUSY_HANDLED(sqlite3_prepare_v2(ctx->statedb.db, hash_query, strlen(hash_query), &ctx->statedb.by_hash_stmt, NULL));
ctx->statedb.lastReturnValue = rc;
Expand Down Expand Up @@ -356,7 +360,7 @@ csync_file_stat_t *csync_statedb_get_stat_by_file_id(CSYNC *ctx,
}

if( ctx->statedb.by_fileid_stmt == NULL ) {
const char *query = "SELECT " METADATA_COLUMNS " FROM metadata WHERE fileid=?1";
const char *query = "SELECT " METADATA_QUERY " WHERE fileid=?1";

SQLITE_BUSY_HANDLED(sqlite3_prepare_v2(ctx->statedb.db, query, strlen(query), &ctx->statedb.by_fileid_stmt, NULL));
ctx->statedb.lastReturnValue = rc;
Expand Down Expand Up @@ -396,7 +400,7 @@ csync_file_stat_t *csync_statedb_get_stat_by_inode(CSYNC *ctx,
}

if( ctx->statedb.by_inode_stmt == NULL ) {
const char *inode_query = "SELECT " METADATA_COLUMNS " FROM metadata WHERE inode=?1";
const char *inode_query = "SELECT " METADATA_QUERY " WHERE inode=?1";

SQLITE_BUSY_HANDLED(sqlite3_prepare_v2(ctx->statedb.db, inode_query, strlen(inode_query), &ctx->statedb.by_inode_stmt, NULL));
ctx->statedb.lastReturnValue = rc;
Expand Down Expand Up @@ -439,7 +443,7 @@ int csync_statedb_get_below_path( CSYNC *ctx, const char *path ) {
* In other words, anything that is between path+'/' and path+'0',
* (because '0' follows '/' in ascii)
*/
const char *below_path_query = "SELECT " METADATA_COLUMNS " FROM metadata WHERE path > (?||'/') AND path < (?||'0') ORDER BY path||'/' ASC";
const char *below_path_query = "SELECT " METADATA_QUERY " WHERE path > (?||'/') AND path < (?||'0') ORDER BY path||'/' ASC";
SQLITE_BUSY_HANDLED(sqlite3_prepare_v2(ctx->statedb.db, below_path_query, -1, &stmt, NULL));
ctx->statedb.lastReturnValue = rc;
if( rc != SQLITE_OK ) {
Expand Down
35 changes: 19 additions & 16 deletions csync/src/csync_update.c
Original file line number Diff line number Diff line change
Expand Up @@ -287,16 +287,15 @@ static int _csync_detect_update(CSYNC *ctx, const char *file,
// Checksum comparison at this stage is only enabled for .eml files,
// check #4754 #4755
bool isEmlFile = csync_fnmatch("*.eml", file, FNM_CASEFOLD) == 0;
if (isEmlFile && fs->size == tmp->size && tmp->checksumTypeId) {
if (isEmlFile && fs->size == tmp->size && tmp->checksumHeader) {
if (ctx->callbacks.checksum_hook) {
st->checksum = ctx->callbacks.checksum_hook(
file, tmp->checksumTypeId,
ctx->callbacks.checksum_userdata);
st->checksumHeader = ctx->callbacks.checksum_hook(
file, tmp->checksumHeader,
ctx->callbacks.checksum_userdata);
}
bool checksumIdentical = false;
if (st->checksum) {
st->checksumTypeId = tmp->checksumTypeId;
checksumIdentical = strncmp(st->checksum, tmp->checksum, 1000) == 0;
if (st->checksumHeader) {
checksumIdentical = strncmp(st->checksumHeader, tmp->checksumHeader, 1000) == 0;
}
if (checksumIdentical) {
CSYNC_LOG(CSYNC_LOG_PRIORITY_TRACE, "NOTE: Checksums are identical, file did not actually change: %s", path);
Expand Down Expand Up @@ -381,15 +380,14 @@ static int _csync_detect_update(CSYNC *ctx, const char *file,


// Verify the checksum where possible
if (isRename && tmp->checksumTypeId && ctx->callbacks.checksum_hook
&& fs->type == CSYNC_VIO_FILE_TYPE_REGULAR) {
st->checksum = ctx->callbacks.checksum_hook(
file, tmp->checksumTypeId,
ctx->callbacks.checksum_userdata);
if (st->checksum) {
CSYNC_LOG(CSYNC_LOG_PRIORITY_TRACE, "checking checksum of potential rename %s %s <-> %s", path, st->checksum, tmp->checksum);
st->checksumTypeId = tmp->checksumTypeId;
isRename = strncmp(st->checksum, tmp->checksum, 1000) == 0;
if (isRename && tmp->checksumHeader && ctx->callbacks.checksum_hook
&& fs->type == CSYNC_VIO_FILE_TYPE_REGULAR) {
st->checksumHeader = ctx->callbacks.checksum_hook(
file, tmp->checksumHeader,
ctx->callbacks.checksum_userdata);
if (st->checksumHeader) {
CSYNC_LOG(CSYNC_LOG_PRIORITY_TRACE, "checking checksum of potential rename %s %s <-> %s", path, st->checksumHeader, tmp->checksumHeader);
isRename = strncmp(st->checksumHeader, tmp->checksumHeader, 1000) == 0;
}
}

Expand Down Expand Up @@ -506,6 +504,11 @@ static int _csync_detect_update(CSYNC *ctx, const char *file,
strncpy(st->remotePerm, fs->remotePerm, REMOTE_PERM_BUF_SIZE);
}

// For the remote: propagate the discovered checksum
if (fs->checksumHeader && ctx->current == REMOTE_REPLICA) {
st->checksumHeader = c_strdup(fs->checksumHeader);
}

st->phash = h;
st->pathlen = len;
memcpy(st->path, (len ? path : ""), len + 1);
Expand Down
4 changes: 4 additions & 0 deletions csync/src/vio/csync_vio_file_stat.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ csync_vio_file_stat_t* csync_vio_file_stat_copy(csync_vio_file_stat_t *file_stat
if (file_stat_cpy->directDownloadUrl) {
file_stat_cpy->directDownloadUrl = c_strdup(file_stat_cpy->directDownloadUrl);
}
if (file_stat_cpy->checksumHeader) {
file_stat_cpy->checksumHeader = c_strdup(file_stat_cpy->checksumHeader);
}
file_stat_cpy->name = c_strdup(file_stat_cpy->name);
return file_stat_cpy;
}
Expand All @@ -57,6 +60,7 @@ void csync_vio_file_stat_destroy(csync_vio_file_stat_t *file_stat) {
SAFE_FREE(file_stat->directDownloadCookies);
SAFE_FREE(file_stat->name);
SAFE_FREE(file_stat->original_name);
SAFE_FREE(file_stat->checksumHeader);
SAFE_FREE(file_stat);
}

Expand Down
48 changes: 24 additions & 24 deletions src/libsync/checksums.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ Q_LOGGING_CATEGORY(lcChecksums, "sync.checksums", QtInfoMsg)

QByteArray makeChecksumHeader(const QByteArray &checksumType, const QByteArray &checksum)
{
if (checksumType.isEmpty() || checksum.isEmpty())
return QByteArray();
QByteArray header = checksumType;
header.append(':');
header.append(checksum);
Expand All @@ -105,6 +107,16 @@ bool parseChecksumHeader(const QByteArray &header, QByteArray *type, QByteArray
return true;
}


QByteArray parseChecksumHeaderType(const QByteArray &header)
{
const auto idx = header.indexOf(':');
if (idx < 0) {
return QByteArray();
}
return header.left(idx);
}

bool uploadChecksumEnabled()
{
static bool enabled = qgetenv("OWNCLOUD_DISABLE_CHECKSUM_UPLOAD").isEmpty();
Expand Down Expand Up @@ -214,39 +226,27 @@ void ValidateChecksumHeader::slotChecksumCalculated(const QByteArray &checksumTy
emit validated(checksumType, checksum);
}

CSyncChecksumHook::CSyncChecksumHook(SyncJournalDb *journal)
: _journal(journal)
CSyncChecksumHook::CSyncChecksumHook()
{
}

const char *CSyncChecksumHook::hook(const char *path, uint32_t checksumTypeId, void *this_obj)
const char *CSyncChecksumHook::hook(const char *path, const char *otherChecksumHeader, void * /*this_obj*/)
{
CSyncChecksumHook *checksumHook = static_cast<CSyncChecksumHook *>(this_obj);
QByteArray checksum = checksumHook->compute(QString::fromUtf8(path), checksumTypeId);
QByteArray type = parseChecksumHeaderType(QByteArray(otherChecksumHeader));
if (type.isEmpty())
return NULL;

QByteArray checksum = ComputeChecksum::computeNow(path, type);
if (checksum.isNull()) {
qCWarning(lcChecksums) << "Failed to compute checksum" << type << "for" << path;
return NULL;
}

char *result = (char *)malloc(checksum.size() + 1);
memcpy(result, checksum.constData(), checksum.size());
result[checksum.size()] = 0;
QByteArray checksumHeader = makeChecksumHeader(type, checksum);
char *result = (char *)malloc(checksumHeader.size() + 1);
memcpy(result, checksumHeader.constData(), checksumHeader.size());
result[checksumHeader.size()] = 0;
return result;
}

QByteArray CSyncChecksumHook::compute(const QString &path, int checksumTypeId)
{
QByteArray checksumType = _journal->getChecksumType(checksumTypeId);
if (checksumType.isEmpty()) {
qCWarning(lcChecksums) << "Checksum type" << checksumTypeId << "not found";
return QByteArray();
}

QByteArray checksum = ComputeChecksum::computeNow(path, checksumType);
if (checksum.isNull()) {
qCWarning(lcChecksums) << "Failed to compute checksum" << checksumType << "for" << path;
return QByteArray();
}

return checksum;
}
}
14 changes: 6 additions & 8 deletions src/libsync/checksums.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ QByteArray makeChecksumHeader(const QByteArray &checksumType, const QByteArray &
/// Parses a checksum header
bool parseChecksumHeader(const QByteArray &header, QByteArray *type, QByteArray *checksum);

/// Convenience for getting the type from a checksum header, null if none
QByteArray parseChecksumHeaderType(const QByteArray &header);

/// Checks OWNCLOUD_DISABLE_CHECKSUM_UPLOAD
bool uploadChecksumEnabled();

Expand Down Expand Up @@ -119,20 +122,15 @@ class OWNCLOUDSYNC_EXPORT CSyncChecksumHook : public QObject
{
Q_OBJECT
public:
explicit CSyncChecksumHook(SyncJournalDb *journal);
explicit CSyncChecksumHook();

/**
* Returns the checksum value for \a path for the given \a checksumTypeId.
* Returns the checksum value for \a path that is comparable to \a otherChecksum.
*
* Called from csync, where a instance of CSyncChecksumHook has
* to be set as userdata.
* The return value will be owned by csync.
*/
static const char *hook(const char *path, uint32_t checksumTypeId, void *this_obj);

QByteArray compute(const QString &path, int checksumTypeId);

private:
SyncJournalDb *_journal;
static const char *hook(const char *path, const char *otherChecksumHeader, void *this_obj);
};
}

0 comments on commit 8160963

Please sign in to comment.