Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LibraryScanner: Improve hashing of directory contents #2497

Merged
merged 7 commits into from
Feb 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ add_library(mixxx-lib STATIC EXCLUDE_FROM_ALL
src/util/audiosignal.cpp
src/util/autohidpi.cpp
src/util/battery/battery.cpp
src/util/cache.cpp
src/util/cmdlineargs.cpp
src/util/color/color.cpp
src/util/color/predefinedcolor.cpp
Expand Down Expand Up @@ -913,6 +914,7 @@ add_executable(mixxx-test
src/test/bpmcontrol_test.cpp
src/test/broadcastprofile_test.cpp
src/test/broadcastsettings_test.cpp
src/test/cache_test.cpp
src/test/channelhandle_test.cpp
src/test/configobject_test.cpp
src/test/controller_preset_validation_test.cpp
Expand Down
1 change: 1 addition & 0 deletions build/depends.py
Original file line number Diff line number Diff line change
Expand Up @@ -1269,6 +1269,7 @@ def sources(self, build):
"src/util/xml.cpp",
"src/util/tapfilter.cpp",
"src/util/movinginterquartilemean.cpp",
"src/util/cache.cpp",
"src/util/console.cpp",
"src/util/color/color.cpp",
"src/util/db/dbconnection.cpp",
Expand Down
33 changes: 22 additions & 11 deletions src/library/dao/libraryhashdao.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,21 @@
#include "libraryhashdao.h"
#include "library/queryutil.h"

QHash<QString, int> LibraryHashDAO::getDirectoryHashes() {
namespace {

// Store hash values as a signed 64-bit integer. Otherwise values greater
// than 2^63-1 would be converted into a floating point numbers while
// losing precision!!
inline constexpr mixxx::cache_key_signed_t dbHash(mixxx::cache_key_t hash) {
return mixxx::signedCacheKey(hash);
}

} // anonymous namespace

QHash<QString, mixxx::cache_key_t> LibraryHashDAO::getDirectoryHashes() {
QSqlQuery query(m_database);
query.prepare("SELECT hash, directory_path FROM LibraryHashes");
QHash<QString, int> hashes;
QHash<QString, mixxx::cache_key_t> hashes;
if (!query.exec()) {
LOG_FAILED_QUERY(query);
}
Expand All @@ -20,15 +31,15 @@ QHash<QString, int> LibraryHashDAO::getDirectoryHashes() {
int directoryPathColumn = query.record().indexOf("directory_path");
while (query.next()) {
hashes[query.value(directoryPathColumn).toString()] =
query.value(hashColumn).toInt();
query.value(hashColumn).toULongLong();
}

return hashes;
}

int LibraryHashDAO::getDirectoryHash(const QString& dirPath) {
mixxx::cache_key_t LibraryHashDAO::getDirectoryHash(const QString& dirPath) {
//qDebug() << "LibraryHashDAO::getDirectoryHash" << QThread::currentThread() << m_database.connectionName();
int hash = -1;
mixxx::cache_key_t hash = mixxx::invalidCacheKey();

QSqlQuery query(m_database);
query.prepare("SELECT hash FROM LibraryHashes "
Expand All @@ -40,7 +51,7 @@ int LibraryHashDAO::getDirectoryHash(const QString& dirPath) {
}
//Grab a hash for this directory from the database, from the last time it was scanned.
if (query.next()) {
hash = query.value(query.record().indexOf("hash")).toInt();
hash = query.value(query.record().indexOf("hash")).toULongLong();
//qDebug() << "prev hash exists" << hash << dirPath;
} else {
//qDebug() << "prev hash does not exist" << dirPath;
Expand All @@ -49,13 +60,13 @@ int LibraryHashDAO::getDirectoryHash(const QString& dirPath) {
return hash;
}

void LibraryHashDAO::saveDirectoryHash(const QString& dirPath, const int hash) {
void LibraryHashDAO::saveDirectoryHash(const QString& dirPath, mixxx::cache_key_t hash) {
//qDebug() << "LibraryHashDAO::saveDirectoryHash" << QThread::currentThread() << m_database.connectionName();
QSqlQuery query(m_database);
query.prepare("INSERT INTO LibraryHashes (directory_path, hash, directory_deleted) "
"VALUES (:directory_path, :hash, :directory_deleted)");
query.bindValue(":directory_path", dirPath);
query.bindValue(":hash", hash);
query.bindValue(":hash", dbHash(hash));
query.bindValue(":directory_deleted", 0);

if (!query.exec()) {
Expand All @@ -65,8 +76,8 @@ void LibraryHashDAO::saveDirectoryHash(const QString& dirPath, const int hash) {
}

void LibraryHashDAO::updateDirectoryHash(const QString& dirPath,
const int newHash,
const int dir_deleted) {
mixxx::cache_key_t newHash,
int dir_deleted) {
//qDebug() << "LibraryHashDAO::updateDirectoryHash" << QThread::currentThread() << m_database.connectionName();
QSqlQuery query(m_database);
// By definition if we have calculated a new hash for a directory then it
Expand All @@ -75,7 +86,7 @@ void LibraryHashDAO::updateDirectoryHash(const QString& dirPath,
"SET hash=:hash, directory_deleted=:directory_deleted, "
"needs_verification=0 "
"WHERE directory_path=:directory_path");
query.bindValue(":hash", newHash);
query.bindValue(":hash", dbHash(newHash));
query.bindValue(":directory_deleted", dir_deleted);
query.bindValue(":directory_path", dirPath);

Expand Down
11 changes: 6 additions & 5 deletions src/library/dao/libraryhashdao.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <QSqlDatabase>

#include "library/dao/dao.h"
#include "util/cache.h"

class LibraryHashDAO : public DAO {
public:
Expand All @@ -16,11 +17,11 @@ class LibraryHashDAO : public DAO {
m_database = database;
};

QHash<QString, int> getDirectoryHashes();
int getDirectoryHash(const QString& dirPath);
void saveDirectoryHash(const QString& dirPath, const int hash);
void updateDirectoryHash(const QString& dirPath, const int newHash,
const int dir_deleted);
QHash<QString, mixxx::cache_key_t> getDirectoryHashes();
mixxx::cache_key_t getDirectoryHash(const QString& dirPath);
void saveDirectoryHash(const QString& dirPath, mixxx::cache_key_t hash);
void updateDirectoryHash(const QString& dirPath, mixxx::cache_key_t newHash,
int dir_deleted);
void markAsExisting(const QString& dirPath);
void invalidateAllDirectories();
void markUnverifiedDirectoriesAsDeleted();
Expand Down
2 changes: 1 addition & 1 deletion src/library/scanner/importfilestask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ ImportFilesTask::ImportFilesTask(LibraryScanner* pScanner,
const ScannerGlobalPointer scannerGlobal,
const QString& dirPath,
const bool prevHashExists,
const int newHash,
const mixxx::cache_key_t newHash,
const QLinkedList<QFileInfo>& filesToImport,
const QLinkedList<QFileInfo>& possibleCovers,
SecurityTokenPointer pToken)
Expand Down
5 changes: 2 additions & 3 deletions src/library/scanner/importfilestask.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

#include "util/sandbox.h"
#include "library/scanner/scannertask.h"
#include "library/scanner/scannerglobal.h"

// Import the provided files. Successful if the scan completed without being
// cancelled. False if the scan was cancelled part-way through.
Expand All @@ -17,7 +16,7 @@ class ImportFilesTask : public ScannerTask {
const ScannerGlobalPointer scannerGlobal,
const QString& dirPath,
const bool prevHashExists,
const int newHash,
const mixxx::cache_key_t newHash,
const QLinkedList<QFileInfo>& filesToImport,
const QLinkedList<QFileInfo>& possibleCovers,
SecurityTokenPointer pToken);
Expand All @@ -28,7 +27,7 @@ class ImportFilesTask : public ScannerTask {
private:
const QString m_dirPath;
const bool m_prevHashExists;
const int m_newHash;
const mixxx::cache_key_t m_newHash;
const QLinkedList<QFileInfo> m_filesToImport;
const QLinkedList<QFileInfo> m_possibleCovers;
SecurityTokenPointer m_pToken;
Expand Down
24 changes: 14 additions & 10 deletions src/library/scanner/libraryscanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,8 @@ mixxx::Logger kLogger("LibraryScanner");

QAtomicInt s_instanceCounter(0);

const QString kDeleteOrphanedLibraryScannerDirectories =
"delete from LibraryHashes where hash <> 0 and directory_path not in (select directory from track_locations)";

// Returns the number of affected rows or -1 on error
int execCleanupQuery(QSqlDatabase database, const QString& statement) {
FwdSqlQuery query(database, statement);
int execCleanupQuery(FwdSqlQuery& query) {
VERIFY_OR_DEBUG_ASSERT(query.isPrepared()) {
return -1;
}
Expand Down Expand Up @@ -142,11 +138,19 @@ void LibraryScanner::run() {
// Clean up the database and fix inconsistencies from previous runs.
// See also: https://bugs.launchpad.net/mixxx/+bug/1846945
{
kLogger.info() << "Cleaning up database...";
kLogger.info()
<< "Cleaning up database...";
PerformanceTimer timer;
timer.start();
auto numRows = execCleanupQuery(dbConnection,
kDeleteOrphanedLibraryScannerDirectories);
const auto sqlStmt = QStringLiteral(
"DELETE FROM LibraryHashes WHERE hash <> :unequalHash "
"AND directory_path NOT IN "
"(SELECT directory FROM track_locations)");
FwdSqlQuery query(dbConnection, sqlStmt);
query.bindValue(
QStringLiteral(":unequalHash"),
static_cast<mixxx::cache_key_signed_t>(mixxx::invalidCacheKey()));
auto numRows = execCleanupQuery(query);
if (numRows < 0) {
kLogger.warning()
<< "Failed to delete orphaned directory hashes";
Expand Down Expand Up @@ -189,7 +193,7 @@ void LibraryScanner::slotStartScan() {
changeScannerState(SCANNING);

QSet<QString> trackLocations = m_trackDao.getTrackLocations();
QHash<QString, int> directoryHashes = m_libraryHashDao.getDirectoryHashes();
QHash<QString, mixxx::cache_key_t> directoryHashes = m_libraryHashDao.getDirectoryHashes();
QRegExp extensionFilter(SoundSourceProxy::getSupportedFileNamesRegex());
QRegExp coverExtensionFilter =
QRegExp(CoverArtUtils::supportedCoverArtExtensionsRegex(),
Expand Down Expand Up @@ -515,7 +519,7 @@ void LibraryScanner::queueTask(ScannerTask* pTask) {
}

void LibraryScanner::slotDirectoryHashedAndScanned(const QString& directoryPath,
bool newDirectory, int hash) {
bool newDirectory, mixxx::cache_key_t hash) {
ScopedTimer timer("LibraryScanner::slotDirectoryHashedAndScanned");
//kLogger.debug() << "sloDirectoryHashedAndScanned" << directoryPath
// << newDirectory << hash;
Expand Down
2 changes: 1 addition & 1 deletion src/library/scanner/libraryscanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class LibraryScanner : public QThread {

// ScannerTask signal handlers.
void slotDirectoryHashedAndScanned(const QString& directoryPath,
bool newDirectory, int hash);
bool newDirectory, mixxx::cache_key_t hash);
void slotDirectoryUnchanged(const QString& directoryPath);
void slotTrackExists(const QString& trackPath);
void slotAddNewTrack(const QString& trackPath);
Expand Down
12 changes: 7 additions & 5 deletions src/library/scanner/recursivescandirectorytask.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <QCryptographicHash>
#include <QDirIterator>

#include "library/scanner/recursivescandirectorytask.h"
Expand Down Expand Up @@ -38,7 +39,8 @@ void RecursiveScanDirectoryTask::run() {
QLinkedList<QFileInfo> filesToImport;
QLinkedList<QFileInfo> possibleCovers;
QLinkedList<QDir> dirsToScan;
QStringList newHashStr;

QCryptographicHash hasher(QCryptographicHash::Sha256);

// TODO(rryan) benchmark QRegExp copy versus QMutex/QRegExp in ScannerGlobal
// versus slicing the extension off and checking for set/list containment.
Expand All @@ -54,7 +56,7 @@ void RecursiveScanDirectoryTask::run() {
if (currentFileInfo.isFile()) {
const QString& fileName = currentFileInfo.fileName();
if (supportedExtensionsRegex.indexIn(fileName) != -1) {
newHashStr.append(currentFile);
hasher.addData(currentFile.toUtf8());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you made a performance check for toUtf8() compared to use the QString bytes?
I can Imagine that the later is faster.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Raw bytes of QString are not platform-independent when considering endianess, but UTF-8 is. Does this matter?

Copy link
Contributor Author

@uklotzde uklotzde Feb 16, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no function to obtain the raw bytes of a QString. Using toLatin1() may lose characters and toLocal8Bit() is platform dependent while the database is portable. Please clarify which option you consider here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was just the idea to check if something like
hasher.addData(reinterprete_cast<char*>(currentFile.utf16()))
is faster or not.
This way there is the double amount to hash but no utf8 conversion.

I don't care about endianes, because this would be only apply if the DB is on an external HD, accessing the same OS with different CPU architectures.
Thinking of this a independent solution is not bad.

Copy link
Contributor Author

@uklotzde uklotzde Feb 17, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that the additional allocations really matter here and won't invest hours to figure out how to do a realistic performance test to prove this. The optimized solution would also produce non-portable data, in contrast to all other data in the database.

Copy link
Member

@Holzhaus Holzhaus Feb 18, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was just the idea to check if something like
hasher.addData(reinterprete_cast<char*>(currentFile.utf16()))
is faster or not.
This way there is the double amount to hash but no utf8 conversion

Double the data should not be an issue. sha256 runtime not increase 1:1 with the amount of input data. I'm on my phone, but you should see the same effect on desktop pcs:

$ openssl speed sha256
The 'numbers' are in 1000s of bytes per second processed.
type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
sha256          180714.92k   532643.80k  1088833.54k  1466255.70k  1633716.91k  1647209.13k

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The actual number of bytes that we feed into QCryptographicHash should not be an issue. But QString might need to allocate a temporary QByteArray for the UTF-8 representation and the conversion itself also takes some time.

Yes, utf16() would be faster, but it is non-portable. If someone is able to proof that it is much faster and worth the drawback, please go ahead ;) I will not do it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just out of curious I did the test. Here are the results:

10 x 30 characters
  6040 nsec hasher.addData(reinterpret_cast<const char*>(lorem.unicode()), lorem.size() * sizeof(QChar));
  8683 nsec hasher.addData(lorem.toUtf8());
 12836 nsec hasher.addData(lorem.toUtf8()); "ööööö.."

10 x 888 characters 
149882 nsec hasher.addData(reinterpret_cast<const char*>(lorem.unicode()), lorem.size() * sizeof(QChar));
 82302 nsec hasher.addData(lorem.toUtf8());
257479 nsec hasher.addData(lorem.toUtf8()); "ööööö.."

Conclusion:
As expected there is only performance gain for ASCII characters. Except in non Latin charter sets two byte utf8 characters are a minority.
For a typical file name of 30 charters, the unicode() version is way faster. If we append all strings first, the utf8 version catches up at one point.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those numbers also need to be put in relation to the file system access. I guess that the differences on the overall performance are negligible. That's what I had in mind with "realistic performance test".

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes of cause. I was just interested if we have a low hanging fruit here.

filesToImport.append(currentFileInfo);
} else if (supportedCoverExtensionsRegex.indexIn(fileName) != -1) {
possibleCovers.append(currentFileInfo);
Expand All @@ -73,13 +75,13 @@ void RecursiveScanDirectoryTask::run() {

// Note: A hash of "0" is a real hash if the directory contains no files!
// Calculate a hash of the directory's file list.
int newHash = qHash(newHashStr.join(""));
const mixxx::cache_key_t newHash = mixxx::cacheKeyFromMessageDigest(hasher.result());

QString dirPath = m_dir.path();

// Try to retrieve a hash from the last time that directory was scanned.
int prevHash = m_scannerGlobal->directoryHashInDatabase(dirPath);
bool prevHashExists = prevHash != -1;
const mixxx::cache_key_t prevHash = m_scannerGlobal->directoryHashInDatabase(dirPath);
const bool prevHashExists = mixxx::isValidCacheKey(prevHash);

if (prevHashExists || m_scanUnhashed) {
// Compare the hashes, and if they don't match, rescan the files in that
Expand Down
7 changes: 4 additions & 3 deletions src/library/scanner/scannerglobal.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <QMutexLocker>
#include <QSharedPointer>

#include "util/cache.h"
#include "util/task.h"
#include "util/performancetimer.h"

Expand Down Expand Up @@ -37,7 +38,7 @@ class DirInfo {
class ScannerGlobal {
public:
ScannerGlobal(const QSet<QString>& trackLocations,
const QHash<QString, int>& directoryHashes,
const QHash<QString, mixxx::cache_key_t>& directoryHashes,
const QRegExp& supportedExtensionsMatcher,
const QRegExp& supportedCoverExtensionsMatcher,
const QStringList& directoriesBlacklist)
Expand All @@ -62,7 +63,7 @@ class ScannerGlobal {
}

// Returns the directory hash if it exists or -1 if it doesn't.
inline int directoryHashInDatabase(const QString& directoryPath) const {
inline mixxx::cache_key_t directoryHashInDatabase(const QString& directoryPath) const {
return m_directoryHashes.value(directoryPath, -1);
}

Expand Down Expand Up @@ -177,7 +178,7 @@ class ScannerGlobal {
TaskWatcher m_watcher;

QSet<QString> m_trackLocations;
QHash<QString, int> m_directoryHashes;
QHash<QString, mixxx::cache_key_t> m_directoryHashes;

mutable QMutex m_supportedExtensionsMatcherMutex;
QRegExp m_supportedExtensionsMatcher;
Expand Down
2 changes: 1 addition & 1 deletion src/library/scanner/scannertask.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class ScannerTask : public QObject, public QRunnable {
void taskDone(bool success);
void queueTask(ScannerTask* pTask);
void directoryHashedAndScanned(const QString& directoryPath,
bool newDirectory, int hash);
bool newDirectory, mixxx::cache_key_t hash);
void directoryUnchanged(const QString& directoryPath);
void trackExists(const QString& filePath);
void addNewTrack(const QString& filePath);
Expand Down
2 changes: 2 additions & 0 deletions src/mixxxapplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "soundio/soundmanagerutil.h"
#include "track/track.h"
#include "track/trackref.h"
#include "util/cache.h"
#include "util/color/rgbcolor.h"
#include "util/math.h"

Expand Down Expand Up @@ -66,6 +67,7 @@ void MixxxApplication::registerMetaTypes() {
qRegisterMetaType<QList<CrateId>>();
qRegisterMetaType<TrackPointer>();
qRegisterMetaType<mixxx::ReplayGain>("mixxx::ReplayGain");
qRegisterMetaType<mixxx::cache_key_t>("mixxx::cache_key_t");
qRegisterMetaType<mixxx::Bpm>("mixxx::Bpm");
qRegisterMetaType<mixxx::Duration>("mixxx::Duration");
qRegisterMetaType<std::optional<mixxx::RgbColor>>("std::optional<mixxx::RgbColor>");
Expand Down
41 changes: 41 additions & 0 deletions src/test/cache_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "util/cache.h"

#include <gtest/gtest.h>

#include <QtDebug>

namespace {

class CacheTest : public testing::Test {
};

TEST_F(CacheTest, InvalidCacheKey) {
EXPECT_FALSE(mixxx::isValidCacheKey(
mixxx::invalidCacheKey()));
}

TEST_F(CacheTest, InvalidCacheKeyEmptyByteArray) {
EXPECT_FALSE(mixxx::isValidCacheKey(
mixxx::cacheKeyFromMessageDigest(QByteArray())));
EXPECT_FALSE(mixxx::isValidCacheKey(
mixxx::cacheKeyFromMessageDigest("")));
EXPECT_FALSE(mixxx::isValidCacheKey(
mixxx::cacheKeyFromMessageDigest("\0")));
}

TEST_F(CacheTest, ValidCacheKey) {
EXPECT_TRUE(mixxx::isValidCacheKey(
mixxx::cacheKeyFromMessageDigest(QByteArrayLiteral("test"))));
}

TEST_F(CacheTest, ValidCacheKeySingleZeroAscii) {
EXPECT_TRUE(mixxx::isValidCacheKey(
mixxx::cacheKeyFromMessageDigest(QByteArray("0"))));
}

TEST_F(CacheTest, ValidCacheKeySingleZeroByte) {
EXPECT_TRUE(mixxx::isValidCacheKey(
mixxx::cacheKeyFromMessageDigest(QByteArray("\0", 1))));
}

} // anonymous namespace