Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

posix profiling #5187

Merged
merged 37 commits into from Sep 14, 2018
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
debca06
linux extra query tracking
Sep 5, 2018
70c708d
maxrss should not yeld delta
Sep 5, 2018
b7724a3
support posix and linux
Sep 6, 2018
ad949af
sameify
Sep 6, 2018
26795ba
micro to mili
Sep 6, 2018
63bfc61
micros
Sep 6, 2018
5bf80b5
comment upper case
Sep 6, 2018
c3b6e28
logging level
Sep 6, 2018
a9ec7bf
linux to posix [killswitch]
Sep 6, 2018
7d167a1
Merge branch 'master' into query-benchmark/linux
guliashvili Sep 6, 2018
97a723b
audit
Sep 6, 2018
8388a8f
rename
Sep 6, 2018
061395e
separate profilings
Sep 6, 2018
8d01186
killswitch
Sep 6, 2018
9061fb5
audit
Sep 6, 2018
19a188e
killswitch
Sep 6, 2018
c60d76d
license
Sep 6, 2018
71d4a79
compilation fix
Sep 6, 2018
6d51f4d
windows profiling fix
Sep 6, 2018
ef2e915
additional include
Sep 6, 2018
d2c37ad
include
Sep 6, 2018
1fe125a
64 bit int
guliashvili Sep 6, 2018
7338b4b
Merge branch 'query-benchmark/linux' of github.com:guliashvili/osquer…
guliashvili Sep 6, 2018
7a523d5
milis rearange
guliashvili Sep 6, 2018
393d0b1
typo
guliashvili Sep 6, 2018
feba847
typo
guliashvili Sep 7, 2018
39009c3
Merge branch 'master' into query-benchmark/linux
Sep 7, 2018
c50db35
Merge branch 'query-benchmark/linux' of github.com:guliashvili/osquer…
Sep 7, 2018
0a43fc4
@akindyakov review fix
Sep 10, 2018
8e2a714
P50->P99 @mkareta review
Sep 11, 2018
961b4dd
decouple qouery_profiler from dispatcher
Sep 12, 2018
4cf9765
code audit
Sep 13, 2018
dcbd744
profiler redesign @akindiakov
Sep 14, 2018
088ff52
Merge branch 'master' into query-benchmark/linux
guliashvili Sep 14, 2018
18e6ad8
windows fix
Sep 14, 2018
5b054c2
nested class
Sep 14, 2018
347f096
compilation fix
Sep 14, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion include/osquery/killswitch.h
Expand Up @@ -7,11 +7,14 @@
* (found in the COPYING file in the root directory of this source tree). You
* may select, at your option, one of the above-listed licenses.
*/

#pragma once

#include <string>

#include <boost/core/noncopyable.hpp>

#include <osquery/core.h>
#include <osquery/expected.h>
#include <osquery/status.h>

Expand All @@ -31,6 +34,10 @@ class Killswitch : private boost::noncopyable {
public:
virtual ~Killswitch();

// Author: @guliashvili
// Creation Time: 5/09/2018
bool isPosixProfilingEnabled();

// Author: @guliashvili
// Creation Time: 4/09/2018
bool isTotalQueryCounterMonitorEnabled();
Expand All @@ -41,7 +48,7 @@ class Killswitch : private boost::noncopyable {

// Author: @guliashvili
// Creation Time: 3/09/2018
bool isExecutingQueryMonitorEnabled();
bool isWindowsProfilingEnabled();

// Author: @guliashvili
// Creation Time: 24/08/2018
Expand Down
14 changes: 14 additions & 0 deletions osquery/dispatcher/CMakeLists.txt
Expand Up @@ -21,7 +21,21 @@ ADD_OSQUERY_LIBRARY(FALSE osquery_dispatcher_runners
"${CMAKE_CURRENT_LIST_DIR}/scheduler.h"
"${CMAKE_CURRENT_LIST_DIR}/distributed_runner.cpp"
"${CMAKE_CURRENT_LIST_DIR}/distributed_runner.h"
"${CMAKE_CURRENT_LIST_DIR}/query_profiler.h"
)
if(POSIX)
target_sources(libosquery
PRIVATE
"${CMAKE_CURRENT_LIST_DIR}/posix/query_profiler.cpp"
)
endif()

if(WINDOWS)
target_sources(libosquery
PRIVATE
"${CMAKE_CURRENT_LIST_DIR}/windows/query_profiler.cpp"
)
endif()

ADD_OSQUERY_TEST_ADDITIONAL(
"${CMAKE_CURRENT_LIST_DIR}/tests/scheduler_tests.cpp"
Expand Down
162 changes: 162 additions & 0 deletions osquery/dispatcher/posix/query_profiler.cpp
@@ -0,0 +1,162 @@
/**
* Copyright (c) 2014-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the Apache 2.0 license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#ifdef __linux__
// Needed for linux specific RUSAGE_THREAD, before including anything else
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#endif

#include <cerrno>
#include <cstdint>
#include <cstring>

#include <sys/resource.h>
#include <sys/time.h>

#include <boost/format.hpp>
#include <boost/io/detail/quoted_manip.hpp>

#include <osquery/dispatcher/query_profiler.h>
#include <osquery/killswitch.h>
#include <osquery/logger.h>
#include <osquery/numeric_monitoring.h>

namespace osquery {
namespace {

int getRusageWho() {
return
#ifdef __linux__
RUSAGE_THREAD; // Linux supports more granular profiling
#else
RUSAGE_SELF;
#endif
}

void recordRusageStatDifference(int64_t start_stat,
int64_t end_stat,
const std::string& stat_name) {
if (end_stat == 0) {
TLOG << "rusage field " << boost::io::quoted(stat_name)
<< " is not supported";
} else if (start_stat <= end_stat) {
monitoring::record(
stat_name, end_stat - start_stat, monitoring::PreAggregationType::P50);
} else {
LOG(WARNING) << "Possible overflow detected in rusage field: "
<< boost::io::quoted(stat_name);
}
}

void recordRusageStatDifference(const struct timeval& start_stat,
const struct timeval& end_stat,
const std::string& stat_name) {
recordRusageStatDifference(
std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::seconds(start_stat.tv_sec) +
std::chrono::microseconds(start_stat.tv_usec))
.count(),
std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::seconds(end_stat.tv_sec) +
std::chrono::microseconds(end_stat.tv_usec))
.count(),
stat_name + ".millis");
}

void recordRusageStatDifference(const struct rusage& start_stats,
const struct rusage& end_stats,
const std::string& monitoring_path_prefix) {
recordRusageStatDifference(
0, end_stats.ru_maxrss, monitoring_path_prefix + ".rss.max.kb");

recordRusageStatDifference(start_stats.ru_maxrss,
end_stats.ru_maxrss,
monitoring_path_prefix + ".rss.increase.kb");

recordRusageStatDifference(start_stats.ru_inblock,
end_stats.ru_inblock,
monitoring_path_prefix + ".input.load");

recordRusageStatDifference(start_stats.ru_oublock,
end_stats.ru_oublock,
monitoring_path_prefix + ".output.load");

recordRusageStatDifference(start_stats.ru_utime,
end_stats.ru_utime,
monitoring_path_prefix + ".time.user");

recordRusageStatDifference(start_stats.ru_stime,
end_stats.ru_stime,
monitoring_path_prefix + ".time.system");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we care about system time here? IMHO, the most important time parameter of running query is a user time and full time. If this two is fine we likely don't care about system time. Otherwise we can repeat the intended query in controllable environment and figure out more parameters. Does it makes sense for you?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see no need of trade offs here. As an example, system time gives information about how heavy syscalls we are using.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I'm just talking about more keys to send. For some monitoring systems it could be a problem :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What system are we talking about specifically? I don't think we should lose information, without very specific/practical limitation.

}

enum class RusageError { FatalError = 1 };
Expected<struct rusage, RusageError> callRusage() {
struct rusage stats;
const int who = getRusageWho();
auto rusage_status = getrusage(who, &stats);
if (rusage_status != -1) {
return stats;
} else {
return createError(RusageError::FatalError, "")
<< "Linux query profiling failed. error code: " << rusage_status
<< " message: " << boost::io::quoted(strerror(errno));
}
}

void launchQueryWithPosixProfiling(const std::string& name,
std::function<Status()> launchQuery) {
const auto start_time_point = std::chrono::steady_clock::now();
auto rusage_start = callRusage();

if (!rusage_start) {
LOG(ERROR) << "rusage_start error: "
<< rusage_start.getError().getFullMessageRecursive();
}

const auto status = launchQuery();
const auto monitoring_path_prefix =
(boost::format("scheduler.executing_query.%s.%s") % name %
(status.ok() ? "success" : "failure"))
.str();

if (rusage_start) {
const auto rusage_end = callRusage();

if (rusage_end) {
recordRusageStatDifference(
*rusage_start, *rusage_end, monitoring_path_prefix);
} else {
LOG(ERROR) << "rusage_end error: "
<< rusage_end.getError().getFullMessageRecursive();
}
}

const auto query_duration =
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::steady_clock::now() - start_time_point);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it is good idea to include the time of monitoring key creating and time of all routine in recordRusageStatDifference in query_duration? Why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

query monitoring is not rusage specific, that's why I don't think it should be somewhere completely separated.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline. Moving steady_clock after rusage


monitoring::record(monitoring_path_prefix + ".time.real.millis",
query_duration.count(),
monitoring::PreAggregationType::Min);
}
} // namespace
void launchQueryWithProfiling(const std::string& name,
std::function<Status()> launchQuery) {
if (Killswitch::get().isPosixProfilingEnabled()) {
launchQueryWithPosixProfiling(name, launchQuery);
} else {
launchQuery(); // Just execute the query
}
}

} // namespace osquery
21 changes: 21 additions & 0 deletions osquery/dispatcher/query_profiler.h
@@ -0,0 +1,21 @@
/**
* Copyright (c) 2014-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the Apache 2.0 license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#pragma once

#include <functional>
#include <string>

#include <osquery/status.h>

namespace osquery {
void launchQueryWithProfiling(const std::string& name,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you like to pass the return status of launchQuery as a return status here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch

std::function<Status()> launchQuery);
}
25 changes: 6 additions & 19 deletions osquery/dispatcher/scheduler.cpp
Expand Up @@ -8,9 +8,11 @@
* You may select, at your option, one of the above-listed licenses.
*/

#include <algorithm>
#include <ctime>

#include <boost/format.hpp>
#include <boost/io/detail/quoted_manip.hpp>

#include <osquery/config.h>
#include <osquery/core.h>
Expand All @@ -24,6 +26,7 @@

#include "osquery/config/parsers/decorators.h"
#include "osquery/core/process.h"
#include "osquery/dispatcher/query_profiler.h"
#include "osquery/dispatcher/scheduler.h"
#include "osquery/sql/sqlite_util.h"

Expand Down Expand Up @@ -92,8 +95,7 @@ SQLInternal monitor(const std::string& name, const ScheduledQuery& query) {
return sql;
}

inline Status launchQuery(const std::string& name,
const ScheduledQuery& query) {
Status launchQuery(const std::string& name, const ScheduledQuery& query) {
// Execute the scheduled query and create a named query object.
LOG(INFO) << "Executing scheduled query " << name << ": " << query.query;
runDecorators(DECORATE_ALWAYS);
Expand Down Expand Up @@ -173,22 +175,6 @@ inline Status launchQuery(const std::string& name,
return status;
}

inline void launchQueryWithProfiling(const std::string& name,
const ScheduledQuery& query) {
auto start_time_point = std::chrono::steady_clock::now();
auto status = launchQuery(name, query);
auto query_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start_time_point);
if (Killswitch::get().isExecutingQueryMonitorEnabled()) {
auto monitoring_path = boost::format("scheduler.executing_query.%s.%s") %
name % (status.ok() ? "success" : "failure");

monitoring::record(monitoring_path.str(),
query_duration.count(),
monitoring::PreAggregationType::Min);
}
}

void SchedulerRunner::start() {
// Start the counter at the second.
auto i = osquery::getUnixTime();
Expand All @@ -199,7 +185,8 @@ void SchedulerRunner::start() {
if (query.splayed_interval > 0 && i % query.splayed_interval == 0) {
TablePlugin::kCacheInterval = query.splayed_interval;
TablePlugin::kCacheStep = i;
launchQueryWithProfiling(name, query);
launchQueryWithProfiling(
name, std::bind(launchQuery, std::ref(name), std::ref(query)));
}
}));
// Configuration decorators run on 60 second intervals only.
Expand Down
37 changes: 37 additions & 0 deletions osquery/dispatcher/windows/query_profiler.cpp
@@ -0,0 +1,37 @@
/**
* Copyright (c) 2014-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the Apache 2.0 license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#include <chrono>

#include <boost/format.hpp>

#include <osquery/dispatcher/query_profiler.h>
#include <osquery/killswitch.h>
#include <osquery/numeric_monitoring.h>

namespace osquery {
void launchQueryWithProfiling(const std::string& name,
std::function<Status()> launchQuery) {
const auto start_time_point = std::chrono::steady_clock::now();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you calculate current time time twice if Killswitch::get().isWindowsProfilingEnabled() is false? Would you like to do something like posix implementation here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Windows query_profiler is almost copy paste of the old code. Code flow when killswitch is turned on is not really worth of optimizations. This should be removed soon after testing.

const auto status = launchQuery();
const auto monitoring_path_prefix =
(boost::format("scheduler.executing_query.%s.%s") % name %
(status.ok() ? "success" : "failure"))
.str();
const auto query_duration =
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::steady_clock::now() - start_time_point);
if (Killswitch::get().isWindowsProfilingEnabled()) {
monitoring::record(monitoring_path_prefix + ".time.real.millis",
query_duration.count(),
monitoring::PreAggregationType::Min);
}
}
} // namespace osquery
8 changes: 6 additions & 2 deletions osquery/killswitch/killswitch.cpp
Expand Up @@ -32,6 +32,10 @@ FLAG(string,
Killswitch::Killswitch() {}
Killswitch::~Killswitch() = default;

bool Killswitch::isPosixProfilingEnabled() {
return isNewCodeEnabled("posixProfilingSwitch");
}

bool Killswitch::isTotalQueryCounterMonitorEnabled() {
return isNewCodeEnabled("totalQueryCounterMonitorSwitch");
}
Expand All @@ -40,8 +44,8 @@ bool Killswitch::isAppStartMonitorEnabled() {
return isNewCodeEnabled("appStartMonitorSwitch");
}

bool Killswitch::isExecutingQueryMonitorEnabled() {
return isNewCodeEnabled("executingQueryMonitorSwitch");
bool Killswitch::isWindowsProfilingEnabled() {
return isNewCodeEnabled("windowsProfilingSwitch");
}

bool Killswitch::isConfigBackupEnabled() {
Expand Down