New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
posix profiling #5187
posix profiling #5187
Changes from 28 commits
debca06
70c708d
b7724a3
ad949af
26795ba
63bfc61
5bf80b5
c3b6e28
a9ec7bf
7d167a1
97a723b
8388a8f
061395e
8d01186
9061fb5
19a188e
c60d76d
71d4a79
6d51f4d
ef2e915
d2c37ad
1fe125a
7338b4b
7a523d5
393d0b1
feba847
39009c3
c50db35
0a43fc4
8e2a714
961b4dd
4cf9765
dcbd744
088ff52
18e6ad8
5b054c2
347f096
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
/** | ||
* Copyright (c) 2014-present, Facebook, Inc. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under both the Apache 2.0 license (found in the | ||
* LICENSE file in the root directory of this source tree) and the GPLv2 (found | ||
* in the COPYING file in the root directory of this source tree). | ||
* You may select, at your option, one of the above-listed licenses. | ||
*/ | ||
|
||
#ifdef __linux__ | ||
// Needed for linux specific RUSAGE_THREAD, before including anything else | ||
#ifndef _GNU_SOURCE | ||
#define _GNU_SOURCE | ||
#endif | ||
#endif | ||
|
||
#include <cerrno> | ||
#include <cstdint> | ||
#include <cstring> | ||
|
||
#include <sys/resource.h> | ||
#include <sys/time.h> | ||
|
||
#include <boost/format.hpp> | ||
#include <boost/io/detail/quoted_manip.hpp> | ||
|
||
#include <osquery/dispatcher/query_profiler.h> | ||
#include <osquery/killswitch.h> | ||
#include <osquery/logger.h> | ||
#include <osquery/numeric_monitoring.h> | ||
|
||
namespace osquery { | ||
namespace { | ||
|
||
/**
 * @brief Pick the `who` selector handed to getrusage().
 *
 * @return RUSAGE_THREAD when compiled for Linux, RUSAGE_SELF everywhere else.
 */
int getRusageWho() {
#ifdef __linux__
  // Linux supports more granular profiling
  return RUSAGE_THREAD;
#else
  return RUSAGE_SELF;
#endif
}
|
||
void recordRusageStatDifference(int64_t start_stat, | ||
int64_t end_stat, | ||
const std::string& stat_name) { | ||
if (end_stat == 0) { | ||
TLOG << "rusage field " << boost::io::quoted(stat_name) | ||
<< " is not supported"; | ||
} else if (start_stat <= end_stat) { | ||
monitoring::record( | ||
stat_name, end_stat - start_stat, monitoring::PreAggregationType::P50); | ||
} else { | ||
LOG(WARNING) << "Possible overflow detected in rusage field: " | ||
<< boost::io::quoted(stat_name); | ||
} | ||
} | ||
|
||
void recordRusageStatDifference(const struct timeval& start_stat, | ||
const struct timeval& end_stat, | ||
const std::string& stat_name) { | ||
recordRusageStatDifference( | ||
std::chrono::duration_cast<std::chrono::milliseconds>( | ||
std::chrono::seconds(start_stat.tv_sec) + | ||
std::chrono::microseconds(start_stat.tv_usec)) | ||
.count(), | ||
std::chrono::duration_cast<std::chrono::milliseconds>( | ||
std::chrono::seconds(end_stat.tv_sec) + | ||
std::chrono::microseconds(end_stat.tv_usec)) | ||
.count(), | ||
stat_name + ".millis"); | ||
} | ||
|
||
void recordRusageStatDifference(const struct rusage& start_stats, | ||
const struct rusage& end_stats, | ||
const std::string& monitoring_path_prefix) { | ||
recordRusageStatDifference( | ||
0, end_stats.ru_maxrss, monitoring_path_prefix + ".rss.max.kb"); | ||
|
||
recordRusageStatDifference(start_stats.ru_maxrss, | ||
end_stats.ru_maxrss, | ||
monitoring_path_prefix + ".rss.increase.kb"); | ||
|
||
recordRusageStatDifference(start_stats.ru_inblock, | ||
end_stats.ru_inblock, | ||
monitoring_path_prefix + ".input.load"); | ||
|
||
recordRusageStatDifference(start_stats.ru_oublock, | ||
end_stats.ru_oublock, | ||
monitoring_path_prefix + ".output.load"); | ||
|
||
recordRusageStatDifference(start_stats.ru_utime, | ||
end_stats.ru_utime, | ||
monitoring_path_prefix + ".time.user"); | ||
|
||
recordRusageStatDifference(start_stats.ru_stime, | ||
end_stats.ru_stime, | ||
monitoring_path_prefix + ".time.system"); | ||
} | ||
|
||
enum class RusageError { FatalError = 1 }; | ||
Expected<struct rusage, RusageError> callRusage() { | ||
struct rusage stats; | ||
const int who = getRusageWho(); | ||
auto rusage_status = getrusage(who, &stats); | ||
if (rusage_status != -1) { | ||
return stats; | ||
} else { | ||
return createError(RusageError::FatalError, "") | ||
<< "Linux query profiling failed. error code: " << rusage_status | ||
<< " message: " << boost::io::quoted(strerror(errno)); | ||
} | ||
} | ||
|
||
void launchQueryWithPosixProfiling(const std::string& name, | ||
std::function<Status()> launchQuery) { | ||
const auto start_time_point = std::chrono::steady_clock::now(); | ||
auto rusage_start = callRusage(); | ||
|
||
if (!rusage_start) { | ||
LOG(ERROR) << "rusage_start error: " | ||
<< rusage_start.getError().getFullMessageRecursive(); | ||
} | ||
|
||
const auto status = launchQuery(); | ||
const auto monitoring_path_prefix = | ||
(boost::format("scheduler.executing_query.%s.%s") % name % | ||
(status.ok() ? "success" : "failure")) | ||
.str(); | ||
|
||
if (rusage_start) { | ||
const auto rusage_end = callRusage(); | ||
|
||
if (rusage_end) { | ||
recordRusageStatDifference( | ||
*rusage_start, *rusage_end, monitoring_path_prefix); | ||
} else { | ||
LOG(ERROR) << "rusage_end error: " | ||
<< rusage_end.getError().getFullMessageRecursive(); | ||
} | ||
} | ||
|
||
const auto query_duration = | ||
std::chrono::duration_cast<std::chrono::microseconds>( | ||
std::chrono::steady_clock::now() - start_time_point); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think it is good idea to include the time of monitoring key creating and time of all routine in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. query monitoring is not rusage specific, that's why I don't think it should be somewhere completely separated. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed offline. Moving steady_clock after rusage |
||
|
||
monitoring::record(monitoring_path_prefix + ".time.real.millis", | ||
query_duration.count(), | ||
monitoring::PreAggregationType::Min); | ||
} | ||
} // namespace | ||
void launchQueryWithProfiling(const std::string& name, | ||
std::function<Status()> launchQuery) { | ||
if (Killswitch::get().isPosixProfilingEnabled()) { | ||
launchQueryWithPosixProfiling(name, launchQuery); | ||
} else { | ||
launchQuery(); // Just execute the query | ||
} | ||
} | ||
|
||
} // namespace osquery |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
/** | ||
* Copyright (c) 2014-present, Facebook, Inc. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under both the Apache 2.0 license (found in the | ||
* LICENSE file in the root directory of this source tree) and the GPLv2 (found | ||
* in the COPYING file in the root directory of this source tree). | ||
* You may select, at your option, one of the above-listed licenses. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <functional> | ||
#include <string> | ||
|
||
#include <osquery/status.h> | ||
|
||
namespace osquery { | ||
void launchQueryWithProfiling(const std::string& name, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would you like to pass the return status of `launchQuery` back to the caller? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice catch |
||
std::function<Status()> launchQuery); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/** | ||
* Copyright (c) 2014-present, Facebook, Inc. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under both the Apache 2.0 license (found in the | ||
* LICENSE file in the root directory of this source tree) and the GPLv2 (found | ||
* in the COPYING file in the root directory of this source tree). | ||
* You may select, at your option, one of the above-listed licenses. | ||
*/ | ||
|
||
#include <chrono> | ||
|
||
#include <boost/format.hpp> | ||
|
||
#include <osquery/dispatcher/query_profiler.h> | ||
#include <osquery/killswitch.h> | ||
#include <osquery/numeric_monitoring.h> | ||
|
||
namespace osquery { | ||
void launchQueryWithProfiling(const std::string& name, | ||
std::function<Status()> launchQuery) { | ||
const auto start_time_point = std::chrono::steady_clock::now(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do you calculate current time time twice if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Windows query_profiler is almost copy paste of the old code. Code flow when killswitch is turned on is not really worth of optimizations. This should be removed soon after testing. |
||
const auto status = launchQuery(); | ||
const auto monitoring_path_prefix = | ||
(boost::format("scheduler.executing_query.%s.%s") % name % | ||
(status.ok() ? "success" : "failure")) | ||
.str(); | ||
const auto query_duration = | ||
std::chrono::duration_cast<std::chrono::microseconds>( | ||
std::chrono::steady_clock::now() - start_time_point); | ||
if (Killswitch::get().isWindowsProfilingEnabled()) { | ||
monitoring::record(monitoring_path_prefix + ".time.real.millis", | ||
query_duration.count(), | ||
monitoring::PreAggregationType::Min); | ||
} | ||
} | ||
} // namespace osquery |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we care about system time here? IMHO, the most important time parameters of a running query are user time and total (wall-clock) time. If these two are fine, we likely don't care about system time. Otherwise, we can repeat the query in a controlled environment and collect more parameters. Does that make sense to you?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see no need for trade-offs here. For example, system time tells us how heavy the syscalls we are using are.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I'm just talking about more keys to send. For some monitoring systems it could be a problem :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What system are we talking about specifically? I don't think we should lose information without a very specific, practical limitation.