Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 255 lines (224 sloc) 6.56 KB
#!/usr/bin/env ruby
require File.expand_path(File.dirname(__FILE__) + '/shared')
require 'pp'
# Samples CPU utilisation from /proc/stat and sends an e-mail alert when the
# average usage over a sliding window of samples reaches a threshold.
#
# Each call to #measure_and_take_action! records one sample and then sleeps
# until the next one-minute boundary, so the window of +interval+ samples
# spans roughly +interval+ minutes.
class MonitorCpu
  # interval::           window length in minutes; also the maximum number of
  #                      samples kept.
  # total_threshold::    percentage the averaged aggregate (:all) usage must
  #                      reach to trigger an alert.
  # per_core_threshold:: percentage the averaged usage of any single core must
  #                      reach to trigger an alert.
  def initialize(interval, total_threshold, per_core_threshold)
    @interval = interval
    @total_threshold = total_threshold
    @per_core_threshold = per_core_threshold
    @start_time = Time.now
    @iteration = 1
    @cpu_usages = []
  end

  # Runs the measurement loop forever.
  def start
    loop do
      measure_and_take_action!
    end
  end

  # Records one CPU usage sample and, once a full window has elapsed, checks
  # the thresholds.
  #
  # Returns true when an alert was sent (the window, the start time and the
  # iteration counter are reset afterwards), false otherwise.
  def measure_and_take_action!
    # Drop the oldest samples so the window never grows past @interval
    # entries. The empty? guard keeps a non-positive interval from spinning
    # forever on an already empty window.
    @cpu_usages.shift while !@cpu_usages.empty? && @cpu_usages.size >= @interval
    @cpu_usages << measure_cpu_usage

    window_elapsed = current_time - @start_time >= @interval * 60
    exceeded_cpu_id = window_elapsed ? threshold_exceeded? : nil

    if exceeded_cpu_id
      send_alert(exceeded_cpu_id)
      sleep(60)
      @start_time = current_time
      @iteration = 1
      @cpu_usages.clear
      true
    else
      sleep_time = next_wake_time - current_time
      sleep(sleep_time) if sleep_time > 0
      @iteration += 1
      false
    end
  end

  # Number of samples currently held in the window.
  def measurement_points
    @cpu_usages.size
  end

  # Mean usage percentage of +cpu_id+ (:all for the aggregate, or an Integer
  # core number) over the samples in the window.
  #
  # Always returns a Float. Samples that carry no reading for +cpu_id+ are
  # skipped; 0.0 is returned when there is nothing to average.
  def average(cpu_id = :all)
    readings = @cpu_usages.map { |cpu_usage| cpu_usage[cpu_id] }.compact
    return 0.0 if readings.empty?
    # Seed with a Float so integer samples are not truncated by the division.
    readings.inject(0.0) { |total, usage| total + usage } / readings.size
  end

  # Returns the id (:all or a core number) of the first CPU whose windowed
  # average has reached its threshold, or nil when none has (or when no
  # sample has been recorded yet).
  #
  # The aggregate reading is compared against the total threshold only, and
  # individual cores against the per-core threshold only.
  def threshold_exceeded?
    latest = @cpu_usages.last
    return nil if latest.nil?

    latest.each_key do |cpu_id|
      limit = cpu_id == :all ? @total_threshold : @per_core_threshold
      return cpu_id if average(cpu_id) >= limit
    end
    nil
  end

  private

  # Thin wrapper around Kernel.sleep; every wait in this class goes through
  # it, giving a single seam that can be replaced.
  def sleep(sleep_time)
    Kernel.sleep(sleep_time)
  end

  # Wrapper around Time.now; all elapsed-time checks after construction use it.
  def current_time
    Time.now
  end

  # Parses /proc/stat and returns a Hash mapping :all (the aggregate "cpu"
  # line) and each core number to { :total => ticks, :idle => ticks }.
  def query_cpu_stats
    result = {}
    File.read("/proc/stat").split("\n").each do |line|
      # Only the "cpu" and "cpuN" lines carry CPU statistics.
      next unless line =~ /^cpu(\d*) /
      cpu_id = $1.empty? ? :all : $1.to_i
      columns = line.split(/\s+/)
      # Discard the "cpu" prefix.
      columns.shift
      idle = columns[3].to_i
      total = columns.inject(0) { |sum, column| sum + column.to_i }
      result[cpu_id] = { :total => total, :idle => idle }
    end
    result
  end

  # Takes two /proc/stat snapshots +sleep_time+ seconds apart and returns a
  # Hash mapping each CPU id to the percentage of non-idle time in between.
  def measure_cpu_usage(sleep_time = 1)
    stats1 = query_cpu_stats
    sleep sleep_time
    stats2 = query_cpu_stats

    result = {}
    stats1.each_key do |cpu_id|
      # A CPU missing from the second snapshot cannot be diffed; skip it.
      next unless stats2.key?(cpu_id)
      diff_total = stats2[cpu_id][:total] - stats1[cpu_id][:total]
      diff_idle = stats2[cpu_id][:idle] - stats1[cpu_id][:idle]
      result[cpu_id] =
        if diff_total == 0
          0.0
        else
          100 * (diff_total - diff_idle) / diff_total.to_f
        end
    end
    result
  end

  # Time at which the current iteration's one-minute slot ends.
  def next_wake_time
    @start_time + @iteration * 60
  end

  # E-mails an alert describing which CPU reached its threshold. Relies on
  # the +email+ and +config+ helpers, which are defined outside this class.
  def send_alert(exceeded_cpu_id)
    formatted = sprintf("%.1f", average(exceeded_cpu_id))
    if exceeded_cpu_id == :all
      message = "Average total CPU usage over the past #{@interval} minutes is #{formatted}% (>= #{@total_threshold}%)."
    else
      message = "Average CPU usage of core #{exceeded_cpu_id} over the past #{@interval} minutes is #{formatted}% (>= #{@per_core_threshold}%)."
    end
    email(config(:from), config(:to), config(:subject), message)
  end
end
if defined?(Spec) || defined?(RSpec)
  # Loaded by the spec runner: define the examples instead of starting the monitor.
  describe MonitorCpu do
    before :each do
      @monitor = MonitorCpu.new(5, 90, 90)
      # Simulated sleep: rather than blocking, move the stubbed clock forward
      # by the requested number of seconds.
      @monitor.stub!(:sleep).and_return do |seconds|
        advanced = @monitor.send(:current_time) + seconds
        @monitor.stub!(:current_time).and_return(advanced)
      end
      @monitor.stub!(:email)
      [
        [:from, "noreply@phusion.nl"],
        [:to, "info@phusion.nl"],
        [:subject, "CPU usage exceeded!"]
      ].each do |key, value|
        @monitor.stub!(:config).with(key).and_return(value)
      end
      @now = Time.now
    end

    # Pins the monitor's clock to +minutes_passed+ minutes after @now.
    def mock_time(minutes_passed)
      @monitor.stub!(:current_time).and_return(@now + minutes_passed * 60)
    end

    # Makes every measurement return the given usage hash.
    def mock_cpu_usage(usage)
      @monitor.stub!(:measure_cpu_usage).and_return(usage)
    end

    # Sets the clock and the usage reading, runs one monitor iteration and
    # returns its result.
    def sample_at(minute, usage)
      mock_time(minute)
      mock_cpu_usage(usage)
      @monitor.measure_and_take_action!
    end

    it "sends an alert and resets the state when the average total CPU over a period of time exceeds the threshold" do
      # Two back-to-back cycles: minutes 0-5, then 6-11.
      [0, 6].each do |first_minute|
        5.times do |i|
          sample_at(first_minute + i, :all => 100, 0 => 100).should be_false
          @monitor.measurement_points.should == i + 1
          @monitor.average.should == 100
        end
        sample_at(first_minute + 5, :all => 100, 0 => 100).should be_true
        @monitor.measurement_points.should == 0
      end
    end

    it "sends an alert and resets the state when the average CPU of a single core over a period of time exceeds the threshold" do
      # Two back-to-back cycles: minutes 0-5, then 6-11.
      [0, 6].each do |first_minute|
        5.times do |i|
          sample_at(first_minute + i, :all => 50, 0 => 100, 1 => 0).should be_false
          @monitor.measurement_points.should == i + 1
          @monitor.average.should == 50
          @monitor.average(0).should == 100
          @monitor.average(1).should == 0
        end
        sample_at(first_minute + 5, :all => 50, 0 => 100, 1 => 0).should be_true
        @monitor.measurement_points.should == 0
      end
    end

    it "deletes old measurement points that are no longer relevant" do
      5.times do |i|
        sample_at(i, :all => 50, 0 => 50).should be_false
        @monitor.measurement_points.should == i + 1
      end

      # A sixth sample pushes the oldest one out of the five-sample window.
      sample_at(5, :all => 70, 0 => 70).should be_false
      @monitor.measurement_points.should == 5
      @monitor.average.should == (70 + 50 + 50 + 50 + 50) / 5.0

      sample_at(6, :all => 100, 0 => 100).should be_false
      @monitor.measurement_points.should == 5
      @monitor.average.should == (100 + 70 + 50 + 50 + 50) / 5.0
    end
  end
else
  # Executed as a script. The mail settings are read once up front and the
  # values discarded; presumably this makes a missing setting surface at
  # startup rather than when the first alert is sent (confirm against the
  # shared helper file).
  [:from, :to, :subject].each { |key| config(key) }

  begin
    interval = config(:interval)
    total_threshold = config(:total_threshold)
    per_core_threshold = config(:per_core_threshold)
    monitor = MonitorCpu.new(interval, total_threshold, per_core_threshold)

    # SIGQUIT dumps the monitor's internal state to stdout.
    trap('QUIT') do
      pp monitor
      STDOUT.flush
    end

    monitor.start
  rescue Interrupt
    # Do nothing.
  rescue SignalException => e
    # SIGTERM is swallowed like Interrupt; any other signal is re-raised.
    raise unless e.message == "SIGTERM"
  end
end