Permalink
Fetching contributors…
Cannot retrieve contributors at this time
executable file 1184 lines (1110 sloc) 49.2 KB
#!/usr/bin/env ruby
# Run with "-h" for usage, "-v" to see INFO-level output while running.
#
# Run this script as root on any kind of OpenShift host to diagnose common problems.
# It is intended to evolve quickly in response to actual problems experienced.
# It does not perform any function or send results anywhere; it just outputs
# diagnostic information which may be of some use in troubleshooting.
#
# INFO output is strictly informational.
# WARN output means something is not right but may not impair functionality.
# FAIL output means a serious problem probably impairing functionality.
#
# OpenShift Online admins may need to adjust PATH to use proper ruby and gem for broker tests.
#
# Please report false positives/negatives or other problems with this script
# via https://bugzilla.redhat.com/
#--
# Copyright 2013 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#++
class OODiag
require 'rubygems'
require 'tempfile'
# Set up (or reset) the state for a diagnostic run.
#
# options - optional Hash of settings. When nil, options kept from an earlier
#           call are reused; failing that, the built-in defaults below apply.
#
# Counters and all detected host state are reset on every call.
def initialize(options = nil)
  defaults = { :wait => 2, :verbose => false }
  @options = options || @options || defaults
  # tallies returned by run_tests
  @errors = 0
  @warns = 0
  # host facts, filled in by the detect_* / load_* setup methods
  @is_broker = false
  @is_node = false
  @rpms = {}
  @project_is = {}
  @os_is = {}
end
# Prepare for a test run: reset all state, then detect what is on this host.
# Order matters: the RPM list is loaded first because the node/broker and
# project detection read @rpms, and the broker's Rails environment is only
# loaded when a broker was detected.
def run_setup
initialize
load_rpm_list
detect_is_node_or_broker
detect_project_version
detect_os_version
load_broker_rails_env if @is_broker
end
# Run the selected tests and return [warning_count, error_count].
#
# Tests come from @options[:tests]; when that is nil or empty, every instance
# method named prereq_* is run, followed by every method named test_*.
# A test raising SkipTest is logged and skipped; AbortTests stops the whole
# run unless @options[:abortok] is set; any other StandardError is recorded
# as a failure of that test.
def run_tests
  selected = @options[:tests]
  if selected.nil? || selected.empty?
    available = self.class.instance_methods
    groups = %w[prereq_ test_].map do |prefix|
      available.select { |name| name.to_s.start_with? prefix }
    end
    selected = groups.flatten
  end
  selected.each do |name|
    begin
      verbose "running: #{name}"
      send name
    rescue SkipTest
      verbose "skipping #{name}"
    rescue AbortTests
      next if @options[:abortok]
      eputs "Aborting tests according to #{name}. To run all tests anyway, use the --abortok option."
      break
    rescue StandardError => e
      do_fail "error running #{name}: #{e.inspect}"
    end
  end
  [@warns, @errors]
end
######## UTILITIES ########
# Name of the method two frames up the stack, i.e. the caller of whichever
# method invoked called_from (do_fail/do_warn use it to name the reporting test).
#
# Backtrace labels are quoted as `name' by older Rubies and as 'Class#name'
# by Ruby 3.4+; both quotings are accepted and a leading "Class#" / "Class."
# qualifier is dropped so the result reads the same on every version.
# Returns nil when there is no such frame or the frame carries no label.
def called_from
  frame = caller[1] or return nil
  label = frame[/[`']([^']*)'\z/, 1]
  label && label.sub(/[\w:]+[#.](?=[^#.]*\z)/, '')
end
# Print msg to stderr in red (ANSI colour code 31), followed by a newline.
def eputs(msg)
  $stderr.write "\e[31m#{msg}\e[0m\n"
end
# Print msg to stderr in yellow (ANSI colour code 33), followed by a newline.
def wputs(msg)
  $stderr.write "\e[33m#{msg}\e[0m\n"
end
# Print an INFO line to stdout, but only when the :verbose option is set.
# Returns the (falsy) option value when silent, otherwise the result of the write.
def verbose(msg)
  enabled = @options[:verbose]
  return enabled unless enabled
  $stdout.write("INFO: #{msg}\n")
end
# Report a failure: print "FAIL: <reporting method>" and the message in red,
# then bump the error counter. Returns the new error count.
def do_fail(msg)
  # called_from must be invoked directly from here so it names our caller
  header = "FAIL: #{called_from}\n"
  eputs(header + msg)
  @errors += 1
end
# Report a warning: print "WARN: <reporting method>" and the message in yellow,
# then bump the warning counter. Returns the new warning count.
def do_warn(msg)
  # called_from must be invoked directly from here so it names our caller
  header = "WARN: #{called_from}\n"
  wputs(header + msg)
  @warns += 1
end
# Raised by a test to signal that it does not apply and should be skipped.
class SkipTest < StandardError
end

# Stop the current test by raising SkipTest.
def skip_test
  raise SkipTest
end

# Raised by a test to signal that running further tests is pointless.
class AbortTests < StandardError
end

# Stop the current test by raising AbortTests.
def abort_tests
  raise AbortTests
end
# when you need two string arrays to be the same length
# Make two string arrays the same length by appending "0" entries to
# whichever one is shorter. Both arrays are modified in place; returns nil.
def pad_arrays!(a, b)
  shortfall = b.length - a.length
  # Integer#times is a no-op for zero or negative counts
  shortfall.times { a << "0" }
  (-shortfall).times { b << "0" }
  nil
end
# Is the given RPM at least version need_version, release need_release?
#
# rpm_hash     - either an entry from @rpms (a Hash with :version and :release)
#                or a package name to look up in @rpms
# need_version - dotted minimum version string
# need_release - dotted minimum release string
#
# Returns false when a package name is given that is not in @rpms.
# Epoch is not taken into account.
def is_rpm_minimum_version?(rpm_hash, need_version="", need_release="")
  # resolve a package name to its entry
  if rpm_hash.is_a? String
    rpm_hash = @rpms[rpm_hash]
    return false unless rpm_hash
  end
  have_version = rpm_hash[:version].split('.')
  have_release = rpm_hash[:release].split('.')
  want_version = need_version.split('.')
  want_release = need_release.split('.')
  # equalise the number of components so they line up position by position
  pad_arrays!(want_version, have_version)
  pad_arrays!(want_release, have_release)
  # right-justify every component to a fixed width; the joined strings can
  # then be compared with a plain string comparison
  normalize = lambda { |parts| parts.map { |part| part.rjust(10) }.join }
  return normalize.call(have_version + have_release) >= normalize.call(want_version + want_release)
end
# Record a failure unless the named RPM is installed at the given minimum
# version and release; log an INFO line when the requirement is met.
def require_rpm_minimum_version(rpm, need_version="0", need_release="0")
  installed = @rpms[rpm]
  unless installed
    do_fail "required rpm #{rpm} is not installed"
    return
  end
  wanted = "#{need_version}-#{need_release}"
  unless is_rpm_minimum_version?(installed, need_version, need_release)
    return do_fail("rpm #{rpm} installed with version #{installed[:version]}-#{installed[:release]}; need at least #{wanted}")
  end
  verbose "rpm #{rpm} installed with at least version #{wanted}"
end
# Run an external command through the shell, capturing stdout and stderr
# together. A zero exit status is logged at INFO level with the output;
# anything else is recorded as a failure with the output.
def run_script(script)
  verbose "running #{script}"
  output = `#{script} 2>&1`
  framed = "\n--BEGIN OUTPUT--\n#{output}\n--END #{script} OUTPUT--"
  if $?.success?
    verbose "#{script} ran without error:" + framed
  else
    do_fail "#{script} had errors:" + framed
  end
end
######## SETUP #############
# Fill @rpms from "rpm -qa": one entry per installed package, keyed by name,
# each a Hash holding :name, :version and :release. Returns @rpms.
def load_rpm_list
  verbose "loading list of installed packages"
  listing = `rpm -qa --qf '%{NAME}|%{VERSION}|%{RELEASE}\n'`
  listing.split.each do |entry|
    name, version, release = entry.split('|')
    @rpms[name] = { :name => name, :version => version, :release => release }
  end
  @rpms
end
# Decide from the installed RPMs whether this host is a broker and/or a node.
# Sets @is_broker and @is_node and returns them as a two-element array.
def detect_is_node_or_broker
  broker_rpms = %w[openshift-origin-broker rhc-broker]
  node_rpms = %w[rubygem-openshift-origin-node rhc-node]
  @is_broker = broker_rpms.any? { |pkg| @rpms[pkg] }
  @is_node = node_rpms.any? { |pkg| @rpms[pkg] }
  verbose "OpenShift broker installed." if @is_broker
  verbose "OpenShift node installed." if @is_node
  return @is_broker, @is_node
end
# Guess which OpenShift flavour is installed from the RPM release tags and
# store the verdict in @project_is (:enterprise, :online, :origin).
# These heuristics are rough and may need work to be robust.
def detect_project_version
  releases = @rpms.values.map { |rpm| rpm[:release] }
  # Enterprise: at least one el6op RPM on the system
  enterprise = releases.select { |rel| rel.include? 'el6op' }.any?
  # Online: not Enterprise, but there are RHEL6 RPMs
  online = !enterprise && releases.select { |rel| rel.include? 'el6' }.any?
  # Origin: everything else
  origin = !(enterprise || online)
  @project_is = { :enterprise => enterprise, :online => online, :origin => origin }
  return @project_is
end
# Work out the OS family and major version from /etc/redhat-release and
# store the result in @os_is (a Hash of booleans). Returns @os_is.
#
# On a host where the file is missing or unreadable every flag is false
# instead of the whole setup phase dying with an exception.
def detect_os_version
  # looking for RHEL or Fedora version via /etc/redhat-release
  release = begin
    File.read '/etc/redhat-release'
  rescue SystemCallError
    '' # not a Red Hat style host (or unreadable): nothing will match below
  end
  @os_is = {
    # e.g. Fedora release 17 (Beefy Miracle)
    :fedora => release.include?('Fedora'),
    :fedora16 => release.include?('Fedora release 16'),
    :fedora17 => release.include?('Fedora release 17'),
    :fedora18 => release.include?('Fedora release 18'),
    # e.g. Red Hat Enterprise Linux Server release 6.3 (Santiago)
    :rhel => release.include?('Enterprise Linux'),
    :rhel6 => release.include?('Enterprise Linux Server release 6'),
    :rhel7 => release.include?('Enterprise Linux Server release 7'),
  }
  return @os_is
end
# Load the broker's Rails environment so broker tests can use its classes
# and configuration. After loading, analytics are switched off and the
# message-broker discovery timeout is set from the :wait option.
#
# If anything goes wrong a FAIL is recorded and @is_broker is cleared, which
# makes the broker tests skip themselves.
def load_broker_rails_env
begin
require "/var/www/openshift/broker/config/environment"
# Disable analytics for admin scripts
Rails.configuration.analytics[:enabled] = false
Rails.configuration.msg_broker[:rpc_options][:disctimeout] = @options[:wait]
# NOTE(review): Exception (not StandardError) is rescued here, presumably so
# that a LoadError from the require above is caught too -- confirm before narrowing.
rescue Exception => e
do_fail <<-"FAIL"
Broker application failed to load. This is most likely a gem
dependency problem. Update rubygem RPMs and restart openshift-broker
to regenerate the broker Gemfile.lock - this may fix the problem.
Another possibility is that you are running this script with a
different ruby than what is required for the broker app (1.8 vs 1.9).
***
THIS PROBLEM NEEDS TO BE RESOLVED FOR THE BROKER TO WORK.
DISABLING BROKER TESTS.
***
The error was: #{e.inspect}
FAIL
@is_broker = false
end
end
# Names of the RPMs that test_enterprise_rpms checks: each one that is
# installed is expected to carry the 'el6op' tag in its release string.
ENTERPRISE_RPMS = %w[ activemq activemq-client facter firebird
firebird-classic firebird-debuginfo firebird-devel firebird-doc
firebird-filesystem firebird-libfbclient firebird-libfbembed
firebird-superserver freetds freetds-debuginfo freetds-devel freetds-doc geos
geos-debuginfo geos-devel geos-python geos-ruby haproxy haproxy-debuginfo
jboss-eap6-index jboss-eap6-modules jenkins jenkins-plugin-openshift js
js-debuginfo js-devel libev libev-debuginfo libev-devel libev-source
libmcrypt libmcrypt-debuginfo libmcrypt-devel libmongodb mcollective
mcollective-client mcollective-common mod_bw mod_bw-debuginfo mod_passenger
mongodb mongodb-debuginfo mongodb-devel mongodb-server openshift-console
openshift-origin-console openshift-origin-broker openshift-origin-broker-util
openshift-origin-cartridge-abstract openshift-origin-cartridge-cron-1.4
openshift-origin-cartridge-diy-0.1 openshift-origin-cartridge-haproxy-1.4
openshift-origin-cartridge-jbosseap-6.0
openshift-origin-cartridge-jbossews-1.0
openshift-origin-cartridge-jbossews-2.0
openshift-origin-cartridge-jenkins-1.4
openshift-origin-cartridge-jenkins-client-1.4
openshift-origin-cartridge-mysql-5.1 openshift-origin-cartridge-perl-5.10
openshift-origin-cartridge-php-5.3 openshift-origin-cartridge-postgresql-8.4
openshift-origin-cartridge-python-2.6 openshift-origin-cartridge-ruby-1.8
openshift-origin-cartridge-ruby-1.9-scl openshift-origin-msg-node-mcollective
openshift-origin-node-util openshift-origin-port-proxy pam_openshift
pam_openshift-debuginfo perl-Algorithm-C3 perl-App-cpanminus perl-Class-C3
perl-Class-C3-XS perl-Class-C3-XS-debuginfo perl-Class-DBI perl-Class-DBI-Pg
perl-Class-Factory-Util perl-Class-MOP perl-Class-MOP-debuginfo
perl-Class-Method-Modifiers perl-DBIx-ContextualFetch perl-DBM-Deep
perl-Data-Types perl-Date-Simple perl-Date-Simple-debuginfo
perl-DateTime-Calendar-Mayan perl-DateTime-Format-Builder
perl-DateTime-Format-HTTP perl-DateTime-Format-IBeat
perl-DateTime-Format-MySQL perl-DateTime-Format-Pg
perl-DateTime-Format-Strptime perl-DateTime-debuginfo
perl-Declare-Constraints-Simple perl-Devel-GlobalDestruction
perl-Devel-GlobalDestruction-debuginfo perl-ExtUtils-AutoInstall perl-Ima-DBI
perl-Lingua-EN-Inflect perl-MRO-Compat perl-Module-Refresh perl-Moose
perl-Mouse perl-Mouse-debuginfo perl-Params-Coerce perl-Path-Class
perl-Pod-Eventual perl-SUPER perl-Scope-Guard perl-Sort-Versions
perl-Sub-Identify perl-Sub-Identify-debuginfo perl-Sub-Name
perl-Sub-Name-debuginfo perl-Test-EOL perl-Test-Fatal perl-Test-Moose
perl-Test-NoTabs perl-Test-Requires perl-Time-Piece-MySQL perl-Try-Tiny
perl-UNIVERSAL-moniker perl-UNIVERSAL-require perl-aliased php-extras
php-extras-debuginfo php-interbase php-mcrypt php-mssql php-pear-MDB2
php-pear-MDB2-Driver-pgsql php-php-gettext postgis postgis-debuginfo
postgis-docs postgis-jdbc postgis-utils postgresql-ip4r
postgresql-ip4r-debuginfo proj proj-debuginfo proj-devel proj-epsg proj-nad
proj-static python-virtualenv ragel ragel-debuginfo rhc ruby-RMagick
ruby-RMagick-debuginfo ruby-RMagick-doc ruby-hpricot ruby-json ruby-mysql
ruby-mysql-debuginfo ruby-nokogiri ruby-openshift-origin-auth-mongo
ruby-openshift-origin-common ruby-openshift-origin-controller
ruby-openshift-origin-dns-bind ruby-openshift-origin-msg-broker-mcollective
ruby-openshift-origin-node ruby-postgres ruby-postgres-debuginfo ruby-sqlite3
ruby-systemu ruby193-js ruby193-js-debuginfo ruby193-js-devel
ruby193-mod_passenger ruby193-ruby-mysql ruby193-ruby-mysql-debuginfo
ruby193-rubygem-addressable ruby193-rubygem-addressable-doc
ruby193-rubygem-bson ruby193-rubygem-bson_ext
ruby193-rubygem-bson_ext-debuginfo ruby193-rubygem-bson_ext-doc
ruby193-rubygem-chunky_png ruby193-rubygem-ci_reporter
ruby193-rubygem-compass ruby193-rubygem-compass-rails
ruby193-rubygem-compass-rails-doc ruby193-rubygem-crack
ruby193-rubygem-crack-doc ruby193-rubygem-daemon_controller
ruby193-rubygem-daemon_controller-doc ruby193-rubygem-fastthread
ruby193-rubygem-fastthread-debuginfo ruby193-rubygem-file-tail
ruby193-rubygem-file-tail-doc ruby193-rubygem-formtastic
ruby193-rubygem-formtastic-doc ruby193-rubygem-fssm ruby193-rubygem-fssm-doc
ruby193-rubygem-haml ruby193-rubygem-mongo ruby193-rubygem-mongo-doc
ruby193-rubygem-open4 ruby193-rubygem-open4-doc ruby193-rubygem-passenger
ruby193-rubygem-passenger-debuginfo ruby193-rubygem-passenger-devel
ruby193-rubygem-passenger-doc ruby193-rubygem-passenger-native
ruby193-rubygem-passenger-native-libs ruby193-rubygem-pg
ruby193-rubygem-pg-debuginfo ruby193-rubygem-pg-doc ruby193-rubygem-rdiscount
ruby193-rubygem-rdiscount-debuginfo ruby193-rubygem-rdiscount-doc
ruby193-rubygem-ruby2ruby ruby193-rubygem-ruby2ruby-doc
ruby193-rubygem-ruby_parser ruby193-rubygem-ruby_parser-doc
ruby193-rubygem-sexp_processor ruby193-rubygem-sexp_processor-doc
ruby193-rubygem-shoulda ruby193-rubygem-shoulda-doc ruby193-rubygem-simplecov
ruby193-rubygem-simplecov-doc ruby193-rubygem-simplecov-html
ruby193-rubygem-simplecov-html-doc ruby193-rubygem-spruz
ruby193-rubygem-spruz-doc ruby193-rubygem-test-unit
ruby193-rubygem-test-unit-doc ruby193-rubygem-webmock
ruby193-rubygem-webmock-doc ruby193-rubygem-xml-simple rubygem-ParseTree
rubygem-ParseTree-doc rubygem-RedCloth rubygem-RedCloth-debuginfo
rubygem-RubyInline rubygem-RubyInline-doc rubygem-ZenTest rubygem-ZenTest-doc
rubygem-abstract rubygem-abstract-doc rubygem-actionmailer rubygem-actionpack
rubygem-activemodel rubygem-activemodel-doc rubygem-activerecord
rubygem-activeresource rubygem-activesupport rubygem-addressable
rubygem-addressable-doc rubygem-archive-tar-minitar
rubygem-archive-tar-minitar-doc rubygem-arel rubygem-arel-doc rubygem-bacon
rubygem-bson rubygem-bson_ext rubygem-bson_ext-debuginfo rubygem-bson_ext-doc
rubygem-builder rubygem-bundler rubygem-bundler-doc rubygem-columnize
rubygem-commander rubygem-commander-doc rubygem-crack rubygem-crack-doc
rubygem-cucumber rubygem-daemon_controller rubygem-daemon_controller-doc
rubygem-daemons rubygem-diff-lcs rubygem-dnsruby rubygem-dnsruby-doc
rubygem-erubis rubygem-erubis-doc rubygem-fastthread rubygem-file-tail
rubygem-file-tail-doc rubygem-gherkin rubygem-gherkin-debuginfo
rubygem-gherkin-doc rubygem-highline rubygem-hoe rubygem-hoe-doc
rubygem-hpricot rubygem-hpricot-debuginfo rubygem-hpricot-doc rubygem-i18n
rubygem-i18n-doc rubygem-json rubygem-json-debuginfo rubygem-json-doc
rubygem-json_pure rubygem-json_pure-doc rubygem-mail rubygem-mail-doc
rubygem-mime-types rubygem-mime-types-doc rubygem-minitest
rubygem-minitest-doc rubygem-mocha rubygem-mocha-doc rubygem-mongo
rubygem-mongo-doc rubygem-net-ssh rubygem-net-ssh-doc rubygem-nokogiri
rubygem-nokogiri-debuginfo rubygem-nokogiri-doc rubygem-open4
rubygem-open4-doc rubygem-openshift-origin-auth-mongo
rubygem-openshift-origin-auth-remote-user rubygem-openshift-origin-common
rubygem-openshift-origin-console rubygem-openshift-origin-console-doc
rubygem-openshift-origin-controller rubygem-openshift-origin-dns-bind
rubygem-openshift-origin-msg-broker-mcollective rubygem-openshift-origin-node
rubygem-parseconfig rubygem-passenger rubygem-passenger-debuginfo
rubygem-passenger-devel rubygem-passenger-doc rubygem-passenger-native
rubygem-passenger-native-libs rubygem-polyglot rubygem-polyglot-doc
rubygem-rack rubygem-rack-mount rubygem-rack-mount-doc rubygem-rack-test
rubygem-rails rubygem-railties rubygem-railties-doc rubygem-rake-compiler
rubygem-rake-compiler-doc rubygem-rcov rubygem-rcov-debuginfo
rubygem-rcov-doc rubygem-rdoc rubygem-rdoc-doc rubygem-regin
rubygem-regin-doc rubygem-rest-client rubygem-rspec rubygem-rspec-core
rubygem-rspec-core-doc rubygem-ruby2ruby rubygem-ruby2ruby-doc
rubygem-ruby_parser rubygem-ruby_parser-doc rubygem-rubyforge
rubygem-sexp_processor rubygem-sexp_processor-doc rubygem-shoulda
rubygem-shoulda-doc rubygem-sinatra rubygem-spruz rubygem-spruz-doc
rubygem-sqlite3 rubygem-sqlite3-debuginfo rubygem-sqlite3-doc
rubygem-state_machine rubygem-state_machine-doc rubygem-stomp
rubygem-stomp-doc rubygem-systemu rubygem-systemu-doc rubygem-term-ansicolor
rubygem-test-spec rubygem-test-unit rubygem-test-unit-doc rubygem-text-format
rubygem-thor rubygem-thor-doc rubygem-thread-dump
rubygem-thread-dump-debuginfo rubygem-treetop rubygem-trollop rubygem-tzinfo
rubygem-tzinfo-doc rubygem-webmock rubygem-webmock-doc rubygem-xml-simple
rubygems source-highlight source-highlight-debuginfo source-highlight-devel
unittest
]
# Loopback addresses (IPv4 and IPv6); node public hostnames/IPs must not
# resolve to any of these. Frozen so the shared list cannot be altered by accident.
LOCALHOST = %w[127.0.0.1 ::1].freeze
######## TESTS #############
#
# Methods beginning with prereq_ or test_ will be run automatically.
# prereq_ methods run before test_ methods in order to enable aborting testing
# if there are serious problems.
# Prerequisite check: does the first nameserver in /etc/resolv.conf answer
# DNS queries at all?
#
# Records a failure and aborts the remaining tests (AbortTests) when no
# nameserver is configured or the server does not answer NXDOMAIN for a
# name that cannot exist. A missing or unreadable resolv.conf is treated
# the same as one without nameservers.
def prereq_dns_server_available
  verbose "checking that the first server in /etc/resolv.conf responds"
  resolv_conf = '/etc/resolv.conf'
  server = nil
  if File.readable? resolv_conf
    # block form of File.open so the handle is always closed
    File.open(resolv_conf) do |conf|
      conf.each_line do |line|
        next unless line =~ /^\s*nameserver\s+(\S+)/
        server = $1
        break
      end
    end
  end
  if server.nil?
    do_fail <<-"FAIL"
Your /etc/resolv.conf does not define any nameservers.
Not much is going to work without DNS resolution.
    FAIL
    abort_tests
  end
  # just looking for a nameserver response. at this point we
  # don't much care if resolution is correct; what we want
  # is to avoid running a bunch more tests when the nameserver
  # is obviously broken.
  command = "host -W 1 do-not-expect-to-resolve. #{server}"
  # IPSocket.getaddress goes through /etc/hosts;
  # "host" uses DNS which is what we want.
  output = `#{command} 2>&1`
  unless output =~ /NXDOMAIN/
    do_fail <<-"FAIL"
#{server} doesn't appear to respond to DNS requests.
This command:
#{command}
should have returned NXDOMAIN to the request.
Instead, it returned:
#{output}
Please check the following to resolve this issue:
* Does /etc/resolv.conf have your correct nameserver?
* Is your nameserver running?
* Is the firewall on your nameserver open (udp:53)?
* Can you connect to your nameserver?
Not much is going to work without DNS resolution.
    FAIL
    abort_tests
  end
end
# On OpenShift Enterprise, warn about installed RPMs from ENTERPRISE_RPMS
# whose release string lacks the 'el6op' tag. Skipped on other projects.
def test_enterprise_rpms
  skip_test unless @project_is[:enterprise]
  verbose "Checking that all OpenShift RPMs are actually from OpenShift Enterprise"
  # waive two errata RPMs that didn't get the right distro tag
  waived = {
    'openshift-console' => '0.0.5-3.el6',
    'openshift-origin-broker' => '1.0.2-1.el6',
  }
  rogue_rpms = []
  ENTERPRISE_RPMS.each do |rpm|
    next unless @rpms.has_key?(rpm)
    rel = @rpms[rpm][:release]
    ver = @rpms[rpm][:version]
    next if rel.include?('el6op')
    next if waived[rpm] == "#{ver}-#{rel}"
    rogue_rpms << "#{rpm} should be an OpenShift Enterprise RPM but installed version #{ver}-#{rel} does not have 'el6op' in it"
  end
  rogue_rpms.empty? or do_warn <<-ROGUES
The following problems were found with your RPMs: \n\t#{ rogue_rpms.join("\n\t") }
Please ensure that you have not enabled EPEL or other third-party repositories, and
do not have any of these RPMs pre-installed in your install image. These RPMs must come
from your OpenShift Enterprise subscription in order to be supported.
  ROGUES
end
# On RHEL 6, require selinux-policy 3.7.19-155.el6_3.8 or newer.
# Skipped on every other OS.
def test_selinux_policy_rpm
  skip_test if !@os_is[:rhel6]
  require_rpm_minimum_version('selinux-policy', "3.7.19", "155.el6_3.8")
end
# Broker only: fail when anything in the broker application's cache
# directory is owned by root.
def test_broker_cache_permissions
  skip_test unless @is_broker
  root_owned = `find /var/www/openshift/broker/tmp/cache/* -user root`
  if root_owned.empty?
    verbose "broker application cache permissions appear fine"
  else
    do_fail "broker application cache contains files belonging to root; please clear the cache"
  end
end
# Broker only: check the public_hostname fact reported by every node host.
#
# Each hostname must resolve as a fully qualified name, must not resolve to
# a loopback address, and must not be shared by more than one node. It is
# also a failure when no node answers at all.
#
# Only ordinary errors (StandardError, which includes SocketError) count as
# "does not resolve"; an interrupt or exit request is allowed to propagate.
def test_nodes_public_hostname
  skip_test unless @is_broker
  require 'socket'
  verbose "checking that each public_hostname resolves properly"
  names_for = Hash.new {|h,k| h[k]=[]}
  #
  # get the PUBLIC_HOSTNAME from every node
  # and make sure it resolves and is not localhost
  #
  OpenShift::MCollectiveApplicationContainerProxy.rpc_get_fact("public_hostname") do |node,host|
    names_for[host] << node
    # test host resolution
    begin
      # public_hostname must resolve as a FQDN, so should be the full name
      # (the "." at the end blocks adding a search domain)
      resolved_host = IPSocket.getaddress(host + ".")
      if LOCALHOST.member? resolved_host
        do_fail "PUBLIC_HOSTNAME #{host} for #{node} should be public, not localhost"
      else
        verbose "PUBLIC_HOSTNAME #{host} for #{node} resolves to #{resolved_host}"
      end
    rescue StandardError => e
      do_fail "PUBLIC_HOSTNAME #{host} for #{node} does not resolve as a FQDN (#{e})"
    end
  end
  if names_for.empty?
    do_fail "No node hosts responded. Run 'mco ping' and troubleshoot if this is unexpected."
    return
  end
  #
  # check that each hostname is unique
  # as it causes really confusing problems creating apps if it isn't.
  #
  verbose "checking that each public_hostname is unique"
  names_for.each do |host,nodes|
    if nodes.length > 1
      do_fail "multiple node hosts have PUBLIC_HOSTNAME #{host}: #{nodes.join ','}"
    end
  end
end
# Broker only: check the public_ip fact reported by every node host.
# A loopback address is a failure, and so is an address shared by more
# than one node. Nothing is reported when no node answers.
def test_nodes_public_ip
  skip_test unless @is_broker
  verbose "checking that public_ip has been set for all nodes"
  nodes_for = Hash.new { |table, key| table[key] = [] }
  # collect PUBLIC_IP from every node, rejecting localhost as we go
  OpenShift::MCollectiveApplicationContainerProxy.rpc_get_fact("public_ip") do |node, ip|
    nodes_for[ip] << node
    unless LOCALHOST.include? ip
      verbose "PUBLIC_IP #{ip} for #{node}"
      next
    end
    do_fail "PUBLIC_IP #{ip} should be public, not localhost"
  end
  return if nodes_for.empty? # nothing to do...
  # every node must have its own address
  verbose "checking that public_ip is unique for all nodes"
  nodes_for.each do |ip, nodes|
    next unless nodes.length > 1
    do_fail "multiple node hosts have public_ip #{ip}: #{nodes.join ','}"
  end
end
# Broker only: compare the cartridge lists reported by the node hosts.
#
# Fails for any node without cartridges and for any node missing a
# cartridge that some other node has. When all nodes agree, the combined
# list is also compared with what the broker's REST API reports, and a
# mismatch is reported as a warning.
def test_nodes_cartridges_from_broker
  skip_test unless @is_broker
  verbose "checking that all node hosts have cartridges installed"
  carts_for = {}
  all_carts = []
  # collect the cartridge list from every node
  OpenShift::MCollectiveApplicationContainerProxy.rpc_get_fact("cart_list") do |node, carts|
    node_carts = carts.split('|').sort
    carts_for[node] = node_carts
    all_carts << node_carts
    if carts.empty?
      do_fail "host #{node} does not have any cartridges installed"
    else
      verbose "cartridges for #{node}: #{carts}"
    end
  end
  return if carts_for.empty? # nothing to do...
  # every node should offer the same set
  verbose "checking that same cartridges are installed on all node hosts"
  all_carts = all_carts.flatten.uniq.sort
  if all_carts.empty?
    do_fail "no cartridges are installed; please install cartridges on your node hosts"
    return
  end
  any_missing = false
  carts_for.each do |node, carts|
    missing = all_carts - carts
    next if missing.empty?
    do_fail "node #{node} cartridge list is missing #{missing.join ','}"
    any_missing = true
  end
  # comparing against the broker when nodes disagree would likely
  # produce a false positive
  return if any_missing
  # check against broker's probably-cached list of carts
  verbose "checking that broker's cache is not stale"
  begin
    require 'json'
    require 'rest-client'
    response = JSON.parse(RestClient.get('http://localhost:8080/broker/rest/cartridges.json'))
    api_carts = response["data"].map { |cart| cart["name"] }
    verbose "API reports carts: #{api_carts.join '|'}"
    # remove abstract_carts which don't show in the API
    all_carts.reject! { |cart| cart.start_with? "abstract" }
    do_warn <<-"MISMATCH" unless api_carts.sort == all_carts.sort
The broker's list of cartridges does not match what is available on
the node hosts. The broker probably has cached an old list.
Broker is missing: #{(all_carts - api_carts).join ", "}
Broker has extra: #{(api_carts - all_carts).join ", "}
Clear the cache by executing:
# cd /var/www/openshift/broker
# bundle exec rake tmp:clear
    MISMATCH
  rescue StandardError => e
    do_fail "Couldn't retrieve cartridge list from broker: #{e}"
  end
end
# Broker only: cross-check gear profiles between three sources --
# the broker configuration (gear sizes, default gear size, default gear
# capabilities), the node_profile fact reported by responding node hosts,
# and the districts returned by District.find_all.
#
# Fails or warns on: nodes without a profile, an empty or inconsistent
# broker configuration, configured profiles that no node / no active
# district host provides, profiles provided but not configured, nodes
# outside any district, nodes whose profile differs from their district's,
# and district members that did not respond.
#
# The test ends early (via skip_test, after reporting) when no node hosts
# respond or when no districts are defined.
def test_node_profiles_districts_from_broker
  skip_test unless @is_broker
  conf = "/etc/openshift/broker.conf"
  conf_profiles = Rails.configuration.openshift[:gear_sizes]
  default_profile = Rails.configuration.openshift[:default_gear_size]
  default_allowed = Rails.configuration.openshift[:default_gear_capabilities]
  #
  # get the gear profile from every node
  #
  # a_ for the records according to actual nodes that respond
  verbose "checking node profiles via MCollective"
  a_profile_for = Hash.new
  a_nodes_with_profile = Hash.new {|h,k| h[k] = []}
  OpenShift::MCollectiveApplicationContainerProxy.rpc_get_fact("node_profile") do |node,profile|
    if profile.empty?
      do_fail <<-FAIL
Host #{node} does not have a profile defined.
This is a serious problem and will prevent it from hosting gears.
* Is the facter running on the host and updating /etc/mcollective/facts.yaml?
* What profile is specified in the host's /etc/openshift/resource_limits.conf?
      FAIL
    else
      verbose "profile for #{node}: #{profile}"
      a_profile_for[node] = profile
      a_nodes_with_profile[profile] << node
    end
  end
  # check that gear profiles from broker.conf match actual available node profiles;
  # also, validate the broker.conf settings make sense
  if conf_profiles.empty?
    do_fail "No gear sizes configured; please fix VALID_GEAR_SIZES in #{conf}"
  else
    do_fail <<-MISSING unless conf_profiles.include? default_profile
Default gear profile is not included in valid gear sizes
"#{default_profile}" is not in #{conf_profiles.join ", "}
Attempts to create apps without specifying gear size will fail.
Please fix the settings in #{conf}
    MISSING
    if default_allowed # early versions didn't define this, skip if not defined
      missing_profiles = default_allowed - conf_profiles
      do_fail <<-MISSING unless missing_profiles.empty?
The following gear profile(s) are available to users by default
(DEFAULT_GEAR_CAPABILITIES), but not valid (VALID_GEAR_SIZES):
#{missing_profiles.join ", "}
Attempts to create apps using these gears will fail.
Please fix the settings in #{conf}
      MISSING
    end
    if a_profile_for.empty?
      do_fail <<-NONE
No node hosts found. Please install some,
or ensure the existing ones respond to 'mco ping'.
OpenShift cannot host gears without at least one node host responding.
      NONE
      # nothing further can be compared without node data
      skip_test
    end
    missing_profiles = conf_profiles - a_nodes_with_profile.keys
    do_fail <<-MISSING unless missing_profiles.empty?
The following gear profile(s) are configured but not provided by node hosts:
#{missing_profiles.join ", "}
Attempts to create apps using these gear profiles will fail.
Please fix the settings in #{conf} or add node hosts accordingly.
    MISSING
    missing_profiles = a_nodes_with_profile.keys - conf_profiles
    do_warn <<-MISSING unless missing_profiles.empty?
The following gear profile(s):
#{missing_profiles.join ", "}
are available on at least one node host, but not configured for the broker.
Please fix the VALID_GEAR_SIZES setting in #{conf}
or remove / reconfigure the relevant node host(s).
    MISSING
  end
  #
  # get database's list of districts and the nodes within
  #
  districts = District.find_all
  if districts.length == 0
    do_warn <<-NONE
No districts are defined. Districts should be used in any production installation.
Please consult the Administration Guide.
    NONE
    # the remaining checks are all about districts
    skip_test
  end
  verbose "checking that node profiles and districts are consistent"
  # d_ for the records according to districts
  d_profile_for = Hash.new
  d_district_for = Hash.new
  d_profile_active = Hash.new
  d_nodes_with_profile = Hash.new {|h,k| h[k] = []}
  districts.each do |district|
    profile = district.node_profile
    verbose "district '#{district.name}' has profile '#{profile}'"
    if district.server_identities.empty?
      do_warn "There are no node hosts in district '#{district.name}'"
    end
    district.server_identities.each do |node,v|
      d_profile_for[node] = profile
      d_district_for[node] = district.name
      d_profile_active[profile] = true if v["active"]
      d_nodes_with_profile[profile] << node
      verbose " host #{node} is #{v['active']? 'active' : 'inactive'} in district '#{district.name}'"
    end
  end
  # check that gear profiles from broker.conf are matched by district profiles
  unless conf_profiles.empty?
    do_fail <<-MISSING unless d_profile_active[default_profile]
Default gear profile '#{default_profile}' has no active node hosts supplying it in any district.
Attempts to create apps without specifying gear size may fail.
Please fix the settings in #{conf}
or add active node hosts to a district with profile '#{default_profile}'
using oo-admin-ctl-district.
    MISSING
    missing_profiles = conf_profiles - d_profile_active.keys
    do_fail <<-MISSING unless missing_profiles.empty?
The following gear profile(s) are configured:
#{missing_profiles.join ", "}
but not provided by any active district hosts.
Attempts to create apps using these gears may fail.
Please fix the settings in #{conf}
or add districts / node hosts with oo-admin-ctl-district.
    MISSING
    missing_profiles = d_profile_active.keys - conf_profiles
    do_warn <<-MISSING unless missing_profiles.empty?
The following gear profile(s) are available from at least one active district host:
#{missing_profiles.join ", "}
but not configured in broker.conf, so not usable.
Please fix the VALID_GEAR_SIZES setting in #{conf}
    MISSING
  end
  # check for consistency between district definitions and actual nodes
  a_profile_for.each do |node,profile|
    # check that all nodes are in a district
    if !d_profile_for[node]
      do_warn <<-NODIST
Node host #{node} with profile '#{profile}' is not a member of any district.
Please add it to a district with oo-admin-ctl-district.
      NODIST
    # check that nodes have same profiles in district definition as in actual node hosts
    elsif d_profile_for[node] != profile
      do_fail <<-WRONG
Node host #{node} has profile '#{profile}'
but is in district '#{d_district_for[node]}' with profile '#{d_profile_for[node]}'
Did you change the node profile after adding it to a district?
      WRONG
    end
  end
  d_profile_for.each do |node,profile|
    # check that no nodes from districts are missing
    do_fail <<-MISSING if !a_profile_for[node]
Node host #{node} is a member of district '#{d_district_for[node]}' but cannot be found.
If the host exists, it is not responding via MCollective.
If it should not be in the district, please remove it with oo-admin-ctl-district.
    MISSING
  end
end
# Broker only: run the broker acceptance script(s) that fit this install.
#
# On OpenShift Online that is rhc-accept-devenv (when the rhc-devenv RPM is
# installed) or rhc-accept-broker. Otherwise, with openshift-origin-broker-util
# installed, oo-accept-broker is run, plus oo-accept-systems when that script
# exists. Without the util package a warning is recorded instead.
def test_broker_accept_scripts
  skip_test unless @is_broker
  if @project_is[:online]
    run_script(@rpms['rhc-devenv'] ? "rhc-accept-devenv" : "rhc-accept-broker")
  elsif @rpms["openshift-origin-broker-util"]
    run_script("oo-accept-broker")
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2
    run_script("oo-accept-systems") if File.exist? '/usr/sbin/oo-accept-systems'
  else
    do_warn "openshift-origin-broker-util is not installed; you really should install it"
  end
end
# Node only: run oo-accept-node, or warn when the package providing it
# (openshift-origin-node-util) is not installed.
def test_node_accept_scripts
  skip_test unless @is_node
  unless @rpms["openshift-origin-node-util"]
    return do_warn("openshift-origin-node-util is not installed; you really should install it")
  end
  run_script "oo-accept-node"
end
# Grep tmpfile for a message that is known to be harmless; when it is
# present, mention it at INFO level so nobody mistakes it for a problem.
def expect_log_error(pattern, tmpfile)
  matches = `grep '#{pattern}' #{tmpfile}`
  return if matches.empty?
  verbose "broker error_log: this message is normal for now:\n #{matches}"
end
# Broker only: scan the broker's httpd error_log for known problems.
#
# The unique log lines written since the most recent
# "configured -- resuming normal operations" message (or the whole log when
# that yields nothing) are copied to a temporary file, which is then grepped
# for Passenger startup/operation errors (failures) and the mcollective
# client logging error (warning).
#
# The test is skipped when the log file does not exist or is empty. The
# skip no longer depends on verbose mode being on, emptiness is judged by
# file size (a shell redirect always creates the file), and the temporary
# file is removed on every exit path.
def test_broker_httpd_error_log
  skip_test unless @is_broker
  logfile = @project_is[:enterprise] ? '/var/www/openshift/broker/httpd/logs/error_log' : '/var/log/openshift/broker/httpd/error_log'
  unless File.exist? logfile # log file just not there
    verbose "no file #{logfile}"
    skip_test
  end
  #
  # create a tmp file with all of the unique error log statements after the most recent openshift-broker start
  #
  # keep the Tempfile object referenced so the file cannot vanish under us
  scratch = Tempfile.new("oodiag-log-#{$$}")
  tmpfile = scratch.path
  begin
    system %Q[sed -n 'H; /configured -- resuming normal operations/h; ${g;p;}' #{logfile} | sort -u > #{tmpfile}]
    # if that produced nothing, just use the whole thing
    system %Q[sort -u #{logfile} > #{tmpfile}] unless File.size? tmpfile
    unless File.size? tmpfile
      verbose "log #{logfile} is empty"
      skip_test
    end
    #
    # info about Passenger prespawn errors
    expect_log_error 'Cannot execute.*prespawn.*Permission denied (13)', tmpfile
    #
    # look for real Passenger startup problems
    out = `grep 'Passenger could not be initialized\\|Unable to start the Phusion Passenger watchdog' #{tmpfile}`
    do_fail <<-"ERR" unless out.empty?
broker error_log: serious problem(s) with Passenger startup:
#{out}
The broker is probably not running correctly. rhc tool output may be confusing.
Make sure selinux-policy is updated and check the Troubleshooting Guide for tips.
    ERR
    #
    # look for Passenger errors during operation
    out = `grep 'Unexpected error in mod_passenger' #{tmpfile}`
    do_fail <<-ERR unless out.empty?
broker error_log: serious problem with Passenger operation:
#{out.split("\n").first}
    ERR
    #
    # info about mcollective client log error
    out = `grep 'Could not start logger: Errno::EACCES' #{tmpfile}`
    do_warn <<-FIXLOG unless out.empty?
broker error_log: this problem indicates mcollective client logging is failing:
#{out}
Please fix this issue with the following commands:
# chown apache:root /var/log/mcollective-client.log
# service openshift-broker restart
    FIXLOG
  ensure
    # done with the tmp file, whether we finished, skipped or failed
    scratch.close
    scratch.unlink
  end
end
# Check that the broker's httpd process tree (as printed by pstree) contains
# the Passenger processes expected for a running broker application.
#
# Skips via skip_test on non-broker hosts. Calls do_fail if httpd is absent
# from the tree, or for the first expected process name that is missing.
# Returns false when every expected process is present and true after
# reporting a missing one; when httpd itself is absent it returns whatever
# do_fail returns.
def test_broker_passenger_ps
  skip_test unless @is_broker
  verbose "checking the broker application process tree"
  cmd = 'pstree -A `cat /var/www/openshift/broker/httpd/run/httpd.pid`'
  out = `#{cmd}`
  example = <<-"PSEX"
Output from: #{cmd}
#{out}
Should look similar to this:
httpd-+-PassengerWatchd-+-PassengerHelper-+-ruby---{ruby}
| | `-6*[{PassengerHelpe}]
| |-PassengerLoggin---{PassengerLoggi}
| `-3*[{PassengerWatch}]
`-8*[httpd]
  PSEX
  return do_fail "Broker httpd doesn't seem to be running at all. Try 'service openshift-broker start'" unless out.include? "httpd"
  # Find the first expected process that is absent. A `return` inside the block
  # would leave the whole method on the first process found, so the remaining
  # names would never be examined.
  missing = %w{PassengerWatchd PassengerHelper ruby PassengerLoggin}.find { |name| !out.include?(name) }
  return false unless missing
  do_fail "pstree for broker is missing #{missing}. Make sure selinux-policy RPM is up to date and check Troubleshooting guide. \n#{example}"
  true # only the first missing process is reported
end
# Warn about rubygems whose gemspec files are not owned by any RPM package,
# i.e. gems that were installed with `gem install` rather than via yum/RPM.
#
# Gem directories come from `gem environment gempath`; only those with an
# existing "specifications" subdirectory are examined. Returns without
# warning when there is nothing to examine or every gemspec is RPM-owned.
def test_for_nonrpm_rubygems
  # not sure if other projects will want this. uncomment to silence:
  # skip_test unless @project_is[:enterprise]
  verbose "checking for presence of gem-installed rubygems"
  # list out all gem directories and find those without RPM ownership
  gemdirs = `gem environment gempath`.chomp.split(':').
    map {|dir| dir + "/specifications"}.
    select {|dir| File.exist? dir}.
    map {|dir| dir + "/*.gemspec"}
  # With no directories, a bare `ls` would list the current working directory
  # and report unrelated files as unowned.
  return if gemdirs.empty?
  verbose "looking in #{gemdirs.join ' '}"
  disown = `ls #{gemdirs.join ' '} | xargs -n 1 rpm -qf`.
    split("\n").select {|line| line.end_with? "not owned by any package"}
  return if disown.empty?
  do_warn <<-"NOOO"
The following lines indicate rubygems that were installed from outside
sources rather than via yum/RPM. These are unsupported and likely to
break OpenShift. If you see multiple versions of any gems with
'gem list' you have likely overridden OpenShift-installed gems.
Uninstall as necessary with 'gem uninstall <gemname> -v <version>'.
\n#{disown.join("\n")}
  NOOO
end
# Warn about rubygems that `gem list` reports with more than one installed
# version (detected by a comma in the gem's version list).
# Returns without warning when no gem has multiple versions.
def test_for_multiple_gem_versions
  # not sure if other projects will want this. uncomment to silence:
  # skip_test unless @project_is[:enterprise]
  verbose "checking for rubygems installed with multiple versions"
  # `gem list` prints e.g. "rake (10.0.3, 0.9.2.2)"; a comma means several versions
  multiples = `gem list`.split("\n").select {|entry| entry.include? ','}
  return if multiples.empty?
  do_warn <<-"NOOO"
'gem list' indicates the following rubygems are installed with multiple
versions. OpenShift gems should be installed via yum/RPM.
If you have overridden OpenShift-installed gems, expect problems.
Uninstall as necessary with 'gem uninstall <gemname> -v <version>'.
\n#{multiples.join("\n")}
  NOOO
end
# Scan the node's httpd error_log (/var/log/httpd/error_log) for a known,
# currently-expected proxy scoreboard message and mention it in verbose output.
# Only unique messages logged since the most recent "resuming normal
# operations" line are considered (or the whole log if there is none).
#
# Skips via skip_test when this host is not a node, when the log file is
# absent, or when the log has no content. skip_test is defined elsewhere in the
# class; it is presumed to abort the current test (NOTE(review): confirm).
def test_node_httpd_error_log
  # This is a node check, so it is gated on the node flag.
  skip_test unless @is_node
  logfile = '/var/log/httpd/error_log'
  # Skip explicitly instead of chaining on verbose's return value, which would
  # silently not skip whenever verbose returns nil/false.
  unless File.exist? logfile # log file just not there
    verbose "no file #{logfile}"
    skip_test
  end
  #
  # create a tmp file with all of the unique error log statements after the most recent httpd start
  #
  # Hold on to the Tempfile object: an unreferenced Tempfile may be finalized
  # (and its file unlinked) by the garbage collector while we still use the path.
  tmp = Tempfile.new("oodiag-node-#{$$}")
  tmpfile = tmp.path
  begin
    system %Q[sed -n 'H; /configured -- resuming normal operations/h; ${g;p;}' #{logfile} | sort -u > #{tmpfile}]
    # The shell redirect always creates the file, so check for content, not existence.
    # If nothing was extracted, just use the whole thing.
    system %Q[sort -u #{logfile} > #{tmpfile}] unless File.size? tmpfile
    unless File.size? tmpfile
      verbose "log #{logfile} is empty"
      skip_test
    end
    #
    # look for proxy lb error
    out = `grep 'proxy: ap_get_scoreboard_lb.*failed in child .* for worker' #{tmpfile}`
    verbose <<-ERR unless out.empty?
node httpd error_log: this log message is expected for now:
#{out.split("\n").first}
For details see: https://bugzilla.redhat.com/show_bug.cgi?id=892871
    ERR
  ensure
    # done with the tmp file, even when the test is skipped part-way through
    tmp.close!
  end
end
# Look in the node's mcollective log for the "message is N seconds old, TTL is"
# warning, which indicates clock skew between node and broker, and report it
# with do_fail.
#
# Only unique messages since the most recent "started logging" line are
# considered (or the whole log if there is none); the logger prefix
# ("W, [timestamp #pid]") is stripped first so repeated messages collapse.
#
# logfile:: path of the mcollective log to examine (defaults to the standard
#           location; overridable mainly so the check can be exercised).
#
# Skips via skip_test on non-node hosts; returns quietly when the log is
# missing or has no content.
def test_node_mco_log(logfile = '/var/log/mcollective.log')
  skip_test unless @is_node
  return unless File.exist? logfile
  #
  # create a tmp file with all of the unique log statements after most recent mcollective start
  # (keep the Tempfile object referenced so its file is not finalized away early)
  tmp = Tempfile.new("oodiag-mco-#{$$}")
  tmpfile = tmp.path
  begin
    # Backslashes are doubled because %Q processes escapes like a double-quoted
    # string; single backslashes before w, [ and ] would never reach sed.
    system %Q! sed -n 'H; /INFO.*The Marionette Collective.*started logging/h; ${g;p;}' #{logfile} | sed 's|^\\w, \\[[^]]*\\]||' | sort -u > #{tmpfile} !
    # if nothing was extracted, just use the whole thing
    system %Q! sort -u #{logfile} > #{tmpfile} ! unless File.size? tmpfile
    return unless File.size? tmpfile
    #
    # Look for the mco timeout warning
    out = `grep 'WARN.*created at [0-9]* is [0-9]* seconds old, TTL is' #{tmpfile}`
    do_fail <<-"TIMEOUT" unless out.empty?
Node mcollective log indicates time is too far out of sync with broker:
#{out.split("\n").first}
This typically causes the node to ignore broker queries and commands.
Please ensure that all node and broker hosts are synced to the same time source.
Restart mcollective if this script is reporting an old log message.
    TIMEOUT
  ensure
    # clean up temp file on every exit path
    tmp.close!
  end
end
# Verify the PAM configuration a node relies on:
#  * sshd must not have any active (non-comment) reference to pam_selinux;
#  * each of runuser, runuser-l, sshd, su and system-auth-ac must have
#    "pam_namespace.so no_unmount_on_close" on its last active line.
# Each violation is reported with do_fail. A missing file has no active lines
# and is therefore reported as not ending with the required line.
#
# pam_dir:: directory holding the PAM service files (defaults to /etc/pam.d;
#           overridable mainly so the check can be exercised).
#
# Skips via skip_test on non-node hosts.
def test_pam_openshift(pam_dir = '/etc/pam.d')
  skip_test unless @is_node
  # Lines of a PAM file that are neither blank nor comments. Comments may be
  # indented, so a line counts as a comment if '#' is its first non-space char.
  # File.readlines closes the file when done.
  active_lines = lambda do |path|
    File.exist?(path) ? File.readlines(path).reject { |line| line =~ /^\s*(#|$)/ } : []
  end
  uses_pam_selinux = active_lines.call("#{pam_dir}/sshd").any? { |line| line.include? 'pam_selinux' }
  do_fail <<-"SSHD" if uses_pam_selinux
References to pam_selinux in #{pam_dir}/sshd should all be changed
to pam_openshift. User access to gears via ssh or git is likely to
fail; please refer to the Deployment Guide for proper settings.
  SSHD
  %w[runuser runuser-l sshd su system-auth-ac].each do |file|
    # keep last non-empty non-commented line
    last_line = active_lines.call("#{pam_dir}/#{file}").last || ""
    do_fail <<-"PAM" unless last_line =~ /pam_namespace.so\s+no_unmount_on_close/
#{pam_dir}/#{file} should end with:
session required pam_namespace.so no_unmount_on_close
Without this, OpenShift gears as well as other processes
may see very mysterious failures. Please refer to the
Deployment Guide for proper settings.
    PAM
  end
end
# Tell whether the service named +svc+ is configured to start at boot.
# Uses chkconfig when @os_is[:rhel6] is set and systemctl otherwise.
# Returns true only when the query command exits with status 0.
def service_enabled?(svc)
  query = if @os_is[:rhel6]
            "chkconfig #{svc}"
          else #systemd
            "systemctl is-enabled #{svc}.service"
          end
  # "> /dev/null 2>&1" is the POSIX spelling; ">&" is a bash extension that
  # other /bin/sh implementations reject.
  system("#{query} > /dev/null 2>&1") ? true : false
end
# Tell whether the service named +svc+ is currently running.
# Uses the "service" command when @os_is[:rhel6] is set and systemctl otherwise.
# Returns true only when the status command exits with status 0.
def service_started?(svc)
  query = if @os_is[:rhel6]
            "service #{svc} status"
          else #systemd
            "systemctl status #{svc}.service"
          end
  # "> /dev/null 2>&1" is the POSIX spelling; ">&" is a bash extension that
  # other /bin/sh implementations reject.
  system("#{query} > /dev/null 2>&1") ? true : false
end
# Check that the services this host's role(s) depend on are both running now
# and enabled at boot. Services in the "required" list produce do_fail when
# missing; those in the "recommended" list produce do_warn.
# Skips via skip_test when the host is neither broker nor node (both lists empty).
def test_services_enabled
  required = []     # a missing one is a failure
  recommended = []  # a missing one is only a warning
  if @is_broker
    required.concat %w{httpd network sshd}
    required.concat(@project_is[:online] ? %w{rhc-broker rhc-site} : %w{openshift-broker})
    recommended.concat %w{ntpd}
  end
  if @is_node
    required.concat %w{httpd network sshd openshift-port-proxy openshift-gears mcollective
                       cgconfig cgred openshift-cgroups }
    required.concat %w{openshift-node-web-proxy } unless @project_is[:enterprise]
    recommended.concat %w{ntpd oddjobd messagebus}
  end
  skip_test if required.empty? && recommended.empty?
  # a host that is both broker and node lists some services twice
  required = required.uniq
  recommended = recommended.uniq

  # --- currently running? ---
  verbose "checking that required services are running now"
  stopped = required.reject { |svc| service_started? svc }
  unless stopped.empty?
    do_fail <<-REQ
The following service(s) are not currently started:
#{stopped.join ", "}
These services are required for OpenShift functionality.
Please "service <service> start" for any not currently started.
    REQ
  end
  stopped = recommended.reject { |svc| service_started? svc }
  unless stopped.empty?
    do_warn <<-WANT
The following service(s) are not currently started:
#{stopped.join ", "}
Unless you know they are not needed, use "service <service> start" for any
that are not currently started.
    WANT
  end

  # --- enabled at boot? ---
  verbose "checking that required services are enabled at boot"
  disabled = required.reject { |svc| service_enabled? svc }
  unless disabled.empty?
    do_fail <<-REQ
The following service(s) are not started at boot time:
#{disabled.join ", "}
These services are required for OpenShift functionality.
Please "chkconfig <service> on" for each to ensure that they start at boot,
and "service <service> start" for any not currently running.
    REQ
  end
  disabled = recommended.reject { |svc| service_enabled? svc }
  unless disabled.empty?
    do_warn <<-WANT
The following service(s) are not started at boot time:
#{disabled.join ", "}
Unless you know they are not needed, use "chkconfig <service> on" for each
to ensure that they start at boot and use "service <service> start" for any
that are not currently started.
    WANT
  end
end
# Detect the RHEL 6 bug that prevents gear quota initialization when
# /var/lib/openshift is its own file system.
#
# Skips via skip_test unless this is a RHEL 6 node. Returns quietly when the
# gear home is not a separate mount or the quota file already exists.
# Otherwise calls do_fail unless selinux-policy is at least
# 3.7.19-155.el6_3.14 (per is_rpm_minimum_version?, defined elsewhere).
def test_node_quota_bug
  skip_test unless @is_node && @os_is[:rhel6]
  verbose "testing for quota creation failure bug"
  # if the gear home doesn't have its own file system, bug doesn't apply
  return if `df -P /var/lib/openshift | grep /var/lib/openshift`.empty?
  # if the quota file was created, we're probably set (assuming correct SELinux context)
  return if File.exist? '/var/lib/openshift/aquota.user'
  # if selinux policy is at least selinux-policy-3.7.19-155.el6_3.14 it's fixed
  do_fail <<-BUG unless is_rpm_minimum_version?(%w[selinux-policy 3.7.19 155.el6_3.14])
There is a bug for initializing gear quotas when /var/lib/openshift has
its own partition. See https://bugzilla.redhat.com/show_bug.cgi?id=880369#c29
for a workaround (and the rest of the bug for details). There is a fix in
selinux-policy-3.7.19-155.el6_3.14
  BUG
end
# Inspect the port-443 virtual hosts reported by `httpd -S` and warn about
# configurations that interfere with OpenShift: the stock ssl.conf vhost, and
# a node vhost whose ServerName differs from the broker proxy's.
# Skips via skip_test when the host is neither node nor broker.
def test_vhost_servernames
  skip_test unless @is_node || @is_broker
  verbose "checking for vhost interference problems"
  # Matches e.g. "*:443 localhost (/etc/httpd/conf.d/ssl.conf:25)"
  #           or "port 443 namevhost example.com (/etc/httpd/conf.d/ssl.conf:74)"
  # capturing the server name and the conf file's base name.
  vhost_line = %r! 443\s+(?:namevhost\s+)? ([\w.-]+) \s+ \(.+/([^/]+)\.conf: !x
  conf_claiming = {}  # server name => first conf file seen defining it
  servername_in = {}  # conf file   => last server name seen in it
  `httpd -S 2> /dev/null`.split("\n").each do |entry|
    hit = vhost_line.match(entry)
    next unless hit
    servername, conf = hit[1], hit[2]
    conf_claiming[servername] ||= conf
    servername_in[conf] = servername
  end
  ssl_name = servername_in["ssl"]
  if conf_claiming[ssl_name] == "ssl"
    # ssl.conf has the first/only vhost with its servername
    do_warn <<-CONFLICTS
The VirtualHost defined in /etc/httpd/conf.d/ssl.conf has the ServerName
#{ssl_name} and will respond with a 404 to all requests at
https://#{ssl_name}/
Please remove it by running this command:
sed -i '/VirtualHost/,/VirtualHost/ d' /etc/httpd/conf.d/ssl.conf
    CONFLICTS
  elsif ssl_name
    # well, we just don't want the ssl.conf vhost anyway.
    do_warn <<-REMOVE
The VirtualHost defined by default in /etc/httpd/conf.d/ssl.conf is not needed
and can cause spurious warnings. Please remove it by running this command:
sed -i '/VirtualHost/,/VirtualHost/ d' /etc/httpd/conf.d/ssl.conf
    REMOVE
  end
  broker_name = servername_in['000000_openshift_origin_broker_proxy']
  node_name = servername_in['000001_openshift_origin_node']
  if broker_name && node_name && broker_name != node_name
    # broker and node both installed, node may be stealing traffic
    do_warn <<-CONFLICT
/etc/httpd/conf.d/000001_openshift_origin_node.conf defines a *:443 VirtualHost
with ServerName #{node_name} which may intercept requests by that name
intended for the broker. Please reconfigure this vhost with the same ServerName
as the one in /etc/httpd/conf.d/000000_openshift_origin_broker_proxy.conf
    CONFLICT
  end
end
# Warn about leftover .rpmsave / .rpmnew files, which indicate that
# package-owned configuration files were altered and may need merging.
#
# Relies on the mlocate package: refreshes its database with `updatedb` and
# searches it with `locate`. When mlocate is not installed, warns about that
# instead and does nothing else.
def test_altered_package_owned_configs
  # `rpm -q` prints "package ... is not installed" on stdout, so its output is
  # never empty; the exit status is what distinguishes installed from missing.
  unless system("rpm -q mlocate > /dev/null 2>&1")
    do_warn <<-WARN
The mlocate package is not installed. mlocate is not a required runtime package; however,
you may install mlocate to enable further diagnostics checking.
    WARN
    return
  end
  `updatedb`
  # Patterns are single-quoted so the shell passes the backslash through and
  # the dot is matched literally.
  out = `locate --regex '\\.rpmsave$' '\\.rpmnew$'`
  do_warn <<-"WARN" unless out.empty?
RPM package owned configuration files have been altered:
#{out}
Ensure any package-owned configuration files which have been
altered are accurate. This may require a manual merge of
your previous alterations. Once you are comfortable with the merge,
remove the reported .rpm* configuration file (or you will continue
to see this warning each time you run the diagnostic test).
  WARN
end
# Fail when the installed httpd RPM is one of the specific version-release
# combinations known to break OpenShift operations.
# Skips via skip_test when @rpms has no entry for httpd.
def test_broken_httpd_version
  httpd_rpm = @rpms['httpd']
  skip_test unless httpd_rpm
  installed = [httpd_rpm[:version], httpd_rpm[:release]].join('-')
  known_bad = %w{ 2.2.22-14.ep6.el6 2.2.17-15.4.ep5.el6 }
  known_bad.include?(installed) && do_fail(<<-BORKED)
httpd-#{installed} is installed. This version includes serious known issues that
impact OpenShift operations. Please upgrade or downgrade httpd accordingly.
For details see: https://bugzilla.redhat.com/show_bug.cgi?id=893884
  BORKED
end
end #class OODiag
############ EXECUTION ##########
#
# If this script is running directly, just go ahead and run tests.
# In a different context (e.g. irb) just load and don't run anything.
if __FILE__ == $0
  #
  # Options parsing...
  #
  require 'optparse'
  options = {
    :wait => 2,
    :verbose => false,
  }
  optparse = OptionParser.new { |opts|
    opts.banner = <<-"USAGE"
#{$0}: Detect common problems on OpenShift systems
Usage: #{$0} [switches] [test methods to run]
Example: #{$0}
Example: #{$0} -v -w 1 test_nodes_public_hostname
Switches:
    USAGE
    opts.on('-v','--verbose', 'Print verbose statements') { |verbose| options[:verbose] = verbose }
    opts.on('-w','--wait seconds', Float,
            'Seconds for broker to wait for node responses (default 2)') { |wait| options[:wait] = wait }
    opts.on('-o','--abortok', 'Continue tests even when an abort is thrown by a test') { |abortok| options[:abortok] = abortok }
    opts.on('-h','--help', 'Print usage') { puts opts; exit 0 }
  }
  begin
    optparse.parse!
  rescue OptionParser::ParseError => e
    # ParseError is the common parent of InvalidArgument, InvalidOption,
    # MissingArgument etc., so every bad command line gets the usage text
    # instead of a stack trace.
    puts "\n ##### #{e.message} #####"
    puts optparse.to_s
    puts "\n ##### #{e.message} #####"
    puts
    exit 1
  end
  # whatever is left after switch parsing names the tests to run (may be empty)
  options[:tests] = ARGV
  #
  # execute
  #
  o = OODiag.new(options)
  o.run_setup
  warns, errors = o.run_tests
  #
  # summarize
  #
  o.wputs "#{warns} WARNINGS" if warns > 0
  if errors > 0
    o.eputs "#{errors} ERRORS"
  else
    puts "NO ERRORS"
  end
  # Exit status is the error count, capped at 255: statuses are reduced modulo
  # 256 by the OS, so e.g. 256 errors would otherwise look like success.
  exit [errors, 255].min
end