diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..fc344db28 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.build +dependencies-stamp +mongodb_exporter diff --git a/Dockerfile b/Dockerfile index c9da6214c..8c979e72e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER David Cuadrado EXPOSE 9001 ENV GOPATH /go -ENV APPPATH $GOPATH/src/github.com/dcu/mongodb_exporter +ENV APPPATH $GOPATH/src/github.com/Percona-Lab/prometheus_mongodb_exporter COPY . $APPPATH RUN apk add --update -t build-deps go git mercurial libc-dev gcc libgcc \ && cd $APPPATH && go get -d && go build -o /bin/mongodb_exporter \ diff --git a/README.md b/README.md index 30bef83ec..6dd6be185 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,45 @@ # Mongodb Exporter -Based on MongoDB exporter for prometheus.io, written in go (https://github.com/dcu/mongodb_exporter), but forked for full sharded support and structure changes. +Based on MongoDB exporter for prometheus.io, written in go (https://github.com/dcu/mongodb_exporter), but forked for full sharded support and structure changes. -## Building +### Features + +- MongoDB Server Status metrics (*cursors, operations, indexes, storage, etc*) +- MongoDB Replica Set metrics (*members, ping, replication lag, etc*) +- MongoDB Replication Oplog metrics (*size, length in time, etc*) +- MongoDB Sharding metrics (*shards, chunks, db/collections, balancer operations*) +- MongoDB WiredTiger storage-engine metrics (*currently in beta/experimental state*) + +### Building export GO_VERSION=1.5.1 # if you wish to use your system version make +### Usage + +The exporter can be started by running the '*mongodb_exporter*' binary that is created in the build step. The exporter will try to connect to '*mongodb://localhost:27017*' (no auth) as default if no options are supplied. 
+ +It is recommended to define the following options: + +- **-mongodb.uri** - The URI of the MongoDB port (*default: mongodb://localhost:27017*) +- **-auth.user** - The optional auth username (*default: none*) +- **-auth.pass** - The optional auth password (*default: none*) +- **-web.listen-address** - The listen address of the exporter (*default: ":9001"*) +- **-log_dir** - The directory to write the log file (*default: /tmp*) + +*For more options see the help page with '-h' or '--help'* -## Note about how this works +### Note about how this works Point the process to any mongo port and it will detect if it is a mongos, replicaset member, or stand alone mongod and return the appropriate metrics for that type of node. This was done to preent the need to an exporter per type of process. -## Roadmap +### Roadmap - Document more configurations options here +- Stabilize WiredTiger support (*currently beta/experimental*) +- Add support for PerconaFT and RocksDB storage engines +- Write more go tests +### Contact +- Tim Vaillancourt - +- David Murphy - diff --git a/collector/mongod/asserts_test.go b/collector/mongod/asserts_test.go deleted file mode 100644 index 18f330c92..000000000 --- a/collector/mongod/asserts_test.go +++ /dev/null @@ -1,17 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_AssertsCollectData(t *testing.T) { - asserts := &AssertsStats{ - Regular: 1, - Warning: 2, - Msg: 3, - User: 4, - Rollovers: 5, - } - - asserts.Export() -} diff --git a/collector/mongod/background_flushing_test.go b/collector/mongod/background_flushing_test.go deleted file mode 100644 index 67f27c2d8..000000000 --- a/collector/mongod/background_flushing_test.go +++ /dev/null @@ -1,16 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_BackgroundFlushingCollectData(t *testing.T) { - stats := &FlushStats{ - Flushes: 1, - TotalMs: 2, - AverageMs: 3, - LastMs: 4, - } - - stats.Export() -} diff --git a/collector/mongod/connections_test.go 
b/collector/mongod/connections_test.go deleted file mode 100644 index 9bb231683..000000000 --- a/collector/mongod/connections_test.go +++ /dev/null @@ -1,15 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_ConnectionsCollectData(t *testing.T) { - stats := &ConnectionStats{ - Current: 1, - Available: 2, - TotalCreated: 3, - } - - stats.Export() -} diff --git a/collector/mongod/cursors_test.go b/collector/mongod/cursors_test.go deleted file mode 100644 index 6f0c06148..000000000 --- a/collector/mongod/cursors_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_CursorsCollectData(t *testing.T) { - cursors := &Cursors{} - - cursors.Export() -} diff --git a/collector/mongod/durability_test.go b/collector/mongod/durability_test.go deleted file mode 100644 index fb92e0b36..000000000 --- a/collector/mongod/durability_test.go +++ /dev/null @@ -1,13 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_DurabilityCollectData(t *testing.T) { - stats := &DurStats{ - TimeMs: DurTiming{}, - } - - stats.Export() -} diff --git a/collector/mongod/extra_info_test.go b/collector/mongod/extra_info_test.go deleted file mode 100644 index d8bcbf335..000000000 --- a/collector/mongod/extra_info_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_ExtraInfoCollectData(t *testing.T) { - stats := &ExtraInfo{} - - stats.Export() -} diff --git a/collector/mongod/global_lock_test.go b/collector/mongod/global_lock_test.go deleted file mode 100644 index d5533307b..000000000 --- a/collector/mongod/global_lock_test.go +++ /dev/null @@ -1,14 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_GlobalLockCollectData(t *testing.T) { - stats := &GlobalLockStats{ - CurrentQueue: &QueueStats{}, - ActiveClients: &ClientStats{}, - } - - stats.Export() -} diff --git a/collector/mongod/index_counters.go b/collector/mongod/index_counters.go 
index f3b1ec33b..05b621ea3 100644 --- a/collector/mongod/index_counters.go +++ b/collector/mongod/index_counters.go @@ -23,7 +23,7 @@ var ( //IndexCounterStats index counter stats type IndexCounterStats struct { - Accesses float64 `bson:"accesses` + Accesses float64 `bson:"accesses"` Hits float64 `bson:"hits"` Misses float64 `bson:"misses"` Resets float64 `bson:"resets"` diff --git a/collector/mongod/index_counters_test.go b/collector/mongod/index_counters_test.go deleted file mode 100644 index a14567737..000000000 --- a/collector/mongod/index_counters_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_IndexCountersCollectData(t *testing.T) { - stats := &IndexCounterStats{} - - stats.Export() -} diff --git a/collector/mongod/locks_test.go b/collector/mongod/locks_test.go deleted file mode 100644 index 4464671aa..000000000 --- a/collector/mongod/locks_test.go +++ /dev/null @@ -1,16 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_LocksCollectData(t *testing.T) { - stats := &LockStatsMap{ - ".": LockStats{ - TimeLockedMicros: ReadWriteLockTimes{}, - TimeAcquiringMicros: ReadWriteLockTimes{}, - }, - } - - stats.Export() -} diff --git a/collector/mongod/memory_test.go b/collector/mongod/memory_test.go deleted file mode 100644 index ffaf0a119..000000000 --- a/collector/mongod/memory_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_MemoryCollectData(t *testing.T) { - stats := &MemStats{} - - stats.Export() -} diff --git a/collector/mongod/metrics.go b/collector/mongod/metrics.go index 20587c50f..893521d0c 100644 --- a/collector/mongod/metrics.go +++ b/collector/mongod/metrics.go @@ -397,7 +397,7 @@ type CursorStats struct { // Export exports the cursor stats. 
func (cursorStats *CursorStats) Export(ch chan<- prometheus.Metric) { metricsCursorTimedOutTotal.Set(cursorStats.TimedOut) - metricsCursorOpen.WithLabelValues("noTimeout").Set(cursorStats.Open.NoTimeout) + metricsCursorOpen.WithLabelValues("timed_out").Set(cursorStats.Open.NoTimeout) metricsCursorOpen.WithLabelValues("pinned").Set(cursorStats.Open.Pinned) metricsCursorOpen.WithLabelValues("total").Set(cursorStats.Open.Total) } diff --git a/collector/mongod/metrics_test.go b/collector/mongod/metrics_test.go deleted file mode 100644 index 921971234..000000000 --- a/collector/mongod/metrics_test.go +++ /dev/null @@ -1,33 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_MetricsCollectData(t *testing.T) { - stats := &MetricsStats{ - Document: &DocumentStats{}, - GetLastError: &GetLastErrorStats{ - Wtime: &BenchmarkStats{}, - }, - Operation: &OperationStats{}, - QueryExecutor: &QueryExecutorStats{}, - Record: &RecordStats{}, - Repl: &ReplStats{ - Apply: &ApplyStats{ - Batches: &BenchmarkStats{}, - }, - Buffer: &BufferStats{}, - Network: &MetricsNetworkStats{ - GetMores: &BenchmarkStats{}, - }, - PreloadStats: &PreloadStats{ - Docs: &BenchmarkStats{}, - Indexes: &BenchmarkStats{}, - }, - }, - Storage: &StorageStats{}, - } - - stats.Export() -} diff --git a/collector/mongod/network_test.go b/collector/mongod/network_test.go deleted file mode 100644 index a98bbd2a8..000000000 --- a/collector/mongod/network_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_NetworkCollectData(t *testing.T) { - stats := &NetworkStats{} - - stats.Export() -} diff --git a/collector/mongod/op_counters_test.go b/collector/mongod/op_counters_test.go deleted file mode 100644 index 53f6bbd11..000000000 --- a/collector/mongod/op_counters_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongod - -import ( - "testing" -) - -func Test_OpCountersCollectData(t *testing.T) { - stats := &OpcountersStats{} - - 
stats.Export() -} diff --git a/collector/mongod/oplog_status.go b/collector/mongod/oplog_status.go index 9f9a05707..2a11ddea0 100644 --- a/collector/mongod/oplog_status.go +++ b/collector/mongod/oplog_status.go @@ -1,102 +1,116 @@ package collector_mongod import ( - "time" - "github.com/golang/glog" - "github.com/prometheus/client_golang/prometheus" - "gopkg.in/mgo.v2" - "gopkg.in/mgo.v2/bson" + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" + "gopkg.in/mgo.v2" + "gopkg.in/mgo.v2/bson" ) var ( - oplogStatusLengthSec = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "oplog", - Name: "length_sec", - Help: "Length of oplog in seconds from head to tail", - }) - oplogStatusLengthSecNow = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "oplog", - Name: "length_sec_now", - Help: "Length of oplog in seconds from now to tail", - }) - oplogStatusSizeMB = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "oplog", - Name: "size_mb", - Help: "Size of oplog in megabytes", - }) + oplogStatusCount = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset_oplog", + Name: "items_total", + Help: "The total number of changes in the oplog", + }) + oplogStatusHeadTimestamp = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset_oplog", + Name: "head_timestamp", + Help: "The timestamp of the newest change in the oplog", + }) + oplogStatusTailTimestamp = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset_oplog", + Name: "tail_timestamp", + Help: "The timestamp of the oldest change in the oplog", + }) + oplogStatusSizeBytes = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset_oplog", + Name: "size_bytes", + Help: "Size of oplog in bytes", + }, []string{"type"}) ) -func GetCollectionSizeMB(db string, collection string, session 
*mgo.Session) (float64) { - var collStats map[string]interface{} - err := session.DB(db).Run(bson.D{{"collStats", collection }}, &collStats) - if err != nil { - glog.Error("Error getting collection stats!") - } - - var result float64 = -1 - if collStats["size"] != nil { - size := collStats["size"].(int) - result = float64(size)/1024/1024 - } - - return result +type OplogCollectionStats struct { + Count float64 `bson:"count"` + Size float64 `bson:"size"` + StorageSize float64 `bson:"storageSize"` } -func GetOplogSizeMB(session *mgo.Session) (float64) { - return GetCollectionSizeMB("local", "oplog.rs", session) +type OplogTimestamps struct { + Tail float64 + Head float64 } -func ParseBsonMongoTsToUnix(timestamp bson.MongoTimestamp) (int64) { - return int64(timestamp >> 32) +type OplogStatus struct { + OplogTimestamps *OplogTimestamps + CollectionStats *OplogCollectionStats } -type OplogStatsData struct { - MinTime bson.MongoTimestamp `bson:"min"` - MaxTime bson.MongoTimestamp `bson:"max"` +// there's gotta be a better way to do this, but it works for now :/ +func BsonMongoTimestampToUnix(timestamp bson.MongoTimestamp) float64 { + return float64(timestamp >> 32) } -func GetOplogLengthSecs(session *mgo.Session) (float64, float64) { - results := &OplogStatsData{} - group := bson.M{ "_id" : 1, "min" : bson.M{ "$min" : "$ts" }, "max" : bson.M{ "$max" : "$ts" } } - err := session.DB("local").C("oplog.rs").Pipe([]bson.M{{ "$group" : group }}).One(&results) - if err != nil { - glog.Error("Could not get the oplog time min/max!") - return -1, -1 - } - - minTime := ParseBsonMongoTsToUnix(results.MinTime) - maxTime := ParseBsonMongoTsToUnix(results.MaxTime) - - now := time.Now().Unix() - lengthSeconds := maxTime - minTime - lengthSecondsNow := now - minTime - - return float64(lengthSeconds), float64(lengthSecondsNow) +func GetOplogTimestamps(session *mgo.Session) (*OplogTimestamps, error) { + oplogTimestamps := &OplogTimestamps{} + result := struct { + TailTimestamp 
bson.MongoTimestamp `bson:"tail"` + HeadTimestamp bson.MongoTimestamp `bson:"head"` + }{} + group := bson.M{ "_id" : 1, "tail" : bson.M{ "$min" : "$ts" }, "head" : bson.M{ "$max" : "$ts" } } + err := session.DB("local").C("oplog.rs").Pipe([]bson.M{{ "$group" : group }}).One(&result) + if err != nil { + return oplogTimestamps, err + } + + oplogTimestamps.Tail = BsonMongoTimestampToUnix(result.TailTimestamp) + oplogTimestamps.Head = BsonMongoTimestampToUnix(result.HeadTimestamp) + return oplogTimestamps, err } -type OplogStats struct { - LengthSec float64 - LengthSecNow float64 - SizeMB float64 +func GetOplogCollectionStats(session *mgo.Session) (*OplogCollectionStats, error) { + results := &OplogCollectionStats{} + err := session.DB("local").Run(bson.M{ "collStats" : "oplog.rs" }, &results) + return results, err } -func (status *OplogStats) Export(ch chan<- prometheus.Metric) { - oplogStatusLengthSec.Set(status.LengthSec) - oplogStatusLengthSecNow.Set(status.LengthSecNow) - oplogStatusSizeMB.Set(status.SizeMB) - oplogStatusLengthSec.Collect(ch) - oplogStatusLengthSecNow.Collect(ch) - oplogStatusSizeMB.Collect(ch) +func (status *OplogStatus) Export(ch chan<- prometheus.Metric) { + oplogStatusSizeBytes.WithLabelValues("current").Set(0) + oplogStatusSizeBytes.WithLabelValues("storage").Set(0) + if status.CollectionStats != nil { + oplogStatusCount.Set(status.CollectionStats.Count) + oplogStatusSizeBytes.WithLabelValues("current").Set(status.CollectionStats.Size) + oplogStatusSizeBytes.WithLabelValues("storage").Set(status.CollectionStats.StorageSize) + } + if status.OplogTimestamps != nil { + oplogStatusHeadTimestamp.Set(status.OplogTimestamps.Head) + oplogStatusTailTimestamp.Set(status.OplogTimestamps.Tail) + } + + oplogStatusCount.Collect(ch) + oplogStatusHeadTimestamp.Collect(ch) + oplogStatusTailTimestamp.Collect(ch) + oplogStatusSizeBytes.Collect(ch) } -func GetOplogStatus(session *mgo.Session) *OplogStats { - results := &OplogStats{} +func (status *OplogStatus) 
Describe(ch chan<- *prometheus.Desc) { + oplogStatusCount.Describe(ch) + oplogStatusHeadTimestamp.Describe(ch) + oplogStatusTailTimestamp.Describe(ch) + oplogStatusSizeBytes.Describe(ch) +} - results.LengthSec, results.LengthSecNow = GetOplogLengthSecs(session) - results.SizeMB = GetOplogSizeMB(session) +func GetOplogStatus(session *mgo.Session) *OplogStatus { + collectionStats, err := GetOplogCollectionStats(session) + oplogTimestamps, err := GetOplogTimestamps(session) + if err != nil { + glog.Error("Failed to get oplog status.") + return nil + } - return results + return &OplogStatus{CollectionStats:collectionStats,OplogTimestamps:oplogTimestamps} } diff --git a/collector/mongod/replset_status.go b/collector/mongod/replset_status.go index 83651b4e0..80f90e4ab 100644 --- a/collector/mongod/replset_status.go +++ b/collector/mongod/replset_status.go @@ -1,232 +1,223 @@ package collector_mongod import ( - "time" - "github.com/golang/glog" - "github.com/prometheus/client_golang/prometheus" - "gopkg.in/mgo.v2" - "gopkg.in/mgo.v2/bson" + "time" + + "gopkg.in/mgo.v2" + "gopkg.in/mgo.v2/bson" + + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" ) var ( - replSetLastElection = prometheus.NewCounterVec(prometheus.CounterOpts{ - Namespace: Namespace, - Name: "replset_last", - Help: "Last event times for replica set", - }, []string{"event"}) - replSetTotalMembers = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "replset", - Name: "members", - Help: "Number of members in replica set", - }) - replSetTotalMembersWithData = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "replset", - Name: "members_w_data", - Help: "Number of members in replica set with data", - }) - replSetMyLagMs = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "replset", - Name: "my_lag_ms", - Help: "Lag in milliseconds in reference to replica set Primary node", - }) - 
replSetMaxNode2NodePingMs = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "replset", - Name: "max_n2n_ping_ms", - Help: "Maximum ping in milliseconds to other replica set members", - }) + myState = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "my_state", + Help: "An integer between 0 and 10 that represents the replica state of the current member", + }, []string{"set"}) + term = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "term", + Help: "The election count for the replica set, as known to this replica set member", + }, []string{"set"}) + numberOfMembers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "number_of_members", + Help: "The number of replica set mebers", + }, []string{"set"}) + heartbeatIntervalMillis = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "heatbeat_interval_millis", + Help: "The frequency in milliseconds of the heartbeats", + }, []string{"set"}) + memberHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_health", + Help: "This field conveys if the member is up (1) or down (0).", + }, []string{"set", "name", "state"}) + memberState = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_state", + Help: "The value of state is an integer between 0 and 10 that represents the replica state of the member.", + }, []string{"set", "name", "state"}) + memberUptime = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_uptime", + Help: "The uptime field holds a value that reflects the number of seconds that this member has been online.", + }, []string{"set", "name", "state"}) + memberOptimeDate = 
prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_optime_date", + Help: "The last entry from the oplog that this member applied.", + }, []string{"set", "name", "state"}) + memberElectionDate = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_election_date", + Help: "The timestamp the node was elected as replica leader", + }, []string{"set", "name", "state"}) + memberLastHeartbeat = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_last_heartbeat", + Help: "The lastHeartbeat value provides an ISODate formatted date and time of the transmission time of last heartbeat received from this member", + }, []string{"set", "name", "state"}) + memberLastHeartbeatRecv = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_last_heartbeat_recv", + Help: "The lastHeartbeatRecv value provides an ISODate formatted date and time that the last heartbeat was received from this member", + }, []string{"set", "name", "state"}) + memberPingMs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_ping_ms", + Help: "The pingMs represents the number of milliseconds (ms) that a round-trip packet takes to travel between the remote member and the local instance.", + }, []string{"set", "name", "state"}) + memberConfigVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_config_version", + Help: "The configVersion value is the replica set configuration version.", + }, []string{"set", "name", "state"}) + memberOptime = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "replset", + Name: "member_optime", + Help: "Information regarding the last operation from the operation log that this member has applied.", + }, 
[]string{"set", "name", "state"}) ) -type ReplicaSetMemberStatus struct { - Id int64 `bson:"_id"` - ConfigVersion int64 `bson:"configVersion"` - Health int64 `bson:"health"` - Name string `bson:"name"` - State int64 `bson:"state"` - StateStr string `bson:"stateStr"` - Uptime int64 `bson:"uptime"` - Optime int64 `bson:"optime"` - OptimeDate time.Time `bson:"optimeDate"` - LastHeartbeat time.Time `bson:"lastHeartbeat"` - LastHeartbeatRecv time.Time `bson:"lastHeartbeatRecv"` - ElectionTime int64 `bson:"electionTime"` - ElectionDate time.Time `bson:"electionDate"` - PingMs float64 `bson:"pingMs"` - SyncingTo string `bson:"syncingTo"` - Self bool `bson:"self"` -} - -type ReplicaSetStatus struct { - Name string `bson:"set"` - Date time.Time `bson:"date"` - MyState int `bson:"myState"` - Ok int `bson:"ok"` - Members []ReplicaSetMemberStatus `bson:"members"` +// ReplSetStatus keeps the data returned by the GetReplSetStatus method +type ReplSetStatus struct { + Set string `bson:"set"` + Date time.Time `bson:"date"` + MyState int32 `bson:"myState"` + Term *int32 `bson:"term,omitempty"` + HeartbeatIntervalMillis *float64 `bson:"heartbeatIntervalMillis,omitempty"` + Members []Member `bson:"members"` } -type ReplicaSetStatusSummary struct { - Members float64 - MembersWithData float64 - LagMs float64 - MaxNode2NodePingMs float64 - LastElection float64 +// Member represents an array element of ReplSetStatus.Members +type Member struct { + Name string `bson:"name"` + Self *bool `bson:"self,omitempty"` + Health *int32 `bson:"health,omitempty"` + State int32 `bson:"state"` + StateStr string `bson:"stateStr"` + Uptime float64 `bson:"uptime"` + Optime interface{} `bson:"optime"` + OptimeDate time.Time `bson:"optimeDate"` + ElectionTime *time.Time `bson:"electionTime,omitempty"` + ElectionDate *time.Time `bson:"electionDate,omitempty"` + LastHeartbeat *time.Time `bson:"lastHeartbeat,omitempty"` + LastHeartbeatRecv *time.Time `bson:"lastHeartbeatRecv,omitempty"` + LastHeartbeatMessage 
*string `bson:"lastHeartbeatMessage,omitempty"` + PingMs *float64 `bson:"pingMs,omitempty"` + SyncingTo *string `bson:"syncingTo,omitempty"` + ConfigVersion *int32 `bson:"configVersion,omitempty"` } -func GetReplSetStatusData(session *mgo.Session) (*ReplicaSetStatus, error) { - replSetStatus := &ReplicaSetStatus{} - - err := session.DB("admin").Run(bson.D{{ "replSetGetStatus", 1 }}, &replSetStatus) +// Export exports the replSetGetStatus stati to be consumed by prometheus +func (replStatus *ReplSetStatus) Export(ch chan<- prometheus.Metric) { + myState.WithLabelValues(replStatus.Set).Set(float64(replStatus.MyState)) + + // new in version 3.2 + if replStatus.Term != nil { + term.WithLabelValues(replStatus.Set).Set(float64(*replStatus.Term)) + } + numberOfMembers.WithLabelValues(replStatus.Set).Set(float64(len(replStatus.Members))) + + // new in version 3.2 + if replStatus.HeartbeatIntervalMillis != nil { + heartbeatIntervalMillis.WithLabelValues(replStatus.Set).Set(*replStatus.HeartbeatIntervalMillis) + } + + for _, member := range replStatus.Members { + ls := prometheus.Labels{ + "set": replStatus.Set, + "name": member.Name, + "state": member.StateStr, + } + + memberState.With(ls).Set(float64(member.State)) + + // ReplSetStatus.Member.Health is not available on the node you're connected to + if member.Health != nil { + memberHealth.With(ls).Set(float64(*member.Health)) + } + + memberUptime.With(ls).Set(member.Uptime) + + memberOptimeDate.With(ls).Set(float64(member.OptimeDate.Unix())) + + // ReplSetGetStatus.Member.ElectionTime is only available on the PRIMARY + if member.ElectionDate != nil { + memberElectionDate.With(ls).Set(float64((*member.ElectionDate).Unix())) + } + if member.LastHeartbeat != nil { + memberLastHeartbeat.With(ls).Set(float64((*member.LastHeartbeat).Unix())) + } + if member.LastHeartbeatRecv != nil { + memberLastHeartbeatRecv.With(ls).Set(float64((*member.LastHeartbeatRecv).Unix())) + } + if member.PingMs != nil { + 
memberPingMs.With(ls).Set(*member.PingMs) + } + if member.ConfigVersion != nil { + memberConfigVersion.With(ls).Set(float64(*member.ConfigVersion)) + } + } + // collect metrics + myState.Collect(ch) + term.Collect(ch) + numberOfMembers.Collect(ch) + heartbeatIntervalMillis.Collect(ch) + memberState.Collect(ch) + memberHealth.Collect(ch) + memberUptime.Collect(ch) + memberOptimeDate.Collect(ch) + memberElectionDate.Collect(ch) + memberLastHeartbeat.Collect(ch) + memberLastHeartbeatRecv.Collect(ch) + memberPingMs.Collect(ch) + memberConfigVersion.Collect(ch) - return replSetStatus, err } -func GetReplSetSelf(status *ReplicaSetStatus) *ReplicaSetMemberStatus { - result := &ReplicaSetMemberStatus{} - - for _, member := range status.Members { - if member.Self == true { - result = &member - break - } - } +// Describe describes the replSetGetStatus metrics for prometheus +func (replStatus *ReplSetStatus) Describe(ch chan<- *prometheus.Desc) { + myState.Describe(ch) + term.Describe(ch) + numberOfMembers.Describe(ch) + heartbeatIntervalMillis.Describe(ch) + memberState.Describe(ch) + memberHealth.Describe(ch) + memberUptime.Describe(ch) + memberOptimeDate.Describe(ch) + memberElectionDate.Describe(ch) + memberLastHeartbeatRecv.Describe(ch) + memberPingMs.Describe(ch) + memberConfigVersion.Describe(ch) - return result } -func GetReplSetPrimary(status *ReplicaSetStatus) *ReplicaSetMemberStatus { - result := &ReplicaSetMemberStatus{} - - for _, member := range status.Members { - if member.State == 1 { - result = &member - break - } - } - - return result -} - -func GetReplSetMemberByName(status *ReplicaSetStatus, name string) *ReplicaSetMemberStatus { - result := &ReplicaSetMemberStatus{} - - for _, member := range status.Members { - if member.Name == name { - result = &member - break - } - } - - return result -} - -func GetReplSetSyncingTo(status *ReplicaSetStatus) *ReplicaSetMemberStatus { - myInfo := GetReplSetSelf(status) - if len(myInfo.SyncingTo) > 0 { - return 
GetReplSetMemberByName(status, myInfo.SyncingTo) - } else { - return GetReplSetPrimary(status) - } -} - -func GetReplSetMemberCount(status *ReplicaSetStatus) (float64) { - var result float64 = 0 - - if status.Members != nil { - result = float64(len(status.Members)) - } - - return result -} - -func GetReplSetMembersWithDataCount(status *ReplicaSetStatus) (float64) { - var membersWithDataCount int = 0 - - if status.Members != nil { - for _, member := range status.Members { - if member.Health == 1 { - if member.State == 1 || member.State == 2 { - membersWithDataCount = membersWithDataCount + 1 - } - } - } - } - - return float64(membersWithDataCount) -} - -func GetReplSetMaxNode2NodePingMs(status *ReplicaSetStatus) (float64) { - var maxNodePingMs float64 = -1 - - for _, member := range status.Members { - if &member.PingMs != nil { - if member.PingMs > maxNodePingMs { - maxNodePingMs = member.PingMs - } - } - } - - return maxNodePingMs -} - -func GetReplSetLagMs(status *ReplicaSetStatus) (float64) { - memberInfo := GetReplSetSelf(status) - - // short-circuit the check if you're the Primary - if memberInfo.State == 1 { - return 0 - } - - var result float64 = -1 - optimeNanoSelf := memberInfo.OptimeDate.UnixNano() - replSetStatusPrimary := GetReplSetSyncingTo(status) - if &replSetStatusPrimary.OptimeDate != nil { - optimeNanoPrimary := replSetStatusPrimary.OptimeDate.UnixNano() - result = float64(optimeNanoPrimary - optimeNanoSelf)/1000000 - } - - return result -} - -func GetReplSetLastElectionUnixTime(status *ReplicaSetStatus) (float64) { - replSetPrimary := GetReplSetPrimary(status) - - var result float64 = -1 - if &replSetPrimary.ElectionDate != nil { - result = float64(replSetPrimary.ElectionDate.Unix()) - } - - return result -} - -func(summary *ReplicaSetStatusSummary) Export(ch chan<- prometheus.Metric) { - replSetTotalMembers.Set(summary.Members) - replSetTotalMembersWithData.Set(summary.MembersWithData) - replSetMyLagMs.Set(summary.LagMs) - 
replSetMaxNode2NodePingMs.Set(summary.MaxNode2NodePingMs) - replSetLastElection.WithLabelValues("election").Set(summary.LastElection) - - replSetTotalMembers.Collect(ch) - replSetTotalMembersWithData.Collect(ch) - replSetMyLagMs.Collect(ch) - replSetMaxNode2NodePingMs.Collect(ch) - replSetLastElection.Collect(ch) -} - -func GetReplSetStatus(session *mgo.Session) *ReplicaSetStatusSummary { - status, err := GetReplSetStatusData(session) - if err != nil { - glog.Error("Could not get replset status!") - } - - summary := &ReplicaSetStatusSummary{} - summary.Members = GetReplSetMemberCount(status) - summary.MembersWithData = GetReplSetMembersWithDataCount(status) - summary.LastElection = GetReplSetLastElectionUnixTime(status) - summary.LagMs = GetReplSetLagMs(status) - summary.MaxNode2NodePingMs = GetReplSetMaxNode2NodePingMs(status) - - return summary +// GetReplSetStatus returns the replica status info +func GetReplSetStatus(session *mgo.Session) *ReplSetStatus { + result := &ReplSetStatus{} + err := session.DB("admin").Run(bson.D{{"replSetGetStatus", 1}}, result) + if err != nil { + glog.Error("Failed to get replSet status.") + return nil + } + return result } diff --git a/collector/mongod/server_status.go b/collector/mongod/server_status.go index a39ee1c77..29dcc1906 100644 --- a/collector/mongod/server_status.go +++ b/collector/mongod/server_status.go @@ -60,6 +60,8 @@ type ServerStatus struct { Metrics *MetricsStats `bson:"metrics"` Cursors *Cursors `bson:"cursors"` + + WiredTiger *WiredTigerStats `bson:"wiredTiger"` } // Export exports the server status to be consumed by prometheus. 
@@ -67,6 +69,9 @@ func (status *ServerStatus) Export(ch chan<- prometheus.Metric) { instanceUptimeSeconds.Set(status.Uptime) instanceUptimeEstimateSeconds.Set(status.Uptime) instanceLocalTime.Set(float64(status.LocalTime.Unix())) + instanceUptimeSeconds.Collect(ch) + instanceUptimeEstimateSeconds.Collect(ch) + instanceLocalTime.Collect(ch) if status.Asserts != nil { status.Asserts.Export(ch) @@ -110,6 +115,9 @@ func (status *ServerStatus) Export(ch chan<- prometheus.Metric) { if status.Cursors != nil { status.Cursors.Export(ch) } + if status.WiredTiger != nil { + status.WiredTiger.Export(ch) + } } // Describe describes the server status for prometheus. @@ -160,15 +168,14 @@ func (status *ServerStatus) Describe(ch chan<- *prometheus.Desc) { if status.Cursors != nil { status.Cursors.Describe(ch) } + if status.WiredTiger != nil { + status.WiredTiger.Describe(ch) + } } // GetServerStatus returns the server status info. func GetServerStatus(session *mgo.Session) *ServerStatus { result := &ServerStatus{} - - session.SetMode(mgo.Eventual, true) - session.SetSocketTimeout(0) - err := session.DB("admin").Run(bson.D{{"serverStatus", 1}, {"recordStats", 0}}, result) if err != nil { glog.Error("Failed to get server status.") diff --git a/collector/mongod/server_status_test.go b/collector/mongod/server_status_test.go index a4d50de62..d4094c1a9 100644 --- a/collector/mongod/server_status_test.go +++ b/collector/mongod/server_status_test.go @@ -1,17 +1,10 @@ package collector_mongod import ( - "gopkg.in/mgo.v2/bson" "testing" -) -func Test_ServerStatusCollectData(t *testing.T) { - data := LoadFixture("server_status.bson") - serverStatus := &ServerStatus{} - loadServerStatusFromBson(data, serverStatus) - - serverStatus.Export() -} + "gopkg.in/mgo.v2/bson" +) func Test_ParserServerStatus(t *testing.T) { data := LoadFixture("server_status.bson") diff --git a/collector/mongod/wiredtiger.go b/collector/mongod/wiredtiger.go new file mode 100644 index 000000000..76187d7ff --- /dev/null 
+++ b/collector/mongod/wiredtiger.go @@ -0,0 +1,403 @@ +package collector_mongod + +import( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + wtBlockManagerBlocksTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_blockmanager", + Name: "blocks_total", + Help: "TBD", + }, []string{"type"}) + wtBlockManagerBytesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_blockmanager", + Name: "bytes_total", + Help: "TBD", + }, []string{"type"}) +) + +var ( + wtCachePagesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_cache", + Name: "pages_total", + Help: "TBD", + }, []string{"type"}) + wtCacheBytesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_cache", + Name: "bytes_total", + Help: "TBD", + }, []string{"type"}) + wtCacheEvictedTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_cache", + Name: "evicted_total", + Help: "TBD", + }, []string{"type"}) + wtCacheCurPages = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_cache", + Name: "current_pages", + Help: "TBD", + }) + wtCacheBytesCached = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_cache", + Name: "bytes_cached", + Help: "TBD", + }) + wtCacheBytesMax = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_cache", + Name: "bytes_max", + Help: "TBD", + }) + wtCachePercentOverhead = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_cache", + Name: "percent_overhead", + Help: "TBD", + }) +) + +var( + wtTransactionsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_transactions", + Name: "total", + Help: "TBD", + }, 
[]string{"type"}) + wtTransactionsTotalCheckpointMs = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_transactions", + Name: "total_chkp_ms", + Help: "TBD", + }) + wtTransactionsCheckpointsRunning = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_transactions", + Name: "chkp_running", + Help: "TBD", + }) +) + +var( + wtConcurrentTransactionsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_concur_transactions", + Name: "out", + Help: "TBD", + }, []string{"type"}) + wtConcurrentTransactionsAvailable = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_concur_transactions", + Name: "available", + Help: "TBD", + }, []string{"type"}) + wtConcurrentTransactionsTotalTickets = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_concur_transactions", + Name: "tickets_total", + Help: "TBD", + }, []string{"type"}) +) + +var( + wtAsyncWorkQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_async", + Name: "work_queue_length", + Help: "TBD", + }) + wtAsyncMaxWorkQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "wiredtiger_async", + Name: "max_work_queue_length", + Help: "TBD", + }) +) + +// async stats +type WTAsyncStats struct { + NumAllocStateRaces float64 `bson:"number of allocation state races"` + NumOpSlotsViewedForAlloc float64 `bson:"number of operation slots viewed for allocation"` + WorkQueueLength float64 `bson:"current work queue length"` + NumFlushCalls float64 `bson:"number of flush calls"` + NumAllocFailed float64 `bson:"number of times operation allocation failed"` + MaxWorkQueueLength float64 `bson:"maximum work queue length"` + NumWorkerNoWork float64 `bson:"number of times worker found no work"` + TotalAlloc float64 `bson:"total allocations"` 
+ TotalCompact float64 `bson:"total compact calls"` + TotalInsert float64 `bson:"total insert calls"` + TotalRemove float64 `bson:"total remove calls"` + TotalSearch float64 `bson:"total search calls"` + TotalUpdate float64 `bson:"total update calls"` +} + +func (stats *WTAsyncStats) Export(ch chan<- prometheus.Metric) { + wtAsyncWorkQueueLength.Set(stats.WorkQueueLength) + wtAsyncMaxWorkQueueLength.Set(stats.MaxWorkQueueLength) +} + +func (stats *WTAsyncStats) Describe(ch chan<- *prometheus.Desc) { + wtAsyncWorkQueueLength.Describe(ch) + wtAsyncMaxWorkQueueLength.Describe(ch) +} + +// blockmanager stats +type WTBlockManagerStats struct { + MappedBytesRead float64 `bson:"mapped bytes read"` + BytesRead float64 `bson:"bytes read"` + BytesWritten float64 `bson:"bytes written"` + MappedBlocksRead float64 `bson:"mapped blocks read"` + BlocksPreLoaded float64 `bson:"blocks pre-loaded"` + BlocksRead float64 `bson:"blocks read"` + BlocksWritten float64 `bson:"blocks written"` +} + +func (stats *WTBlockManagerStats) Export(ch chan<- prometheus.Metric) { + wtBlockManagerBlocksTotal.WithLabelValues("read").Set(stats.BlocksRead) + wtBlockManagerBlocksTotal.WithLabelValues("read_mapped").Set(stats.MappedBlocksRead) + wtBlockManagerBlocksTotal.WithLabelValues("written").Set(stats.BlocksWritten) + wtBlockManagerBytesTotal.WithLabelValues("read").Set(stats.BytesRead) + wtBlockManagerBytesTotal.WithLabelValues("read_mapped").Set(stats.MappedBytesRead) + wtBlockManagerBytesTotal.WithLabelValues("written").Set(stats.BytesWritten) +} + +func (stats *WTBlockManagerStats) Describe(ch chan<- *prometheus.Desc) { + wtBlockManagerBlocksTotal.Describe(ch) + wtBlockManagerBytesTotal.Describe(ch) +} + +// cache stats +type WTCacheStats struct { + BytesCached float64 `bson:"bytes currently in the cache"` + BytesMaximum float64 `bson:"maximum bytes configured"` + BytesReadInto float64 `bson:"bytes read into cache"` + BytesWrittenFrom float64 `bson:"bytes written from cache"` + EvictedUnmodified 
float64 `bson:"unmodified pages evicted"` + EvictedModified float64 `bson:"modified pages evicted"` + PercentOverhead float64 `bson:"percentage overhead"` + PagesTotal float64 `bson:"pages currently held in the cache"` + PagesReadInto float64 `bson:"pages read into cache"` + PagesWrittenFrom float64 `bson:"pages written from cache"` +} + +func (stats *WTCacheStats) Export(ch chan<- prometheus.Metric) { + wtCachePagesTotal.WithLabelValues("read").Set(stats.PagesReadInto) + wtCachePagesTotal.WithLabelValues("written").Set(stats.PagesWrittenFrom) + wtCacheBytesTotal.WithLabelValues("read").Set(stats.BytesReadInto) + wtCacheBytesTotal.WithLabelValues("written").Set(stats.BytesWrittenFrom) + wtCacheEvictedTotal.WithLabelValues("modified").Set(stats.EvictedModified) + wtCacheEvictedTotal.WithLabelValues("unmodified").Set(stats.EvictedUnmodified) + wtCacheCurPages.Set(stats.PagesTotal) + wtCacheBytesCached.Set(stats.BytesCached) + wtCacheBytesMax.Set(stats.BytesMaximum) + wtCachePercentOverhead.Set(stats.PercentOverhead) +} + +func (stats *WTCacheStats) Describe(ch chan<- *prometheus.Desc) { + wtCachePagesTotal.Describe(ch) + wtCacheEvictedTotal.Describe(ch) + wtCacheCurPages.Describe(ch) + wtCacheBytesCached.Describe(ch) + wtCacheBytesMax.Describe(ch) + wtCachePercentOverhead.Describe(ch) +} + +// connection stats +type WTConnectionStats struct { + OpenFiles float64 `bson:"files currently open"` + // the slash in "I/Os" breaks bson's flag parser (fixme) + //TotalReadIOs float64 `bson:"total read I/Os"` + //TotalWriteIOs float64 `bson:"total write I/Os"` +} + +// cursor stats +type WTCursorStats struct { + CreateCalls float64 `bson:"cursor create calls"` + InsertCalls float64 `bson:"cursor insert calls"` + NextCalls float64 `bson:"cursor next calls"` + PrevCalls float64 `bson:"cursor prev calls"` + RemoveCalls float64 `bson:"cursor remove calls"` + ResetCalls float64 `bson:"cursor reset calls"` + SearchCalls float64 `bson:"cursor search calls"` + SearchNearCalls float64 
`bson:"cursor search near calls"` + UpdateCalls float64 `bson:"cursor update calls"` +} + +// log stats +type WTLogStats struct { + TotalBufferSize float64 `bson:"total log buffer size"` + BytesPayloadData float64 `bson:"log bytes of payload data"` + BytesWritten float64 `bson:"log bytes written"` + RecordsUncompressed float64 `bson:"log records not compressed"` + RecordsCompressed float64 `bson:"log records compressed"` + LogFlushes float64 `bson:"log flush operations"` + MaxLogSize float64 `bson:"maximum log file size"` + LogReads float64 `bson:"log read operations"` + LogScansDouble float64 `bson:"log scan records requiring two reads"` + LogScans float64 `bson:"log scan operations"` + LogSyncs float64 `bson:"log sync operations"` + LogSyncDirs float64 `bson:"log sync_dir operations"` + LogWrites float64 `bson:"log write operations"` +} + +// session stats +type WTSessionStats struct { + Cursors float64 `bson:"open cursor count"` + Sessions float64 `bson:"open session count"` +} + +// transaction stats +type WTTransactionStats struct { + Begins float64 `bson:"transaction begins"` + Checkpoints float64 `bson:"transaction checkpoints"` + CheckpointsRunning float64 `bson:"transaction checkpoint currently running"` + CheckpointMaxMs float64 `bson:"transaction checkpoint max time (msecs)"` + CheckpointMinMs float64 `bson:"transaction checkpoint min time (msecs)"` + CheckpointLastMs float64 `bson:"transaction checkpoint most recent time (msecs)"` + CheckpointTotalMs float64 `bson:"transaction checkpoint total time (msecs)"` + Committed float64 `bson:"transactions committed"` + CacheOverflowFailure float64 `bson:"transaction failures due to cache overflow"` + RolledBack float64 `bson:"transactions rolled back"` +} + +func (stats *WTTransactionStats) Export(ch chan<- prometheus.Metric) { + wtTransactionsTotal.WithLabelValues("begins").Set(stats.Begins) + wtTransactionsTotal.WithLabelValues("checkpoints").Set(stats.Checkpoints) + 
wtTransactionsTotal.WithLabelValues("committed").Set(stats.Committed) + wtTransactionsTotal.WithLabelValues("rolledback").Set(stats.RolledBack) + wtTransactionsTotalCheckpointMs.Set(stats.CheckpointTotalMs) + wtTransactionsCheckpointsRunning.Set(stats.CheckpointsRunning) +} + +func (stats *WTTransactionStats) Describe(ch chan<- *prometheus.Desc) { + wtTransactionsTotal.Describe(ch) + wtTransactionsTotalCheckpointMs.Describe(ch) + wtTransactionsCheckpointsRunning.Describe(ch) +} + +// concurrenttransaction stats +type WTConcurrentTransactionsTypeStats struct { + Out float64 `bson:"out"` + Available float64 `bson:"available"` + TotalTickets float64 `bson:"totalTickets"` +} + +type WTConcurrentTransactionsStats struct { + Write *WTConcurrentTransactionsTypeStats `bson:"write"` + Read *WTConcurrentTransactionsTypeStats `bson:"read"` +} + +func (stats *WTConcurrentTransactionsStats) Export(ch chan<- prometheus.Metric) { + wtConcurrentTransactionsOut.WithLabelValues("read").Set(stats.Read.Out) + wtConcurrentTransactionsOut.WithLabelValues("write").Set(stats.Write.Out) + wtConcurrentTransactionsAvailable.WithLabelValues("read").Set(stats.Read.Available) + wtConcurrentTransactionsAvailable.WithLabelValues("write").Set(stats.Write.Available) + wtConcurrentTransactionsTotalTickets.WithLabelValues("read").Set(stats.Read.TotalTickets) + wtConcurrentTransactionsTotalTickets.WithLabelValues("write").Set(stats.Write.TotalTickets) +} + +func (stats *WTConcurrentTransactionsStats) Describe(ch chan<- *prometheus.Desc) { + wtConcurrentTransactionsOut.Describe(ch) + wtConcurrentTransactionsAvailable.Describe(ch) + wtConcurrentTransactionsTotalTickets.Describe(ch) +} + +// WiredTiger stats +type WiredTigerStats struct { + Async *WTAsyncStats `bson:"async"` + BlockManager *WTBlockManagerStats `bson:"block-manager"` + Cache *WTCacheStats `bson:"cache"` + Connection *WTConnectionStats `bson:"connection"` + Cursor *WTCursorStats `bson:"cursor"` + Log *WTLogStats `bson:"log"` + Session 
*WTSessionStats `bson:"session"` + Transaction *WTTransactionStats `bson:"transaction"` + ConcurrentTransactions *WTConcurrentTransactionsStats `bson:"concurrentTransactions"` +} + +func (stats *WiredTigerStats) Describe(ch chan<- *prometheus.Desc) { + if stats.Async != nil { + stats.Async.Describe(ch) + } + if stats.BlockManager != nil { + stats.BlockManager.Describe(ch) + } + if stats.Cache != nil { + stats.Cache.Describe(ch) + } + if stats.Transaction != nil { + stats.Transaction.Describe(ch) + } + if stats.ConcurrentTransactions != nil { + stats.ConcurrentTransactions.Describe(ch) + } + + wtBlockManagerBlocksTotal.Describe(ch) + wtBlockManagerBytesTotal.Describe(ch) + + wtCachePagesTotal.Describe(ch) + wtCacheBytesTotal.Describe(ch) + wtCacheEvictedTotal.Describe(ch) + wtCacheCurPages.Describe(ch) + wtCacheBytesCached.Describe(ch) + wtCacheBytesMax.Describe(ch) + wtCachePercentOverhead.Describe(ch) + + wtTransactionsTotal.Describe(ch) + wtTransactionsTotalCheckpointMs.Describe(ch) + wtTransactionsCheckpointsRunning.Describe(ch) + + wtConcurrentTransactionsOut.Describe(ch) + wtConcurrentTransactionsAvailable.Describe(ch) + wtConcurrentTransactionsTotalTickets.Describe(ch) + + wtAsyncWorkQueueLength.Describe(ch) + wtAsyncMaxWorkQueueLength.Describe(ch) +} + +func (stats *WiredTigerStats) Export(ch chan<- prometheus.Metric) { + if stats.Async != nil { + stats.Async.Export(ch) + } + if stats.BlockManager != nil { + stats.BlockManager.Export(ch) + } + if stats.Cache != nil { + stats.Cache.Export(ch) + } + if stats.Transaction != nil { + stats.Transaction.Export(ch) + } + if stats.ConcurrentTransactions != nil { + stats.ConcurrentTransactions.Export(ch) + } + + wtBlockManagerBlocksTotal.Collect(ch) + wtBlockManagerBytesTotal.Collect(ch) + + wtCachePagesTotal.Collect(ch) + wtCacheBytesTotal.Collect(ch) + wtCacheEvictedTotal.Collect(ch) + wtCacheCurPages.Collect(ch) + wtCacheBytesCached.Collect(ch) + wtCacheBytesMax.Collect(ch) + wtCachePercentOverhead.Collect(ch) + + 
wtTransactionsTotal.Collect(ch) + wtTransactionsTotalCheckpointMs.Collect(ch) + wtTransactionsCheckpointsRunning.Collect(ch) + + wtConcurrentTransactionsOut.Collect(ch) + wtConcurrentTransactionsAvailable.Collect(ch) + wtConcurrentTransactionsTotalTickets.Collect(ch) + + wtAsyncWorkQueueLength.Collect(ch) + wtAsyncMaxWorkQueueLength.Collect(ch) +} diff --git a/collector/mongodb_collector.go b/collector/mongodb_collector.go index 044094636..4f851b958 100644 --- a/collector/mongodb_collector.go +++ b/collector/mongodb_collector.go @@ -1,16 +1,12 @@ package collector import ( - //"github.com/dcu/mongodb_exporter/shared" - "github.com/dcu/mongodb_exporter/collector/mongod" - "github.com/dcu/mongodb_exporter/collector/mongos" - "github.com/dcu/mongodb_exporter/collector/shared" + "github.com/Percona-Lab/prometheus_mongodb_exporter/shared" + "github.com/Percona-Lab/prometheus_mongodb_exporter/collector/mongod" + "github.com/Percona-Lab/prometheus_mongodb_exporter/collector/mongos" "github.com/golang/glog" "github.com/prometheus/client_golang/prometheus" "gopkg.in/mgo.v2" - "fmt" - "os" - "regexp" ) var ( @@ -40,146 +36,80 @@ func NewMongodbCollector(opts MongodbCollectorOpts) *MongodbCollector { // Describe describes all mongodb's metrics. 
func (exporter *MongodbCollector) Describe(ch chan<- *prometheus.Desc) { glog.Info("Describing groups") - session, err := connectMongo(exporter.Opts.URI) - if err != nil{ - return - } - serverStatus := collector_mongos.GetServerStatus(session) - if serverStatus != nil { - serverStatus.Describe(ch) - } + session := shared.MongoSession(exporter.Opts.URI) defer session.Close() -} - -func connectMongo(uri string)(*mgo.Session, error) { - r, _ := regexp.Compile("connect=direct") - if r.MatchString(uri) != true { - r, _ := regexp.Compile("\\?") - if r.MatchString(uri) == true { - uri = uri + "&connect=direct" - } else { - uri = uri + "?connect=direct" - } - } - session, err := mgo.Dial(uri) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %v\n", err) - os.Exit(1) - glog.Errorf("Cannot connect to server using url: %s", uri) - return nil,err - } - - session.SetMode(mgo.Eventual, true) - session.SetSocketTimeout(0) - - err = nil - return session,err -} - -// GetNodeType checks if the connected Session is a mongos, standalone, or replset, -// by looking at the result of calling isMaster. 
-func GetNodeType(session *mgo.Session)(string, error) { - masterDoc := struct { - SetName interface{} `bson:"setName"` - Hosts interface{} `bson:"hosts"` - Msg string `bson:"msg"` - }{} - err := session.Run("isMaster", &masterDoc) - if err != nil { - glog.Info("Got unknown node type\n") - return "unknown", err + if session != nil { + serverStatus := collector_mongos.GetServerStatus(session) + if serverStatus != nil { + serverStatus.Describe(ch) + } } - - if masterDoc.SetName != nil || masterDoc.Hosts != nil { - glog.Info("Got replset node type") - return "replset", nil - } else if masterDoc.Msg == "isdbgrid" { - glog.Info("Got mongos node type\n") - // isdbgrid is always the msg value when calling isMaster on a mongos - // see http://docs.mongodb.org/manual/core/sharded-cluster-query-router/ - return "mongos", nil - } - glog.Info("defaulted to mongod node type\n") - return "mongod", nil } // Collect collects all mongodb's metrics. func (exporter *MongodbCollector) Collect(ch chan<- prometheus.Metric) { - glog.Info("Collecting Server Status from: ", exporter.Opts.URI) - session, err := connectMongo(exporter.Opts.URI) - if err != nil{ - glog.Error(fmt.Printf("We failed to connect to mongo with error of %s\n", err)) - } - - version := collector_shared.GetServerVersion(session) - glog.Info("Connected to: ", exporter.Opts.URI, ", server version: ", version) - - nodeType,err := GetNodeType(session) - if err != nil{ - glog.Error("We run had a node type error of %s\n", err) - } - //glog.Info("Passed nodeType with %s", nodeType) - switch { - case nodeType == "mongos": - serverStatus := collector_mongos.GetServerStatus(session) - if serverStatus != nil { - serverStatus.Export(ch) - } - balancerStatus := collector_mongos.GetBalancerStatus(session) - if balancerStatus != nil { - balancerStatus.Export(ch) - } - balancerTopoStatus := collector_mongos.GetBalancerTopoStatus(session) - if balancerTopoStatus != nil { - balancerTopoStatus.Export(ch) - } - balancerChangelogStatus := 
collector_mongos.GetBalancerChangelogStatus(session) - if balancerChangelogStatus != nil { - balancerChangelogStatus.Export(ch) - } - case nodeType == "mongod": - serverStatus := collector_mongod.GetServerStatus(session) - if serverStatus != nil { - serverStatus.Export(ch) - } - case nodeType == "replset": - serverStatus := collector_mongod.GetServerStatus(session) - if serverStatus != nil { - serverStatus.Export(ch) - } - oplogStatus := collector_mongod.GetOplogStatus(session) - if oplogStatus != nil { - oplogStatus.Export(ch) - } - replSetStatus := collector_mongod.GetReplSetStatus(session) - if replSetStatus != nil { - replSetStatus.Export(ch) - } - default: - glog.Info("No process for current node type no metrics printing!\n") - } - session.Close() - //exporter.collectMongodServerStatus(ch) - //exporter.collectMongosServerStatus(ch) + mongoSess := shared.MongoSession(exporter.Opts.URI) + if mongoSess != nil { + defer mongoSess.Close() + serverVersion, err := shared.MongoSessionServerVersion(mongoSess) + if err != nil { + glog.Errorf("Problem gathering the mongo server version: %s", err) + } + + nodeType, err := shared.MongoSessionNodeType(mongoSess) + if err != nil { + glog.Errorf("Problem gathering the mongo node type: %s", err) + } + + glog.Infof("Connected to: %s (node type: %s, server version: %s)", exporter.Opts.URI, nodeType, serverVersion) + switch { + case nodeType == "mongos": + exporter.collectMongos(mongoSess, ch) + case nodeType == "mongod": + exporter.collectMongod(mongoSess, ch) + case nodeType == "replset": + exporter.collectMongodReplSet(mongoSess, ch) + default: + glog.Infof("Unrecognized node type %s!", nodeType) + } + } } -/** -func (exporter *MongodbCollector) collectMongodServerStatus(ch chan<- prometheus.Metric) *collector_mongod.ServerStatus { - serverStatus := collector_mongod.GetServerStatus(exporter.Opts.URI) +func (exporter *MongodbCollector) collectMongos(session *mgo.Session, ch chan<- prometheus.Metric) { + glog.Info("Collecting 
Server Status") + serverStatus := collector_mongos.GetServerStatus(session) if serverStatus != nil { serverStatus.Export(ch) } - return serverStatus + glog.Info("Collecting Sharding Status") + shardingStatus := collector_mongos.GetShardingStatus(session) + if shardingStatus != nil { + shardingStatus.Export(ch) + } } -func (exporter *MongodbCollector) collectMongosServerStatus(ch chan<- prometheus.Metric) *collector_mongos.ServerStatus { - serverStatus := collector_mongos.GetServerStatus(exporter.Opts.URI) - +func (exporter *MongodbCollector) collectMongod(session *mgo.Session, ch chan<- prometheus.Metric) { + glog.Info("Collecting Server Status") + serverStatus := collector_mongod.GetServerStatus(session) if serverStatus != nil { serverStatus.Export(ch) } +} + +func (exporter *MongodbCollector) collectMongodReplSet(session *mgo.Session, ch chan<- prometheus.Metric) { + exporter.collectMongod(session, ch) - return serverStatus + glog.Info("Collecting Replset Status") + replSetStatus := collector_mongod.GetReplSetStatus(session) + if replSetStatus != nil { + replSetStatus.Export(ch) + } + + glog.Info("Collecting Replset Oplog Status") + oplogStatus := collector_mongod.GetOplogStatus(session) + if oplogStatus != nil { + oplogStatus.Export(ch) + } } -**/ + diff --git a/collector/mongodb_collector_test.go b/collector/mongodb_collector_test.go index 7eaa7e269..45e6184c8 100644 --- a/collector/mongodb_collector_test.go +++ b/collector/mongodb_collector_test.go @@ -1,19 +1,16 @@ package collector import ( - "github.com/dcu/mongodb_exporter/shared" - "github.com/prometheus/client_golang/prometheus" "testing" + + "github.com/Percona-Lab/prometheus_mongodb_exporter/shared" + "github.com/prometheus/client_golang/prometheus" ) func Test_CollectServerStatus(t *testing.T) { shared.ParseEnabledGroups("assers,durability,backgrond_flushing,connections,extra_info,global_lock,index_counters,network,op_counters,memory,locks,metrics,cursors") collector := 
NewMongodbCollector(MongodbCollectorOpts{URI: "localhost"}) - serverStatus := collector.collectServerStatus(nil) - - if serverStatus.Asserts == nil { - t.Error("Error loading document.") - } + go collector.Collect(nil) } func Test_DescribeCollector(t *testing.T) { @@ -29,16 +26,3 @@ func Test_CollectCollector(t *testing.T) { ch := make(chan prometheus.Metric) go collector.Collect(ch) } - -func Test_InvalidConnection(t *testing.T) { - if testing.Short() { - t.Skip("skipping test in short mode.") - } - - collector := NewMongodbCollector(MongodbCollectorOpts{URI: "s://localhost:123"}) - serverStatus := collector.collectServerStatus(nil) - - if serverStatus != nil { - t.Fail() - } -} diff --git a/collector/mongos/asserts_test.go b/collector/mongos/asserts_test.go deleted file mode 100644 index 43372624f..000000000 --- a/collector/mongos/asserts_test.go +++ /dev/null @@ -1,17 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_AssertsCollectData(t *testing.T) { - asserts := &AssertsStats{ - Regular: 1, - Warning: 2, - Msg: 3, - User: 4, - Rollovers: 5, - } - - asserts.Export() -} diff --git a/collector/mongos/balancer_changelog.go b/collector/mongos/balancer_changelog.go deleted file mode 100644 index d40e03763..000000000 --- a/collector/mongos/balancer_changelog.go +++ /dev/null @@ -1,111 +0,0 @@ -package collector_mongos - -import ( - "time" - "github.com/golang/glog" - "github.com/prometheus/client_golang/prometheus" - "gopkg.in/mgo.v2" - "gopkg.in/mgo.v2/bson" -) - -var ( - balancerChangelogInfo = prometheus.NewCounterVec(prometheus.CounterOpts{ - Namespace: Namespace, - Name: "balancer_changelog", - Help: "Log event statistics for the MongoDB balancer", - }, []string{"event"}) -) - -type BalancerChangelogAggregationId struct { - Event string `bson:"event"` - Note string `bson:"note"` -} - -type BalancerChangelogAggregationResult struct { - Id *BalancerChangelogAggregationId `bson:"_id"` - Count float64 `bson:"count"` -} - -type 
BalancerChangelogStats struct { - MoveChunkStart float64 - MoveChunkFromSuccess float64 - MoveChunkFromFailed float64 - MoveChunkToSuccess float64 - MoveChunkToFailed float64 - MoveChunkCommit float64 - Split float64 - MultiSplit float64 - ShardCollection float64 - ShardCollectionStart float64 - AddShard float64 -} - -func GetBalancerChangelogStats24hr(session *mgo.Session, showErrors bool) *BalancerChangelogStats { - var qresults []BalancerChangelogAggregationResult - coll := session.DB("config").C("changelog") - match := bson.M{ "time" : bson.M{ "$gt" : time.Now().Add(-24 * time.Hour) } } - group := bson.M{ "_id" : bson.M{ "event" : "$what", "note" : "$details.note" }, "count" : bson.M{ "$sum" : 1 } } - - err := coll.Pipe([]bson.M{ { "$match" : match }, { "$group" : group } }).All(&qresults) - if err != nil { - glog.Error("Error executing aggregation on 'config.changelog'!") - } - - results := &BalancerChangelogStats{} - for _, stat := range qresults { - event := stat.Id.Event - note := stat.Id.Note - count := stat.Count - if event == "moveChunk.start" { - results.MoveChunkStart = count - } else if event == "moveChunk.to" { - if note == "success" { - results.MoveChunkToSuccess = count - } else { - results.MoveChunkToFailed = count - } - } else if event == "moveChunk.from" { - if note == "success" { - results.MoveChunkFromSuccess = count - } else { - results.MoveChunkFromFailed = count - } - } else if event == "moveChunk.commit" { - results.MoveChunkCommit = count - } else if event == "addShard" { - results.AddShard = count - } else if event == "shardCollection" { - results.ShardCollection = count - } else if event == "shardCollection.start" { - results.ShardCollectionStart = count - } else if event == "split" { - results.Split = count - } else if event == "multi-split" { - results.MultiSplit = count - } - } - - return results -} - -func (status *BalancerChangelogStats) Export(ch chan<- prometheus.Metric) { - 
balancerChangelogInfo.WithLabelValues("moveChunk.start").Set(status.MoveChunkStart) - balancerChangelogInfo.WithLabelValues("moveChunk.to").Set(status.MoveChunkToSuccess) - balancerChangelogInfo.WithLabelValues("moveChunk.to_failed").Set(status.MoveChunkToFailed) - balancerChangelogInfo.WithLabelValues("moveChunk.from").Set(status.MoveChunkFromSuccess) - balancerChangelogInfo.WithLabelValues("moveChunk.from_failed").Set(status.MoveChunkFromFailed) - balancerChangelogInfo.WithLabelValues("moveChunk.commit").Set(status.MoveChunkCommit) - balancerChangelogInfo.WithLabelValues("addShard").Set(status.AddShard) - balancerChangelogInfo.WithLabelValues("shardCollection").Set(status.ShardCollection) - balancerChangelogInfo.WithLabelValues("shardCollection.start").Set(status.ShardCollectionStart) - balancerChangelogInfo.WithLabelValues("split").Set(status.Split) - balancerChangelogInfo.WithLabelValues("multi-split").Set(status.MultiSplit) - balancerChangelogInfo.Collect(ch) -} - -func GetBalancerChangelogStatus(session *mgo.Session) *BalancerChangelogStats { - session.SetMode(mgo.Eventual, true) - session.SetSocketTimeout(0) - results := GetBalancerChangelogStats24hr(session, false) - return results -} diff --git a/collector/mongos/balancer_status.go b/collector/mongos/balancer_status.go deleted file mode 100644 index b563bee56..000000000 --- a/collector/mongos/balancer_status.go +++ /dev/null @@ -1,112 +0,0 @@ -package collector_mongos - -import ( - "github.com/golang/glog" - "github.com/prometheus/client_golang/prometheus" - "gopkg.in/mgo.v2" - "gopkg.in/mgo.v2/bson" -) - -var ( - balancerIsEnabled = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "balancer", - Name: "is_enabled", - Help: "Boolean reporting if cluster balancer is enabled", - }) - balancerIsBalanced = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "balancer", - Name: "is_balanced", - Help: "Boolean reporting if cluster chunks are balanced across 
shards", - }) -) - -func IsBalancerEnabled(session *mgo.Session) (float64) { - var balancerConfig map[string]interface{} - session.DB("config").C("settings").Find(bson.M{ "_id" : "balancer" }).Select(bson.M{ "_id" : 0 }).One(&balancerConfig) - - var result float64 = 1 - if balancerConfig["stopped"] != nil { - balancerStopped := balancerConfig["stopped"].(bool) - if balancerStopped == true { - result = 0 - } - } - - return result -} - -func GetAllShardChunkInfo(session *mgo.Session) (map[string]int64) { - var result []map[string]int64 - err := session.DB("config").C("chunks").Pipe([]bson.M{{ "$group" : bson.M{ "_id" : "$shard", "count" : bson.M{ "$sum" : 1 } } }}).All(&result) - if err != nil { - glog.Error("Could not find shard chunk info!") - } - - shardChunkCounts := make(map[string]int64) - for _, element := range result { - shard := string(element["_id"]) - shardChunkCounts[shard] = int64(element["count"]) - } - - return shardChunkCounts -} - -func IsClusterBalanced(session *mgo.Session) (float64) { - // Different thresholds based on size - // http://docs.mongodb.org/manual/core/sharding-internals/#sharding-migration-thresholds - var threshold int64 - totalChunkCount := GetTotalChunks(session) - if totalChunkCount < 20 { - threshold = 2 - } else if totalChunkCount < 80 && totalChunkCount > 21 { - threshold = 4 - } else { - threshold = 8 - } - - var minChunkCount int64 = -1 - var maxChunkCount int64 = 0 - shardChunkInfoAll := GetAllShardChunkInfo(session) - for _, chunkCount := range shardChunkInfoAll { - if chunkCount > maxChunkCount { - maxChunkCount = chunkCount - } - if minChunkCount == -1 || chunkCount < minChunkCount { - minChunkCount = chunkCount - } - } - - // return true if the difference between the min and max is < the thresold - chunkDifference := maxChunkCount - minChunkCount - if chunkDifference < threshold { - return 1 - } - - return 0 -} - -type BalancerStats struct { - IsBalanced float64 - BalancerEnabled float64 -} - -func (status 
*BalancerStats) Export(ch chan<- prometheus.Metric) { - balancerIsEnabled.Set(status.IsBalanced) - balancerIsBalanced.Set(status.BalancerEnabled) - balancerIsEnabled.Collect(ch) - balancerIsBalanced.Collect(ch) -} - -func GetBalancerStatus(session *mgo.Session) *BalancerStats { - results := &BalancerStats{} - - session.SetMode(mgo.Eventual, true) - session.SetSocketTimeout(0) - - results.IsBalanced = IsClusterBalanced(session) - results.BalancerEnabled = IsBalancerEnabled(session) - - return results -} diff --git a/collector/mongos/balancer_topology.go b/collector/mongos/balancer_topology.go deleted file mode 100644 index 4f0f26d01..000000000 --- a/collector/mongos/balancer_topology.go +++ /dev/null @@ -1,100 +0,0 @@ -package collector_mongos - -import ( - "github.com/golang/glog" - "github.com/prometheus/client_golang/prometheus" - "gopkg.in/mgo.v2" - "gopkg.in/mgo.v2/bson" -) - -var ( - balancerTopoInfoTotalShards = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "balancer_topo", - Name: "total_shards", - Help: "Total # of Shards in Cluster", - }) - balancerTopoInfoTotalChunks = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "balancer_topo", - Name: "total_chunks", - Help: "Total # of Chunks in Cluster", - }) - balancerTopoInfoTotalDatabases = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "balancer_topo", - Name: "total_databases", - Help: "Total # of Databases with Sharding Enabled", - }) - balancerTopoInfoTotalCollections = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: Namespace, - Subsystem: "balancer_topo", - Name: "total_collections", - Help: "Total # of Collections with Sharding Enabled", - }) -) - -func GetTotalShards(session *mgo.Session) (float64) { - shardCount, err := session.DB("config").C("shards").Find(bson.M{}).Count() - if err != nil { - glog.Error("Could not find shard information in 'config.settings'!") - } - return float64(shardCount) -} - 
-func GetTotalChunks(session *mgo.Session) (float64) { - chunkCount, err := session.DB("config").C("chunks").Find(bson.M{}).Count() - if err != nil { - glog.Error("Could not find chunk information in 'config.chunks'!") - } - return float64(chunkCount) -} - -func GetTotalShardedDatabases(session *mgo.Session) (float64) { - dbCount, err := session.DB("config").C("databases").Find(bson.M{ "partitioned" : true }).Count() - if err != nil { - glog.Error("Could not find database information in 'config.databases'!") - } - return float64(dbCount) -} - -func GetTotalShardedCollections(session *mgo.Session) (float64) { - collCount, err := session.DB("config").C("collections").Find(bson.M{ "dropped" : false }).Count() - if err != nil { - glog.Error("Could not find collection information in 'config.collections'!") - } - return float64(collCount) -} - -type BalancerTopoStats struct { - TotalShards float64 - TotalChunks float64 - TotalDatabases float64 - TotalCollections float64 -} - -func (status *BalancerTopoStats) Export(ch chan<- prometheus.Metric) { - balancerTopoInfoTotalShards.Set(status.TotalShards) - balancerTopoInfoTotalChunks.Set(status.TotalChunks) - balancerTopoInfoTotalDatabases.Set(status.TotalDatabases) - balancerTopoInfoTotalCollections.Set(status.TotalCollections) - - balancerTopoInfoTotalShards.Collect(ch) - balancerTopoInfoTotalChunks.Collect(ch) - balancerTopoInfoTotalDatabases.Collect(ch) - balancerTopoInfoTotalCollections.Collect(ch) -} - -func GetBalancerTopoStatus(session *mgo.Session) *BalancerTopoStats { - results := &BalancerTopoStats{} - - session.SetMode(mgo.Eventual, true) - session.SetSocketTimeout(0) - - results.TotalShards = GetTotalShards(session) - results.TotalChunks = GetTotalChunks(session) - results.TotalDatabases = GetTotalShardedDatabases(session) - results.TotalCollections = GetTotalShardedCollections(session) - - return results -} diff --git a/collector/mongos/connections_test.go b/collector/mongos/connections_test.go deleted file mode 
100644 index 572c4e2d5..000000000 --- a/collector/mongos/connections_test.go +++ /dev/null @@ -1,15 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_ConnectionsCollectData(t *testing.T) { - stats := &ConnectionStats{ - Current: 1, - Available: 2, - TotalCreated: 3, - } - - stats.Export() -} diff --git a/collector/mongos/cursors_test.go b/collector/mongos/cursors_test.go deleted file mode 100644 index ae02e09c9..000000000 --- a/collector/mongos/cursors_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_CursorsCollectData(t *testing.T) { - cursors := &Cursors{} - - cursors.Export() -} diff --git a/collector/mongos/extra_info_test.go b/collector/mongos/extra_info_test.go deleted file mode 100644 index 70ceaf170..000000000 --- a/collector/mongos/extra_info_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_ExtraInfoCollectData(t *testing.T) { - stats := &ExtraInfo{} - - stats.Export() -} diff --git a/collector/mongos/memory_test.go b/collector/mongos/memory_test.go deleted file mode 100644 index e0f44309f..000000000 --- a/collector/mongos/memory_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_MemoryCollectData(t *testing.T) { - stats := &MemStats{} - - stats.Export() -} diff --git a/collector/mongos/metrics.go b/collector/mongos/metrics.go index 9ba1224d1..70419a0b1 100644 --- a/collector/mongos/metrics.go +++ b/collector/mongos/metrics.go @@ -78,7 +78,7 @@ type CursorStats struct { // Export exports the cursor stats. 
func (cursorStats *CursorStats) Export(ch chan<- prometheus.Metric) { metricsCursorTimedOutTotal.Set(cursorStats.TimedOut) - metricsCursorOpen.WithLabelValues("noTimeout").Set(cursorStats.Open.NoTimeout) + metricsCursorOpen.WithLabelValues("timed_out").Set(cursorStats.Open.NoTimeout) metricsCursorOpen.WithLabelValues("pinned").Set(cursorStats.Open.Pinned) metricsCursorOpen.WithLabelValues("total").Set(cursorStats.Open.Total) } diff --git a/collector/mongos/metrics_test.go b/collector/mongos/metrics_test.go deleted file mode 100644 index 980ba595d..000000000 --- a/collector/mongos/metrics_test.go +++ /dev/null @@ -1,15 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_MetricsCollectData(t *testing.T) { - stats := &MetricsStats{ - GetLastError: &GetLastErrorStats{ - Wtime: &BenchmarkStats{}, - }, - } - - stats.Export() -} diff --git a/collector/mongos/network_test.go b/collector/mongos/network_test.go deleted file mode 100644 index ef6f9e562..000000000 --- a/collector/mongos/network_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_NetworkCollectData(t *testing.T) { - stats := &NetworkStats{} - - stats.Export() -} diff --git a/collector/mongos/op_counters_test.go b/collector/mongos/op_counters_test.go deleted file mode 100644 index f7d1f6264..000000000 --- a/collector/mongos/op_counters_test.go +++ /dev/null @@ -1,11 +0,0 @@ -package collector_mongos - -import ( - "testing" -) - -func Test_OpCountersCollectData(t *testing.T) { - stats := &OpcountersStats{} - - stats.Export() -} diff --git a/collector/mongos/server_status.go b/collector/mongos/server_status.go index fe95b6b69..75e1f018f 100644 --- a/collector/mongos/server_status.go +++ b/collector/mongos/server_status.go @@ -56,6 +56,9 @@ func (status *ServerStatus) Export(ch chan<- prometheus.Metric) { instanceUptimeSeconds.Set(status.Uptime) instanceUptimeEstimateSeconds.Set(status.Uptime) 
instanceLocalTime.Set(float64(status.LocalTime.Unix())) + instanceUptimeSeconds.Collect(ch) + instanceUptimeEstimateSeconds.Collect(ch) + instanceLocalTime.Collect(ch) if status.Asserts != nil { status.Asserts.Export(ch) @@ -118,10 +121,6 @@ func (status *ServerStatus) Describe(ch chan<- *prometheus.Desc) { // GetServerStatus returns the server status info. func GetServerStatus(session *mgo.Session) *ServerStatus { result := &ServerStatus{} - - session.SetMode(mgo.Eventual, true) - session.SetSocketTimeout(0) - err := session.DB("admin").Run(bson.D{{"serverStatus", 1}, {"recordStats", 0}}, result) if err != nil { glog.Error("Failed to get server status.") diff --git a/collector/mongos/server_status_test.go b/collector/mongos/server_status_test.go index 60f26ec6b..a5a52653d 100644 --- a/collector/mongos/server_status_test.go +++ b/collector/mongos/server_status_test.go @@ -1,17 +1,10 @@ package collector_mongos import ( - "gopkg.in/mgo.v2/bson" "testing" -) - -func Test_ServerStatusCollectData(t *testing.T) { - data := LoadFixture("server_status.bson") - serverStatus := &ServerStatus{} - loadServerStatusFromBson(data, serverStatus) - serverStatus.Export() -} + "gopkg.in/mgo.v2/bson" +) func Test_ParserServerStatus(t *testing.T) { data := LoadFixture("server_status.bson") @@ -23,6 +16,14 @@ func Test_ParserServerStatus(t *testing.T) { t.Error("Asserts group was not loaded") } + if serverStatus.Dur == nil { + t.Error("Dur group was not loaded") + } + + if serverStatus.BackgroundFlushing == nil { + t.Error("BackgroundFlushing group was not loaded") + } + if serverStatus.Connections == nil { t.Error("Connections group was not loaded") } @@ -31,6 +32,10 @@ func Test_ParserServerStatus(t *testing.T) { t.Error("ExtraInfo group was not loaded") } + if serverStatus.GlobalLock == nil { + t.Error("GlobalLock group was not loaded") + } + if serverStatus.Network == nil { t.Error("Network group was not loaded") } @@ -39,6 +44,10 @@ func Test_ParserServerStatus(t *testing.T) { 
t.Error("Opcounters group was not loaded") } + if serverStatus.OpcountersRepl == nil { + t.Error("OpcountersRepl group was not loaded") + } + if serverStatus.Mem == nil { t.Error("Mem group was not loaded") } @@ -47,6 +56,10 @@ func Test_ParserServerStatus(t *testing.T) { t.Error("Connections group was not loaded") } + if serverStatus.Locks == nil { + t.Error("Locks group was not loaded") + } + if serverStatus.Metrics.Document.Deleted != 45726 { t.Error("Metrics group was not loaded correctly") } diff --git a/collector/mongos/sharding_changelog.go b/collector/mongos/sharding_changelog.go new file mode 100644 index 000000000..53c509bc0 --- /dev/null +++ b/collector/mongos/sharding_changelog.go @@ -0,0 +1,92 @@ +package collector_mongos + +import ( + "time" + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" + "gopkg.in/mgo.v2" + "gopkg.in/mgo.v2/bson" +) + +var ( + shardingChangelogInfo = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "changelog_10min_total", + Help: "Total # of Cluster Balancer log events over the last 10 minutes", + }, []string{"event"}) +) + +type ShardingChangelogSummaryId struct { + Event string `bson:"event"` + Note string `bson:"note"` +} + +type ShardingChangelogSummary struct { + Id *ShardingChangelogSummaryId `bson:"_id"` + Count float64 `bson:"count"` +} + +type ShardingChangelogStats struct { + Items *[]ShardingChangelogSummary +} + +func (status *ShardingChangelogStats) Export(ch chan<- prometheus.Metric) { + // set all expected event types to zero first, so they show in results if there was no events in the current time period + shardingChangelogInfo.WithLabelValues("moveChunk.start").Set(0) + shardingChangelogInfo.WithLabelValues("moveChunk.to").Set(0) + shardingChangelogInfo.WithLabelValues("moveChunk.to_failed").Set(0) + shardingChangelogInfo.WithLabelValues("moveChunk.from").Set(0) + 
shardingChangelogInfo.WithLabelValues("moveChunk.from_failed").Set(0) + shardingChangelogInfo.WithLabelValues("moveChunk.commit").Set(0) + shardingChangelogInfo.WithLabelValues("addShard").Set(0) + shardingChangelogInfo.WithLabelValues("removeShard.start").Set(0) + shardingChangelogInfo.WithLabelValues("shardCollection").Set(0) + shardingChangelogInfo.WithLabelValues("shardCollection.start").Set(0) + shardingChangelogInfo.WithLabelValues("split").Set(0) + shardingChangelogInfo.WithLabelValues("multi-split").Set(0) + + // set counts for events found in our query + for _, item := range *status.Items { + event := item.Id.Event + note := item.Id.Note + count := item.Count + switch event { + case "moveChunk.to": + if note == "success" || note == "" { + shardingChangelogInfo.WithLabelValues(event).Set(count) + } else { + shardingChangelogInfo.WithLabelValues(event + "_failed").Set(count) + } + case "moveChunk.from": + if note == "success" || note == "" { + shardingChangelogInfo.WithLabelValues(event).Set(count) + } else { + shardingChangelogInfo.WithLabelValues(event + "_failed").Set(count) + } + default: + shardingChangelogInfo.WithLabelValues(event).Set(count) + } + } + shardingChangelogInfo.Collect(ch) +} + +func (status *ShardingChangelogStats) Describe(ch chan<- *prometheus.Desc) { + shardingChangelogInfo.Describe(ch) +} + +func GetShardingChangelogStatus(session *mgo.Session) *ShardingChangelogStats { + var qresults []ShardingChangelogSummary + coll := session.DB("config").C("changelog") + match := bson.M{ "time" : bson.M{ "$gt" : time.Now().Add(-10 * time.Minute) } } + group := bson.M{ "_id" : bson.M{ "event" : "$what", "note" : "$details.note" }, "count" : bson.M{ "$sum" : 1 } } + + err := coll.Pipe([]bson.M{ { "$match" : match }, { "$group" : group } }).All(&qresults) + if err != nil { + glog.Error("Failed to execute find query on 'config.changelog'!") + } + + results := &ShardingChangelogStats{} + results.Items = &qresults + return results +} diff --git 
a/collector/mongos/sharding_status.go b/collector/mongos/sharding_status.go new file mode 100644 index 000000000..1c16ad4e2 --- /dev/null +++ b/collector/mongos/sharding_status.go @@ -0,0 +1,245 @@ +package collector_mongos + +import ( + "time" + "strings" + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" + "gopkg.in/mgo.v2" + "gopkg.in/mgo.v2/bson" +) + +var ( + balancerIsEnabled = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "balancer_enabled", + Help: "Boolean reporting if cluster balancer is enabled (1 = enabled/0 = disabled)", + }) + balancerChunksBalanced = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "chunks_is_balanced", + Help: "Boolean reporting if cluster chunks are evenly balanced across shards (1 = yes/0 = no)", + }) + mongosClusterIdHex = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "cluster_id", + Help: "The hex representation of the Cluster ID", + }, []string{"hex"}) + mongosClusterCurrentVersion = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "cluster_current_version", + Help: "The current metadata version number of the Cluster", + }) + mongosClusterMinCompatVersion = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "cluster_min_compatible_version", + Help: "The minimum compatible metadata version number of the Cluster", + }) + mongosUpSecs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "mongos_uptime_seconds", + Help: "The uptime of the Mongos processes in seconds", + }, []string{"name","version"}) + mongosPing = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "mongos_last_ping_timestamp", + Help: "The unix timestamp of the last Mongos ping to the 
Cluster config servers", + }, []string{"name","version"}) + mongosBalancerLockTimestamp = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "balancer_lock_timestamp", + Help: "The unix timestamp of the last update to the Cluster balancer lock", + }, []string{"name"}) + mongosBalancerLockState = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "balancer_lock_state", + Help: "The state of the Cluster balancer lock (-1 = none/0 = unlocked/1 = contention/2 = locked)", + }, []string{"name","version"}) +) + +type ShardingVersion struct { + MinCompatVersion float64 `bson:"minCompatibleVersion"` + CurrentVersion float64 `bson:"currentVersion"` + ClusterId bson.ObjectId `bson:"clusterId"` +} + +type MongosInfo struct { + Name string `bson:"_id"` + Ping time.Time `bson:"ping"` + Up float64 `bson:"up"` + Waiting bool `bson:"waiting"` + MongoVersion string `bson:"mongoVersion"` +} + +type MongosBalancerLock struct { + State float64 `bson:"state"` + Process string `bson:"process"` + Who string `bson:"who"` + When time.Time `bson:"when"` + Why string `bson:"why"` +} + +type ShardingStats struct { + IsBalanced float64 + BalancerEnabled float64 + Changelog *ShardingChangelogStats + Topology *ShardingTopoStats + BalancerLock *MongosBalancerLock + Version *ShardingVersion + Mongos *[]MongosInfo +} + +func GetShardingVersion(session *mgo.Session) *ShardingVersion { + mongosVersion := &ShardingVersion{} + err := session.DB("config").C("version").Find(bson.M{ "_id" : 1 }).One(&mongosVersion) + if err != nil { + glog.Error("Failed to execute find query on 'config.version'!") + } + return mongosVersion +} + +func GetMongosInfo(session *mgo.Session) *[]MongosInfo { + mongosInfo := []MongosInfo{} + err := session.DB("config").C("mongos").Find(bson.M{ "ping" : bson.M{ "$gte" : time.Now().Add(-10 * time.Minute) } }).All(&mongosInfo) + if err != nil { + glog.Error("Failed to execute find 
query on 'config.mongos'!") + } + return &mongosInfo +} + +func GetMongosBalancerLock(session *mgo.Session) *MongosBalancerLock { + var balancerLock *MongosBalancerLock + err := session.DB("config").C("locks").Find(bson.M{ "_id" : "balancer" }).One(&balancerLock) + if err != nil { + glog.Error("Failed to execute find query on 'config.locks'!") + } + return balancerLock +} + +func IsBalancerEnabled(session *mgo.Session) float64 { + balancerConfig := struct { + Stopped bool `bson:"stopped"` + }{} + err := session.DB("config").C("settings").Find(bson.M{ "_id" : "balancer" }).One(&balancerConfig) + if err != nil { + return 1 + } + if balancerConfig.Stopped == true { + return 0 + } + return 1 +} + +func IsClusterBalanced(session *mgo.Session) float64 { + // Different thresholds based on size + // http://docs.mongodb.org/manual/core/sharding-internals/#sharding-migration-thresholds + var threshold float64 = 8 + totalChunkCount := GetTotalChunks(session) + if totalChunkCount < 20 { + threshold = 2 + } else if totalChunkCount < 80 && totalChunkCount > 21 { + threshold = 4 + } + + var minChunkCount float64 = -1 + var maxChunkCount float64 = 0 + shardChunkInfoAll := GetTotalChunksByShard(session) + for _, shard := range *shardChunkInfoAll { + if shard.Chunks > maxChunkCount { + maxChunkCount = shard.Chunks + } + if minChunkCount == -1 || shard.Chunks < minChunkCount { + minChunkCount = shard.Chunks + } + } + + // return true if the difference between the min and max is < the threshold + chunkDifference := maxChunkCount - minChunkCount + if chunkDifference < threshold { + return 1 + } + + return 0 +} + +func (status *ShardingStats) Export(ch chan<- prometheus.Metric) { + if status.Changelog != nil { + status.Changelog.Export(ch) + } + if status.Topology != nil { + status.Topology.Export(ch) + } + if status.Version != nil { + clusterId := status.Version.ClusterId.Hex() + mongosClusterIdHex.WithLabelValues(clusterId).Set(1) + 
mongosClusterCurrentVersion.Set(status.Version.CurrentVersion) + mongosClusterMinCompatVersion.Set(status.Version.MinCompatVersion) + } + if status.Mongos != nil && status.BalancerLock != nil { + mongosBalancerLockWho := strings.Split(status.BalancerLock.Who, ":") + mongosBalancerLockHostPort := mongosBalancerLockWho[0] + ":" + mongosBalancerLockWho[1] + mongosBalancerLockTimestamp.WithLabelValues(mongosBalancerLockHostPort).Set(float64(status.BalancerLock.When.Unix())) + for _, mongos := range *status.Mongos { + labels := prometheus.Labels{"name": mongos.Name, "version": mongos.MongoVersion } + mongosUpSecs.With(labels).Set(mongos.Up) + mongosPing.With(labels).Set(float64(mongos.Ping.Unix())) + mongosBalancerLockState.With(labels).Set(-1) + if mongos.Name == mongosBalancerLockHostPort { + mongosBalancerLockState.With(labels).Set(status.BalancerLock.State) + } + } + } + balancerIsEnabled.Set(status.BalancerEnabled) + balancerChunksBalanced.Set(status.IsBalanced) + + balancerIsEnabled.Collect(ch) + balancerChunksBalanced.Collect(ch) + mongosClusterIdHex.Collect(ch) + mongosClusterCurrentVersion.Collect(ch) + mongosClusterMinCompatVersion.Collect(ch) + mongosUpSecs.Collect(ch) + mongosPing.Collect(ch) + mongosBalancerLockState.Collect(ch) + mongosBalancerLockTimestamp.Collect(ch) +} + +func (status *ShardingStats) Describe(ch chan<- *prometheus.Desc) { + if status.Changelog != nil { + status.Changelog.Describe(ch) + } + if status.Topology != nil { + status.Topology.Describe(ch) + } + balancerIsEnabled.Describe(ch) + balancerChunksBalanced.Describe(ch) + mongosClusterIdHex.Describe(ch) + mongosClusterCurrentVersion.Describe(ch) + mongosClusterMinCompatVersion.Describe(ch) + mongosUpSecs.Describe(ch) + mongosPing.Describe(ch) + mongosBalancerLockState.Describe(ch) + mongosBalancerLockTimestamp.Describe(ch) +} + +func GetShardingStatus(session *mgo.Session) *ShardingStats { + results := &ShardingStats{} + + results.IsBalanced = IsClusterBalanced(session) + 
results.BalancerEnabled = IsBalancerEnabled(session) + results.Changelog = GetShardingChangelogStatus(session) + results.Topology = GetShardingTopoStatus(session) + results.Version = GetShardingVersion(session) + results.Mongos = GetMongosInfo(session) + results.BalancerLock = GetMongosBalancerLock(session) + + return results +} diff --git a/collector/mongos/sharding_topology.go b/collector/mongos/sharding_topology.go new file mode 100644 index 000000000..f543ea3ed --- /dev/null +++ b/collector/mongos/sharding_topology.go @@ -0,0 +1,181 @@ +package collector_mongos + +import ( + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" + "gopkg.in/mgo.v2" + "gopkg.in/mgo.v2/bson" +) + +var ( + shardingTopoInfoTotalShards = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "shards_total", + Help: "Total # of Shards in the Cluster", + }) + shardingTopoInfoDrainingShards = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "shards_draining_total", + Help: "Total # of Shards in the Cluster in draining state", + }) + shardingTopoInfoTotalChunks = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "chunks_total", + Help: "Total # of Chunks in the Cluster", + }) + shardingTopoInfoShardChunks = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "shard_chunks_total", + Help: "Total number of chunks per shard", + }, []string{"shard"}) + shardingTopoInfoTotalDatabases = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "databases_total", + Help: "Total # of Databases in the Cluster", + }, []string{"type"}) + shardingTopoInfoTotalCollections = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: "sharding", + Name: "collections_total", + Help: "Total # of Collections with Sharding enabled", 
+ }) +) + +type ShardingTopoShardInfo struct { + Shard string `bson:"_id"` + Host string `bson:"host"` + Draining bool `bson:"draining,omitempty"` +} + +type ShardingTopoChunkInfo struct { + Shard string `bson:"_id"` + Chunks float64 `bson:"count"` +} + +type ShardingTopoStatsTotalDatabases struct { + Partitioned bool `bson:"_id"` + Total float64 `bson:"total"` +} + +type ShardingTopoStats struct { + TotalChunks float64 + TotalCollections float64 + TotalDatabases *[]ShardingTopoStatsTotalDatabases + Shards *[]ShardingTopoShardInfo + ShardChunks *[]ShardingTopoChunkInfo +} + +func GetShards(session *mgo.Session) *[]ShardingTopoShardInfo { + var shards []ShardingTopoShardInfo + err := session.DB("config").C("shards").Find(bson.M{}).All(&shards) + if err != nil { + glog.Error("Failed to execute find query on 'config.shards'!") + } + return &shards +} + +func GetTotalChunks(session *mgo.Session) float64 { + chunkCount, err := session.DB("config").C("chunks").Find(bson.M{}).Count() + if err != nil { + glog.Error("Failed to execute find query on 'config.chunks'!") + } + return float64(chunkCount) +} + +func GetTotalChunksByShard(session *mgo.Session) *[]ShardingTopoChunkInfo { + var results []ShardingTopoChunkInfo + err := session.DB("config").C("chunks").Pipe([]bson.M{{ "$group" : bson.M{ "_id" : "$shard", "count" : bson.M{ "$sum" : 1 } } }}).All(&results) + if err != nil { + glog.Error("Failed to execute find query on 'config.chunks'!") + } + return &results +} + +func GetTotalDatabases(session *mgo.Session) *[]ShardingTopoStatsTotalDatabases { + results := []ShardingTopoStatsTotalDatabases{} + query := []bson.M{ { "$match" : bson.M{ "_id" : bson.M{ "$ne" : "admin" } } }, { "$group" : bson.M{ "_id" : "$partitioned", "total" : bson.M{ "$sum" : 1 } } } } + err := session.DB("config").C("databases").Pipe(query).All(&results) + if err != nil { + glog.Error("Failed to execute find query on 'config.databases'!") + } + return &results +} + +func 
GetTotalShardedCollections(session *mgo.Session) float64 { + collCount, err := session.DB("config").C("collections").Find(bson.M{ "dropped" : false }).Count() + if err != nil { + glog.Error("Failed to execute find query on 'config.collections'!") + } + return float64(collCount) +} + +func (status *ShardingTopoStats) Export(ch chan<- prometheus.Metric) { + if status.Shards != nil { + var drainingShards float64 = 0 + for _, shard := range *status.Shards { + if shard.Draining == true { + drainingShards = drainingShards + 1 + } + } + shardingTopoInfoDrainingShards.Set(drainingShards) + shardingTopoInfoTotalShards.Set(float64(len(*status.Shards))) + } + shardingTopoInfoTotalChunks.Set(status.TotalChunks) + shardingTopoInfoTotalCollections.Set(status.TotalCollections) + + shardingTopoInfoTotalDatabases.WithLabelValues("partitioned").Set(0) + shardingTopoInfoTotalDatabases.WithLabelValues("unpartitioned").Set(0) + if status.TotalDatabases != nil { + for _, item := range *status.TotalDatabases { + switch item.Partitioned { + case true: + shardingTopoInfoTotalDatabases.WithLabelValues("partitioned").Set(item.Total) + case false: + shardingTopoInfoTotalDatabases.WithLabelValues("unpartitioned").Set(item.Total) + } + } + } + + if status.ShardChunks != nil { + // set all known shards to zero first so that shards with zero chunks are still displayed properly + for _, shard := range *status.Shards { + shardingTopoInfoShardChunks.WithLabelValues(shard.Shard).Set(0) + } + for _, shard := range *status.ShardChunks { + shardingTopoInfoShardChunks.WithLabelValues(shard.Shard).Set(shard.Chunks) + } + } + + shardingTopoInfoTotalShards.Collect(ch) + shardingTopoInfoDrainingShards.Collect(ch) + shardingTopoInfoTotalChunks.Collect(ch) + shardingTopoInfoShardChunks.Collect(ch) + shardingTopoInfoTotalCollections.Collect(ch) + shardingTopoInfoTotalDatabases.Collect(ch) +} + +func (status *ShardingTopoStats) Describe(ch chan<- *prometheus.Desc) { + shardingTopoInfoTotalShards.Describe(ch) + 
shardingTopoInfoDrainingShards.Describe(ch) + shardingTopoInfoTotalChunks.Describe(ch) + shardingTopoInfoShardChunks.Describe(ch) + shardingTopoInfoTotalDatabases.Describe(ch) + shardingTopoInfoTotalCollections.Describe(ch) +} + +func GetShardingTopoStatus(session *mgo.Session) *ShardingTopoStats { + results := &ShardingTopoStats{} + + results.Shards = GetShards(session) + results.TotalChunks = GetTotalChunks(session) + results.ShardChunks = GetTotalChunksByShard(session) + results.TotalDatabases = GetTotalDatabases(session) + results.TotalCollections = GetTotalShardedCollections(session) + + return results +} diff --git a/collector/shared/version.go b/collector/shared/version.go deleted file mode 100644 index da2046f75..000000000 --- a/collector/shared/version.go +++ /dev/null @@ -1,42 +0,0 @@ -package collector_shared - -import( - "strings" - "strconv" - "github.com/golang/glog" - "gopkg.in/mgo.v2" -) - -var serverVersion string = "unknown" - -func GetServerVersion(session *mgo.Session) (string) { - if serverVersion == "unknown" { - buildInfo, err := session.BuildInfo() - if err != nil { - glog.Error("Could not get BuildInfo!") - return "unknown" - } - serverVersion = buildInfo.Version - return serverVersion - } else { - return serverVersion - } -} - -func IsVersionGreater(version string, major int, minor int, release int) (bool) { - split := strings.Split(version, ".") - cmp_major, _ := strconv.Atoi(split[0]) - cmp_minor, _ := strconv.Atoi(split[1]) - cmp_release, _ := strconv.Atoi(split[2]) - - if cmp_major >= major && cmp_minor >= minor && cmp_release >= release { - return true - } - - return false -} - -func IsMyVersionGreater(session *mgo.Session, major int, minor int, release int) (bool) { - version := GetServerVersion(session) - return IsVersionGreater(version, major, minor, release) -} diff --git a/mongodb_exporter.go b/mongodb_exporter.go index fc1f3f077..f9a92fe36 100644 --- a/mongodb_exporter.go +++ b/mongodb_exporter.go @@ -3,21 +3,30 @@ package main 
import ( "flag" "fmt" - "github.com/dcu/mongodb_exporter/collector" - "github.com/dcu/mongodb_exporter/shared" - "github.com/prometheus/client_golang/prometheus" "net/http" + "os" + + "github.com/Percona-Lab/prometheus_mongodb_exporter/collector" + "github.com/Percona-Lab/prometheus_mongodb_exporter/shared" + + "github.com/prometheus/client_golang/prometheus" ) +func mongodbDefaultUri() string { + if u := os.Getenv("MONGODB_URL"); u != "" { + return u + } + return "mongodb://localhost:27017" +} + var ( listenAddressFlag = flag.String("web.listen-address", ":9001", "Address on which to expose metrics and web interface.") metricsPathFlag = flag.String("web.metrics-path", "/metrics", "Path under which to expose metrics.") - mongodbURIFlag = flag.String("mongodb.uri", "mongodb://localhost:27017", "Mongodb URI, format: [mongodb://][user:pass@]host1[:port1][,host2[:port2],...][/database][?options]") + mongodbURIFlag = flag.String("mongodb.uri", mongodbDefaultUri(), "Mongodb URI, format: [mongodb://][user:pass@]host1[:port1][,host2[:port2],...][/database][?options]") enabledGroupsFlag = flag.String("groups.enabled", "asserts,durability,background_flushing,connections,extra_info,global_lock,index_counters,network,op_counters,op_counters_repl,memory,locks,metrics", "Comma-separated list of groups to use, for more info see: docs.mongodb.org/manual/reference/command/serverStatus/") - //printCollectors = flag.Bool("collectors.print", false, "If true, print available collectors and exit.") - authUserFlag = flag.String("auth.user", "", "Username for basic auth.") - authPassFlag = flag.String("auth.pass", "", "Password for basic auth.") + authUserFlag = flag.String("auth.user", "", "Username for basic auth.") + authPassFlag = flag.String("auth.pass", "", "Password for basic auth.") ) type basicAuthHandler struct { diff --git a/shared/connection.go b/shared/connection.go new file mode 100644 index 000000000..47684918e --- /dev/null +++ b/shared/connection.go @@ -0,0 +1,65 @@ 
+package shared + +import ( + "time" + + "github.com/golang/glog" + "gopkg.in/mgo.v2" +) + +const ( + dialMongodbTimeout = 10 * time.Second + syncMongodbTimeout = 1 * time.Minute +) + +func MongoSession(uri string) *mgo.Session { + dialInfo, err := mgo.ParseURL(uri) + if err != nil { + glog.Errorf("Cannot connect to server using url %s: %s", uri, err) + return nil + } + + dialInfo.Direct = true // Force direct connection + dialInfo.Timeout = dialMongodbTimeout + + session, err := mgo.DialWithInfo(dialInfo) + if err != nil { + glog.Errorf("Cannot connect to server using url %s: %s", uri, err) + return nil + } + session.SetMode(mgo.Eventual, true) + session.SetSyncTimeout(syncMongodbTimeout) + session.SetSocketTimeout(0) + return session +} + +func MongoSessionServerVersion(session *mgo.Session) (string, error) { + buildInfo, err := session.BuildInfo() + if err != nil { + glog.Errorf("Could not get MongoDB BuildInfo: %s!", err) + return "unknown", err + } + return buildInfo.Version, nil +} + +func MongoSessionNodeType(session *mgo.Session) (string, error) { + masterDoc := struct { + SetName interface{} `bson:"setName"` + Hosts interface{} `bson:"hosts"` + Msg string `bson:"msg"` + }{} + err := session.Run("isMaster", &masterDoc) + if err != nil { + glog.Errorf("Got unknown node type: %s", err) + return "unknown", err + } + + if masterDoc.SetName != nil || masterDoc.Hosts != nil { + return "replset", nil + } else if masterDoc.Msg == "isdbgrid" { + // isdbgrid is always the msg value when calling isMaster on a mongos + // see http://docs.mongodb.org/manual/core/sharded-cluster-query-router/ + return "mongos", nil + } + return "mongod", nil +}