From fc381e38e08b4d3d9f825d43565a6d22c7754253 Mon Sep 17 00:00:00 2001
From: Mike Dirolf <mike@10gen.com>
Date: Mon, 20 Apr 2009 16:58:20 -0400
Subject: [PATCH] new benchmark setup

---
 tools/benchmark.py               | 247 +++++++++++++++++--------------
 tools/mongodb_benchmark_tools.py |  53 +++++++
 2 files changed, 191 insertions(+), 109 deletions(-)
 create mode 100644 tools/mongodb_benchmark_tools.py

diff --git a/tools/benchmark.py b/tools/benchmark.py
index 6f23e89235..f969351935 100644
--- a/tools/benchmark.py
+++ b/tools/benchmark.py
@@ -16,125 +16,154 @@
 
 import time
 import sys
-sys.path[0:0] = [""]
-
+import os
 import datetime
-import cProfile
+import subprocess
 
-from pymongo import connection
+from pymongo.connection import Connection
+from pymongo.bson import BSON
+from pymongo.binary import Binary
 from pymongo import ASCENDING
 
-trials = 2
-per_trial = 5000
-batch_size = 100
-small = {}
-medium = {"integer": 5,
-          "number": 5.05,
-          "boolean": False,
-          "array": ["test", "benchmark"]
+from mongodb_benchmark_tools import post_data
+
+small = {"integer": 5,
+         "number": 5.05,
+         "boolean": False,
+         "array": ["test", "benchmark"]
+         }
+medium = {"base_url": "http://www.example.com/test-me",
+          "total_word_count": 6743,
+          "access_time": datetime.datetime.utcnow(),
+          "sub_object": small,
+          "data": Binary("hello" * 40),
+          "big_array": ["mongodb"] * 20
           }
-# this is similar to the benchmark data posted to the user list
-large = {"base_url": "http://www.example.com/test-me",
-         "total_word_count": 6743,
-         "access_time": datetime.datetime.utcnow(),
-         "meta_tags": {"description": "i am a long description string",
-                       "author": "Holly Man",
-                       "dynamically_created_meta_tag": "who know\n what"
-                       },
-         "page_structure": {"counted_tags": 3450,
-                            "no_of_js_attached": 10,
-                            "no_of_images": 6
-                            },
-         "harvested_words": ["10gen","web","open","source","application","paas",
-                             "platform-as-a-service","technology","helps",
-                             "developers","focus","building","mongodb","mongo"] * 20
+large = {"bigger_array": [medium] * 5,
+         "data": Binary("hello" * 500)
          }
 
-def setup_insert(db, collection, object):
-    db.drop_collection(collection)
-
-def insert(db, collection, object):
-    for i in range(per_trial):
-        to_insert = object.copy()
-        to_insert["x"] = i
-        db[collection].insert(to_insert)
-
-def insert_batch(db, collection, object):
-    for i in range(per_trial / batch_size):
-        db[collection].insert([object] * batch_size)
-
-def find_one(db, collection, x):
-    for _ in range(per_trial):
-        db[collection].find_one({"x": x})
-
-def find(db, collection, x):
-    for _ in range(per_trial):
-        for _ in db[collection].find({"x": x}):
-            pass
-
-def timed(name, function, args=[], setup=None):
-    times = []
-    for _ in range(trials):
-        if setup:
-            setup(*args)
+
+class Benchmark(object):
+    name = "benchmark"
+    description = "a benchmark"
+    categories = []
+
+    def setup(self):
+        pass
+
+    def run(self, iterations):
+        pass
+
+    def teardown(self):
+        pass
+
+
+class Encode(Benchmark):
+    def __init__(self, document, size):
+        self.name = "encode %s" % size
+        self.description = "test encoding 10000 %s documents" % size
+        self.categories = ["encode", size]
+        self.__doc = document
+
+    def run(self, iterations):
+        for _ in range(iterations):
+            BSON.from_dict(self.__doc)
+
+
+class Decode(Benchmark):
+    def __init__(self, bson, size):
+        self.name = "decode %s" % size
+        self.description = "test decoding 10000 %s documents" % size
+        self.categories = ["decode", size]
+        self.__bson = bson
+
+    def run(self, iterations):
+        for _ in range(iterations):
+            self.__bson.to_dict()
+
+
+class Insert(Benchmark):
+    def __init__(self, db, document, size):
+        self.__db = db
+        self.__collection_name = "%s_no_index" % size
+        self.__document = document
+        self.name = "insert %s" % size
+        self.description = "test inserting 10000 %s sized documents into a single collection"
+        self.categories = ["insert", size, "no index"]
+
+    def setup(self):
+        self.__db.drop_collection(self.__collection_name)
+        self.__collection = self.__db[self.__collection_name]
+
+    def run(self, iterations):
+        for i in range(iterations):
+            doc = self.__document.copy()
+            doc["x"] = i
+            self.__collection.insert(doc)
+
+
+class FindOne(Benchmark):
+    def __init__(self, collection, query, size):
+        self.__collection = collection
+        self.__query = query
+        self.name = "find one %s" % size
+        self.description = "test doing 10000 find one queries on a collection containing %s sized documents" % size
+        self. categories = ["query", size, "find one", "no index"]
+
+    def run(self, iterations):
+        for _ in range(iterations):
+            self.__collection.find_one(self.__query)
+
+
+class BenchmarkRunner(object):
+    def __init__(self, iterations, server_hash):
+        self.__iterations = iterations
+        self.__server_hash = server_hash
+        self.__client_hash = self.get_client_hash()
+
+    def get_client_hash(self):
+        git_rev_parse = subprocess.Popen(["git", "rev-parse", "HEAD"],
+                                         stdout=subprocess.PIPE)
+        (hash, _) = git_rev_parse.communicate()
+        return hash.strip()
+
+    def report(self, benchmark, result):
+        data = {"benchmark": {"project": "http://github.com/mongodb/mongo-python-driver",
+                              "name": benchmark.name,
+                              "description": benchmark.description,
+                              "tags": benchmark.categories},
+                "trial": {"server_hash": self.__server_hash,
+                          "client_hash": self.__client_hash,
+                          "result": result,
+                          "extra_info": ""}}
+        post_data(data, post_url="http://localhost:8080/benchmark")
+        print "%s: %s" % (benchmark.name, result)
+
+    def run_benchmark(self, benchmark):
+        benchmark.setup()
         start = time.time()
-        function(*args)
-        times.append(time.time() - start)
-    best_time = min(times)
-    print "%s%d" % (name + (60 - len(name)) * ".", per_trial / best_time)
-    return best_time
+        benchmark.run(self.__iterations)
+        stop = time.time()
+        benchmark.teardown()
+        self.report(benchmark, stop - start)
+
 
 def main():
-    connection._TIMEOUT=60 # jack up the timeout
-    c = connection.Connection()
-    c.drop_database("benchmark")
-    db = c.benchmark
-
-    timed("insert (small, no index)", insert, [db, 'small_none', small], setup_insert)
-    timed("insert (medium, no index)", insert, [db, 'medium_none', medium], setup_insert)
-    timed("insert (large, no index)", insert, [db, 'large_none', large], setup_insert)
-
-    db.small_index.create_index("x", ASCENDING)
-    timed("insert (small, indexed)", insert, [db, 'small_index', small])
-    db.medium_index.create_index("x", ASCENDING)
-    timed("insert (medium, indexed)", insert, [db, 'medium_index', medium])
-    db.large_index.create_index("x", ASCENDING)
-    timed("insert (large, indexed)", insert, [db, 'large_index', large])
-
-    timed("batch insert (small, no index)", insert_batch, [db, 'small_bulk', small], setup_insert)
-    timed("batch insert (medium, no index)", insert_batch, [db, 'medium_bulk', medium], setup_insert)
-    timed("batch insert (large, no index)", insert_batch, [db, 'large_bulk', large], setup_insert)
-
-    timed("find_one (small, no index)", find_one, [db, 'small_none', per_trial / 2])
-    timed("find_one (medium, no index)", find_one, [db, 'medium_none', per_trial / 2])
-    timed("find_one (large, no index)", find_one, [db, 'large_none', per_trial / 2])
-
-    timed("find_one (small, indexed)", find_one, [db, 'small_index', per_trial / 2])
-    timed("find_one (medium, indexed)", find_one, [db, 'medium_index', per_trial / 2])
-    timed("find_one (large, indexed)", find_one, [db, 'large_index', per_trial / 2])
-
-    timed("find (small, no index)", find, [db, 'small_none', per_trial / 2])
-    timed("find (medium, no index)", find, [db, 'medium_none', per_trial / 2])
-    timed("find (large, no index)", find, [db, 'large_none', per_trial / 2])
-
-    timed("find (small, indexed)", find, [db, 'small_index', per_trial / 2])
-    timed("find (medium, indexed)", find, [db, 'medium_index', per_trial / 2])
-    timed("find (large, indexed)", find, [db, 'large_index', per_trial / 2])
-
-#     timed("find range (small, no index)", find,
-#           [db, 'small_none', {"$gt": per_trial / 4, "$lt": 3 * per_trial / 4}])
-#     timed("find range (medium, no index)", find,
-#           [db, 'medium_none', {"$gt": per_trial / 4, "$lt": 3 * per_trial / 4}])
-#     timed("find range (large, no index)", find,
-#           [db, 'large_none', {"$gt": per_trial / 4, "$lt": 3 * per_trial / 4}])
-
-    timed("find range (small, indexed)", find,
-          [db, 'small_index', {"$gt": per_trial / 2, "$lt": per_trial / 2 + batch_size}])
-    timed("find range (medium, indexed)", find,
-          [db, 'medium_index', {"$gt": per_trial / 2, "$lt": per_trial / 2 + batch_size}])
-    timed("find range (large, indexed)", find,
-          [db, 'large_index', {"$gt": per_trial / 2, "$lt": per_trial / 2 + batch_size}])
+    connection = Connection()
+    runner = BenchmarkRunner(10000, connection.server_info()["gitVersion"])
+
+    runner.run_benchmark(Encode(small, "small"))
+    runner.run_benchmark(Encode(medium, "medium"))
+    runner.run_benchmark(Encode(large, "large"))
+
+    runner.run_benchmark(Decode(BSON.from_dict(small), "small"))
+    runner.run_benchmark(Decode(BSON.from_dict(medium), "medium"))
+    runner.run_benchmark(Decode(BSON.from_dict(large), "large"))
+
+    runner.run_benchmark(Insert(connection.benchmark, medium, "medium"))
+
+    runner.run_benchmark(FindOne(connection.benchmark.medium_no_index, {"x": 5000}, "medium"))
 
 if __name__ == "__main__":
-#    cProfile.run("main()")
     main()
diff --git a/tools/mongodb_benchmark_tools.py b/tools/mongodb_benchmark_tools.py
new file mode 100644
index 0000000000..283c8993d9
--- /dev/null
+++ b/tools/mongodb_benchmark_tools.py
@@ -0,0 +1,53 @@
+import os
+import urllib
+import urllib2
+try:
+    import json
+except:
+    import simplejson as json # need simplejson for python < 2.6
+
+import settings
+
+def machine_info(extra_info=""):
+    """Get a dict representing the "machine" section of a benchmark result.
+
+    ie:
+    {
+        "os_name": "OS X",
+        "os_version": "10.5",
+        "processor": "2.4 GHz Intel Core 2 Duo",
+        "memory": "3 GB 667 MHz DDR2 SDRAM",
+        "extra_info": "Python 2.6"
+    }
+
+    Must have a settings.py file on sys.path that defines "processor" and "memory"
+    variables.
+    """
+    machine = {}
+    (machine["os_name"], _, machine["os_version"], _, _) = os.uname()
+    machine["processor"] = settings.processor
+    machine["memory"] = settings.memory
+    machine["extra_info"] = extra_info
+    return machine
+
+def post_data(data, machine_extra_info="", post_url="http://mongo-db.appspot.com/benchmark"):
+    """Post a benchmark data point.
+
+    data should be a Python dict that looks like:
+        {
+          "benchmark": {
+            "project": "http://github.com/mongodb/mongo-python-driver",
+            "name": "insert test",
+            "description": "test inserting 10000 documents with the C extension enabled",
+            "tags": ["insert", "python"]
+          },
+          "trial": {
+            "server_hash": "4f5a8d52f47507a70b6c625dfb5dbfc87ba5656a",
+            "client_hash": "8bf2ad3d397cbde745fd92ad41c5b13976fac2b5",
+            "result": 67.5,
+            "extra_info": "some logs or something"
+          }
+        }
+    """
+    data["machine"] = machine_info(machine_extra_info)
+    urllib2.urlopen(post_url, urllib.urlencode({"payload": json.dumps(data)}))