Handling of website publication
Fixes #73
Alexandre Lissy committed Oct 24, 2016
1 parent b7cbf61 commit 781be5d
Showing 5 changed files with 276 additions and 5 deletions.
7 changes: 5 additions & 2 deletions DeepSpeech.ipynb
@@ -94,7 +94,9 @@
"ds_dataset_path = os.environ.get('ds_dataset_path', './data/ted')\n",
"\n",
"import importlib\n",
"ds_importer_module = importlib.import_module('util.importers.%s' % ds_importer)"
"ds_importer_module = importlib.import_module('util.importers.%s' % ds_importer)\n",
"\n",
"from util.website import maybe_publish"
]
},
{
@@ -1411,7 +1413,8 @@
},
"outputs": [],
"source": [
"merge_logs(logs_dir)"
"merge_logs(logs_dir)\n",
"maybe_publish()"
]
}
],
70 changes: 70 additions & 0 deletions README.website.md
@@ -0,0 +1,70 @@
Overview of the process for publishing WER
==========================================

WER is tracked using the following workflow:
* a dedicated user on the learning machine periodically runs training jobs
  (via cron, or manually)
* among other artifacts, this produces js/hyper.js, containing a
  concatenated version of all previous runs
* util/website.py contains code that connects to an SSH server using SFTP
* this publishes 'index.htm' and its dependencies
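
For a manual run, export the same environment variables the cron job below
uses and invoke the publisher directly; the values here are placeholders
(see util/website.py for the meaning of each):

```
export ds_website_username="u"
export ds_website_privkey="$HOME/.ssh/k"
export ds_website_server_fqdn="host.tld"
export ds_website_server_root="www/"
python util/website.py  # or: ./bin/update-website.sh
```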

# Setup of the dedicated user:

* Create a standard user
* Either rely on the system's TensorFlow or populate a virtualenv
  * Either way you might need to set the PYTHONPATH env variable (the
    example below does this for a system-wide TensorFlow installation)
* Install the pip dependencies (see the sketch after this list):
  * jupyter
  * BeautifulSoup4
  * GitPython
  * pysftp
  * xdg
  * requests
* Create the cron job:
```
SHELL=/bin/bash
PATH=/usr/local/bin:/usr/bin/:/bin
# Run WER every 5 mins
*/5 * * * * (mkdir -p $HOME/wer && cd $HOME/wer && source /usr/local/tensorflow-env/bin/activate && /usr/bin/curl -H "Cache-Control: no-cache" -L https://raw.githubusercontent.com/mozilla/DeepSpeech/website/util/automation.py | ds_website_username="u" ds_website_privkey="$HOME/.ssh/k" ds_website_server_fqdn="host.tld" ds_website_server_root="www/" ds_wer_automation="./bin/run-wer-automation.sh" python ; cd) 2>$HOME/.deepspeech_wer.err.log 1>$HOME/.deepspeech_wer.out.log
```
* The cron task takes care of:
  * checking whether there were any new merges
  * cloning the git repo and checking out those merges
  * scheduling sequential execution against those merges
    * the notebook is configured to automatically perform merging and
      upload if the proper environment variables are set, effectively
      updating the website on each iteration of the above process
  * saving the hyper.json files produced
  * wiping the cloned git repo
* A 'lock' file is created in ~/.cache/deepspeech_wer/ to ensure we do not
  trigger multiple executions at the same time. An unexpected exception
  might leave a stale lock file behind
* A 'last_sha1' file in the same directory keeps track of what was done
  last
* Logs of previous runs are saved to ~/.local/share/deepspeech_wer/
* For debugging purposes, `~/.deepspeech_wer.err.log` and
  `~/.deepspeech_wer.out.log` collect stderr and stdout
* Expose these environment variables (refer to util/website.py for details
  on each; the cron entry above does this):
  * ds_website_username
  * ds_website_privkey
  * ds_website_server_fqdn
  * ds_website_server_port
  * ds_website_server_root
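
A minimal sketch of the one-time setup for the dedicated user; the
virtualenv path matches the cron example above, and the lock file name is
an assumption:

```
# Create the virtualenv and install the pip dependencies listed above
virtualenv /usr/local/tensorflow-env
source /usr/local/tensorflow-env/bin/activate
pip install jupyter BeautifulSoup4 GitPython pysftp xdg requests

# If a run crashed and left a stale lock behind, remove it by hand
# ('lock' is an assumed file name, check ~/.cache/deepspeech_wer/)
rm -f ~/.cache/deepspeech_wer/lock
```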

# Setup of the web-facing server:

* Ensure the webroot exists
* Generate an SSH key, and upload the public key to the web-facing server
* Connect manually at least once from the training machine to the
  web-facing server, to accept the server's host key and populate the
  known_hosts file (pay attention to the FQDN)
* Make sure the server is configured with a proper DirectoryIndex (Apache,
  or the equivalent directive for other servers), whether system-wide or
  locally (with a .htaccess for example)
* Bootstrap with an empty index.htm (and populate .htaccess if needed); see
  the sketch after this list
* That should be all. After any big changes to the HTML codebase, make sure
  to clean up leftover files on the server.
9 changes: 9 additions & 0 deletions bin/update-website.sh
@@ -0,0 +1,9 @@
#!/bin/sh

#
# Trivial tool to wrap a manual update of the website
#

set -xe

python util/website.py
6 changes: 3 additions & 3 deletions index.htm
@@ -4,9 +4,9 @@
<meta charset="utf-8" />
<title>DeepSpeech reports</title>

<link rel="stylesheet" href="resources/bootstrap.min.css">
<link rel="stylesheet" href="resources/jquery-ui.min.css">
<link rel="stylesheet" href="resources/rickshaw.min.css">
<link rel="stylesheet" href="resources/bootstrap.min.css" />
<link rel="stylesheet" href="resources/jquery-ui.min.css" />
<link rel="stylesheet" href="resources/rickshaw.min.css" />

<script src="resources/jquery-3.1.1.min.js"></script>
<script src="resources/jquery-ui.min.js"></script>
189 changes: 189 additions & 0 deletions util/website.py
@@ -0,0 +1,189 @@
import os
import paramiko
import pysftp
import sys

from bs4 import BeautifulSoup
from log import merge_logs

def parse_for_deps(filename):
    """
    This takes an HTML file as input and outputs a list of existing dependencies.
    An empty output means something went wrong.
    A dependency is either a loaded JS script or a loaded CSS stylesheet.
    """

with open(filename, 'r') as code:
soup = BeautifulSoup(code.read(), 'html.parser')

external_links = filter(lambda x: x is not None, [ link.get('href') for link in soup.find_all('link') ])
external_scripts = filter(lambda x: x is not None, [ script.get('src') for script in soup.find_all('script') ])

    # Verify that each dependency exists with stat()
    try:
        all_resources = map(lambda x: os.stat(x), external_links + external_scripts)
    except OSError as ex:
        if ex.errno == 2:  # ENOENT: a dependency file does not exist
            print "Missing dependency", ex
            return []
        raise ex

    # Ensure each dependency can be opened for reading
try:
readable_resources = map(lambda x: os.open(x, os.O_RDONLY), external_links + external_scripts)
map(lambda x: os.close(x), readable_resources)
except OSError as ex:
print "Unreadable dependency", ex
return []

# It should be all good.
return external_links + external_scripts

def get_ssh(auth_infos):
"""
Connect to SSH server
"""
cinfo = {'host': auth_infos['ds_website_server_fqdn'],
'username': auth_infos['ds_website_username'],
'private_key': auth_infos['ds_website_privkey'],
'port': auth_infos['ds_website_server_port']}
return pysftp.Connection(**cinfo)

def verify_ssh_dir(auth_infos):
    """
    This ensures that the SSH connection works and that the directory where
    we want to push data is either empty or contains at least a .htaccess
    and an index.htm file
    """
    print "Checking connection: %s@%s:%s (port %d)" % (auth_infos['ds_website_username'], auth_infos['ds_website_server_fqdn'], auth_infos['ds_website_server_root'], auth_infos['ds_website_server_port'])

try:
with get_ssh(auth_infos) as sftp:
with sftp.cd(auth_infos['ds_website_server_root']):
files = sftp.listdir()
if len(files) == 0 or (('.htaccess' in files) and ('index.htm' in files)):
return True
else:
print "Invalid content for %s:" % (auth_infos['ds_website_server_root']), files
return False
except paramiko.ssh_exception.AuthenticationException as ex:
print "Authentication error, please verify credentials"
return False
except IOError as ex:
print "Unable to read a file (private key or invalid path on server ?)", ex
return False

# Should not be reached
return False

def push_files_sftp(files, auth_infos):
"""
This will push all of the ``files`` listed to the server root folder.
"""

    # Directories we might need to create on the server
dirs = filter(lambda x: len(x) > 0, list(set([ os.path.dirname(x) for x in files ])))

created = []
pushed = []

try:
with get_ssh(auth_infos) as sftp:
with sftp.cd(auth_infos['ds_website_server_root']):
                # Create dirs as needed; they should all be relative to the root
for dir in dirs:
if not sftp.isdir(dir):
print "Creating directory", dir
sftp.makedirs(dir)
created.append(dir)

# Push all files, chdir() for each
for fil in files:
with sftp.cd(os.path.dirname(fil)):
print "Pushing", fil
sftp.put(fil)
pushed.append(fil)

return pushed
    except paramiko.ssh_exception.AuthenticationException as ex:
        print "Authentication error, please verify credentials"
        return []  # callers check len() on the result, so never return False
    except IOError as ex:
        print "Unable to read a file (private key, or invalid path on the server?)", ex
        return []

    # Should not be reached
    return []

def maybe_publish(file='index.htm'):
    """
    Publishing to the web requires the following env variables to be set:
    ds_website_username:    the SSH username to connect with
    ds_website_privkey:     the SSH private key filename to use for the connection
    ds_website_server_fqdn: hostname of the SSH server
    ds_website_server_port: port of the SSH server, defaults to 22
    ds_website_server_root: directory on the server where data is pushed;
                            should be either empty, or contain at least
                            a .htaccess and an index.htm file
    """

ssh_auth_infos = {
'ds_website_username': '',
'ds_website_privkey': '',
'ds_website_server_fqdn': '',
'ds_website_server_port': 22,
'ds_website_server_root': ''
}

for key in ssh_auth_infos.keys():
vartype = type(ssh_auth_infos[key])
value = os.environ.get(key)
if value is not None:
if vartype == str:
ssh_auth_infos[key] = str(os.environ.get(key))
elif vartype == int:
try:
ssh_auth_infos[key] = int(os.environ.get(key))
                except (TypeError, ValueError) as ex:  # int() raises ValueError on non-numeric input
                    print "WARNING:", "Keeping default SSH port value because of:", ex

missing_env = filter(lambda x: len(str(ssh_auth_infos[x])) == 0, ssh_auth_infos.keys())
if len(missing_env) > 0:
print "Not publishing, missing some required environment variables:", missing_env
print "But maybe this is what you wanted, after all ..."
return False

merge_logs("logs")

all_deps = parse_for_deps(file)

if len(all_deps) == 0:
print "Problem during deps computation, aborting"
return False

if not verify_ssh_dir(ssh_auth_infos):
print "Problem during SSH directory verification, aborting"
return False

all_files = [ file ] + all_deps
uploaded = push_files_sftp(all_files, ssh_auth_infos)
if len(uploaded) == 0:
print "Unable to upload anything"
return False
    elif len(uploaded) != len(all_files):
        print "Only a partial upload was completed:"
        print "all_files=", all_files
        print "uploaded=", uploaded
    else:
        print "Upload completed successfully."

return True

# Support CLI invocation
if __name__ == "__main__":
if maybe_publish():
print "All good!"
sys.exit(0)
else:
print "Error happened ..."
sys.exit(1)
