Handling of website publication
Fixes #73
Alexandre Lissy committed Oct 24, 2016
1 parent b7cbf61 commit 781be5d
Showing 5 changed files with 276 additions and 5 deletions.
7 changes: 5 additions & 2 deletions DeepSpeech.ipynb
@@ -94,7 +94,9 @@
"ds_dataset_path = os.environ.get('ds_dataset_path', './data/ted')\n",
"\n",
"import importlib\n",
"ds_importer_module = importlib.import_module('util.importers.%s' % ds_importer)"
"ds_importer_module = importlib.import_module('util.importers.%s' % ds_importer)\n",
"\n",
"from util.website import maybe_publish"
]
},
{
@@ -1411,7 +1413,8 @@
},
"outputs": [],
"source": [
"merge_logs(logs_dir)"
"merge_logs(logs_dir)\n",
"maybe_publish()"
]
}
],
70 changes: 70 additions & 0 deletions README.website.md
@@ -0,0 +1,70 @@
Overview of the process for publishing WER
==========================================

WER is tracked using the following workflow:
* a dedicated user on the learning machine periodically runs training jobs
  (via cron, or manually)
* among other artifacts, this produces js/hyper.js, containing a
  concatenated version of all previous runs
* util/website.py contains code that connects to an SSH server using SFTP
* this publishes 'index.htm' and its dependencies
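
For a manual run, export the same environment variables the cron job below
uses and invoke the publisher directly; the values here are placeholders
(see util/website.py for the meaning of each):

```
export ds_website_username="u"
export ds_website_privkey="$HOME/.ssh/k"
export ds_website_server_fqdn="host.tld"
export ds_website_server_root="www/"
python util/website.py  # or: ./bin/update-website.sh
```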

# Setup of the dedicated user:

* Create a standard user
* Either rely on the system's TensorFlow or populate a virtualenv
  * Either way you might need to set the PYTHONPATH env variable (the
    example below does this for a system-wide TensorFlow installation)
* Install the pip dependencies (see the sketch after this list):
  * jupyter
  * BeautifulSoup4
  * GitPython
  * pysftp
  * xdg
  * requests
* Create the cron job:
```
SHELL=/bin/bash
PATH=/usr/local/bin:/usr/bin/:/bin
# Run WER every 5 mins
*/5 * * * * (mkdir -p $HOME/wer && cd $HOME/wer && source /usr/local/tensorflow-env/bin/activate && /usr/bin/curl -H "Cache-Control: no-cache" -L https://raw.githubusercontent.com/mozilla/DeepSpeech/website/util/automation.py | ds_website_username="u" ds_website_privkey="$HOME/.ssh/k" ds_website_server_fqdn="host.tld" ds_website_server_root="www/" ds_wer_automation="./bin/run-wer-automation.sh" python ; cd) 2>$HOME/.deepspeech_wer.err.log 1>$HOME/.deepspeech_wer.out.log
```
* The cron task takes care of:
  * checking whether there were any new merges
  * cloning the git repo and checking out those merges
  * scheduling sequential execution against those merges
    * the notebook is configured to automatically perform merging and
      upload if the proper environment variables are set, effectively
      updating the website on each iteration of the above process
  * saving the hyper.json files produced
  * wiping the cloned git repo
* A 'lock' file is created in ~/.cache/deepspeech_wer/ to ensure we do not
  trigger multiple executions at the same time. An unexpected exception
  might leave a stale lock file behind
* A 'last_sha1' file in the same directory keeps track of what was done
  last
* Logs of previous runs are saved to ~/.local/share/deepspeech_wer/
* For debugging purposes, `~/.deepspeech_wer.err.log` and
  `~/.deepspeech_wer.out.log` collect stderr and stdout
* Expose these environment variables (refer to util/website.py for details
  on each; the cron entry above does this):
  * ds_website_username
  * ds_website_privkey
  * ds_website_server_fqdn
  * ds_website_server_port
  * ds_website_server_root
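
A minimal sketch of the one-time setup for the dedicated user; the
virtualenv path matches the cron example above, and the lock file name is
an assumption:

```
# Create the virtualenv and install the pip dependencies listed above
virtualenv /usr/local/tensorflow-env
source /usr/local/tensorflow-env/bin/activate
pip install jupyter BeautifulSoup4 GitPython pysftp xdg requests

# If a run crashed and left a stale lock behind, remove it by hand
# ('lock' is an assumed file name, check ~/.cache/deepspeech_wer/)
rm -f ~/.cache/deepspeech_wer/lock
```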

# Setup of the web-facing server:

* Ensure the webroot exists
* Generate an SSH key, and upload the public key to the web-facing server
* Connect manually at least once from the training machine to the
  web-facing server, to accept the server's host key and populate the
  known_hosts file (pay attention to the FQDN)
* Make sure the server is configured with a proper DirectoryIndex (Apache,
  or the equivalent directive for other servers), whether system-wide or
  locally (with a .htaccess for example)
* Bootstrap with an empty index.htm (and populate .htaccess if needed); see
  the sketch after this list
* That should be all. After any big changes to the HTML codebase, make sure
  to clean up leftover files on the server.
9 changes: 9 additions & 0 deletions bin/update-website.sh
@@ -0,0 +1,9 @@
#!/bin/sh

#
# Trivial tool to wrap a manual update of the website
#

set -xe

python util/website.py
6 changes: 3 additions & 3 deletions index.htm
@@ -4,9 +4,9 @@
<meta charset="utf-8" />
<title>DeepSpeech reports</title>

<link rel="stylesheet" href="resources/bootstrap.min.css">
<link rel="stylesheet" href="resources/jquery-ui.min.css">
<link rel="stylesheet" href="resources/rickshaw.min.css">
<link rel="stylesheet" href="resources/bootstrap.min.css" />
<link rel="stylesheet" href="resources/jquery-ui.min.css" />
<link rel="stylesheet" href="resources/rickshaw.min.css" />

<script src="resources/jquery-3.1.1.min.js"></script>
<script src="resources/jquery-ui.min.js"></script>
189 changes: 189 additions & 0 deletions util/website.py
@@ -0,0 +1,189 @@
import os
import paramiko
import pysftp
import sys

from bs4 import BeautifulSoup
from log import merge_logs

def parse_for_deps(filename):
    """
    This takes an HTML file as input and outputs a list of existing dependencies.
    An empty output means something went wrong.
    A dependency is either a loaded JS script or a loaded CSS stylesheet.
    """

with open(filename, 'r') as code:
soup = BeautifulSoup(code.read(), 'html.parser')

external_links = filter(lambda x: x is not None, [ link.get('href') for link in soup.find_all('link') ])
external_scripts = filter(lambda x: x is not None, [ script.get('src') for script in soup.find_all('script') ])

    # Verify that each dependency exists with stat()
    try:
        all_resources = map(lambda x: os.stat(x), external_links + external_scripts)
    except OSError as ex:
        if ex.errno == 2:  # ENOENT: a dependency file does not exist
            print "Missing dependency", ex
            return []
        raise ex

    # Ensure each dependency can be opened for reading
try:
readable_resources = map(lambda x: os.open(x, os.O_RDONLY), external_links + external_scripts)
map(lambda x: os.close(x), readable_resources)
except OSError as ex:
print "Unreadable dependency", ex
return []

# It should be all good.
return external_links + external_scripts

def get_ssh(auth_infos):
"""
Connect to SSH server
"""
cinfo = {'host': auth_infos['ds_website_server_fqdn'],
'username': auth_infos['ds_website_username'],
'private_key': auth_infos['ds_website_privkey'],
'port': auth_infos['ds_website_server_port']}
return pysftp.Connection(**cinfo)

def verify_ssh_dir(auth_infos):
    """
    This ensures that the SSH connection works and that the directory where
    we want to push data is either empty or contains at least a .htaccess
    and an index.htm file
    """
    print "Checking connection: %s@%s:%s (port %d)" % (auth_infos['ds_website_username'], auth_infos['ds_website_server_fqdn'], auth_infos['ds_website_server_root'], auth_infos['ds_website_server_port'])

try:
with get_ssh(auth_infos) as sftp:
with sftp.cd(auth_infos['ds_website_server_root']):
files = sftp.listdir()
if len(files) == 0 or (('.htaccess' in files) and ('index.htm' in files)):
return True
else:
print "Invalid content for %s:" % (auth_infos['ds_website_server_root']), files
return False
except paramiko.ssh_exception.AuthenticationException as ex:
print "Authentication error, please verify credentials"
return False
except IOError as ex:
print "Unable to read a file (private key or invalid path on server ?)", ex
return False

# Should not be reached
return False

def push_files_sftp(files, auth_infos):
"""
This will push all of the ``files`` listed to the server root folder.
"""

    # Directories we might need to create on the server
dirs = filter(lambda x: len(x) > 0, list(set([ os.path.dirname(x) for x in files ])))

created = []
pushed = []

try:
with get_ssh(auth_infos) as sftp:
with sftp.cd(auth_infos['ds_website_server_root']):
                # Create dirs as needed; they should all be relative to the root
for dir in dirs:
if not sftp.isdir(dir):
print "Creating directory", dir
sftp.makedirs(dir)
created.append(dir)

# Push all files, chdir() for each
for fil in files:
with sftp.cd(os.path.dirname(fil)):
print "Pushing", fil
sftp.put(fil)
pushed.append(fil)

return pushed
    except paramiko.ssh_exception.AuthenticationException as ex:
        print "Authentication error, please verify credentials"
        return []  # callers check len() on the result, so never return False
    except IOError as ex:
        print "Unable to read a file (private key, or invalid path on the server?)", ex
        return []

    # Should not be reached
    return []

def maybe_publish(file='index.htm'):
    """
    Publishing to the web requires the following env variables to be set:
    ds_website_username:    the SSH username to connect with
    ds_website_privkey:     the SSH private key filename to use for the connection
    ds_website_server_fqdn: hostname of the SSH server
    ds_website_server_port: port of the SSH server, defaults to 22
    ds_website_server_root: directory on the server where data is pushed;
                            should be either empty, or contain at least
                            a .htaccess and an index.htm file
    """

ssh_auth_infos = {
'ds_website_username': '',
'ds_website_privkey': '',
'ds_website_server_fqdn': '',
'ds_website_server_port': 22,
'ds_website_server_root': ''
}

for key in ssh_auth_infos.keys():
vartype = type(ssh_auth_infos[key])
value = os.environ.get(key)
if value is not None:
if vartype == str:
ssh_auth_infos[key] = str(os.environ.get(key))
elif vartype == int:
try:
ssh_auth_infos[key] = int(os.environ.get(key))
                except (TypeError, ValueError) as ex:  # int() raises ValueError on non-numeric input
                    print "WARNING:", "Keeping default SSH port value because of:", ex

missing_env = filter(lambda x: len(str(ssh_auth_infos[x])) == 0, ssh_auth_infos.keys())
if len(missing_env) > 0:
print "Not publishing, missing some required environment variables:", missing_env
print "But maybe this is what you wanted, after all ..."
return False

merge_logs("logs")

all_deps = parse_for_deps(file)

if len(all_deps) == 0:
print "Problem during deps computation, aborting"
return False

if not verify_ssh_dir(ssh_auth_infos):
print "Problem during SSH directory verification, aborting"
return False

all_files = [ file ] + all_deps
uploaded = push_files_sftp(all_files, ssh_auth_infos)
if len(uploaded) == 0:
print "Unable to upload anything"
return False
    elif len(uploaded) != len(all_files):
        print "Only a partial upload was completed:"
        print "all_files=", all_files
        print "uploaded=", uploaded
    else:
        print "Upload completed successfully."

return True

# Support CLI invocation
if __name__ == "__main__":
if maybe_publish():
print "All good!"
sys.exit(0)
else:
print "Error happened ..."
sys.exit(1)
