Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ edition = "2021"

[dependencies]
anyhow = "1.0.89"
askama = "0.12.1"
axum = "0.7.7"
gethostname = "0.5.0"
log = "0.4.22"
prost = "0.13.3"
pyo3 = {version="0.22.3", features = ["extension-module"]}
Expand Down
58 changes: 55 additions & 3 deletions src/lighthouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,23 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

use core::net::SocketAddr;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;

use anyhow::Result;
use askama::Template;
use axum::{response::Html, routing::get, Router};
use gethostname::gethostname;
use log::{error, info};
use structopt::StructOpt;
use tokio::sync::broadcast;
use tokio::sync::Mutex;
use tokio::task::JoinSet;
use tokio::time::sleep;
use tonic::service::Routes;
use tonic::transport::Server;
use tonic::{Request, Response, Status};

Expand Down Expand Up @@ -191,11 +196,33 @@ impl Lighthouse {
}

async fn _run_grpc(self: Arc<Self>) -> Result<()> {
let bind = self.opt.bind.parse()?;
info!("Lighthouse listening on {}", bind);
let bind: SocketAddr = self.opt.bind.parse()?;
info!(
"Lighthouse listening on: http://{}:{}",
gethostname().into_string().unwrap(),
bind.port()
);

let self_clone = self.clone();

// Set up the HTTP endpoints
let app = Router::new()
.route(
"/",
get(|| async { Html(IndexTemplate {}.render().unwrap()) }),
)
.route(
"/status",
get(move || async { self_clone.get_status().await }),
);

// register the GRPC service
let routes = Routes::from(app).add_service(LighthouseServiceServer::new(self));

Server::builder()
.add_service(LighthouseServiceServer::new(self))
// allow non-GRPC connections
.accept_http1(true)
.add_routes(routes)
.serve(bind)
.await
.map_err(|e| e.into())
Expand All @@ -213,6 +240,19 @@ impl Lighthouse {
}
Ok(())
}

/// Renders the current lighthouse state as the HTML status fragment.
///
/// Copies the quorum id, previous quorum and heartbeat map out of the
/// shared state while holding the state lock, releases the lock, and only
/// then renders `StatusTemplate`.
///
/// Panics if rendering the template fails.
async fn get_status(self: Arc<Self>) -> Html<String> {
    let guard = self.state.lock().await;
    let snapshot = StatusTemplate {
        quorum_id: guard.quorum_id,
        prev_quorum: guard.prev_quorum.clone(),
        heartbeats: guard.heartbeats.clone(),
    };
    // Release the state lock before rendering so the render does not run under it.
    drop(guard);

    let body = snapshot.render().unwrap();
    Html(body)
}
}

#[tonic::async_trait]
Expand Down Expand Up @@ -271,6 +311,18 @@ impl LighthouseService for Arc<Lighthouse> {
}
}

/// Askama template for the dashboard landing page, loaded from `index.html`.
///
/// Carries no data; the `/` route handler renders it as-is.
#[derive(Template)]
#[template(path = "index.html")]
struct IndexTemplate {}

/// Askama template for the quorum status fragment, loaded from `status.html`.
///
/// Built by `get_status` from values copied out of the lighthouse state.
#[derive(Template)]
#[template(path = "status.html")]
struct StatusTemplate {
/// Previous quorum, if there is one; the template lists its participants.
prev_quorum: Option<Quorum>,
/// Value displayed as the current quorum id.
quorum_id: i64,
/// Per-replica `Instant`, keyed by replica id; the template prints the time
/// elapsed since it as "seen ...s ago".
heartbeats: HashMap<String, Instant>,
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
57 changes: 57 additions & 0 deletions templates/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<head>
<title>Lighthouse Dashboard - torchft</title>
<link rel="shortcut icon" type="image/x-icon" href="https://pytorch.org/favicon.ico?">
<style>
body {
margin: 0;
font-family: -apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";
font-size: 1rem;
font-weight: 400;
line-height: 1.5;
color: #212529;
text-align: left;
background-color: #fff;
}
h1, h2, h3, h4, h5, h6, .h1, .h2, .h3, .h4, .h5, .h6 {
margin-bottom: .5rem;
font-weight: 500;
line-height: 1.2;
}
/* Top banner: title on the left, logo on the right. */
header {
    background-color: rgba(0, 0, 0, 0.17);
    display: flex;
    align-items: center;
    /* Only one padding declaration: the earlier `padding: 10px` was always
       overridden by this one later in the same rule. */
    padding: 16px;
    justify-content: space-between;
}
header h1 {
display: inline-block;
margin: 0;
}
section {
max-width: 1280px;
padding: 16px;
margin: 0 auto;
}
.member {
display: inline-block;
margin: 10px;
padding: 10px;
border: 1px solid #333;
}
/* Highlights a heartbeat entry marked `old`. The selector must match the
   `heartbeat` class used on the status list items. */
.heartbeat.old {
    color: red;
}
</style>
<script src="https://unpkg.com/htmx.org@2.0.3"></script>
</head>

<header>
<h1>Lighthouse Dashboard - torchft</h1>
<img src="https://pytorch.org/assets/images/logo.svg" width="128"/>
</header>

<section hx-get="/status" hx-trigger="load, every 1s">
Loading...
</section>
38 changes: 38 additions & 0 deletions templates/status.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<h2>Quorum Status</h2>

Current quorum_id: {{quorum_id}}

<h3>Previous Quorum</h3>
{% if let Some(prev_quorum) = prev_quorum %}

Previous quorum id: {{prev_quorum.quorum_id}}

<div>
{% for member in prev_quorum.participants %}

<div class="member">
<b>{{ member.replica_id }}</b> <br/>
Step: {{ member.step }} <br/>
Manager: {{ member.address }} <br/>
TCPStore: {{ member.store_address }}
</div>

{% endfor %}
</div>

{% endif %}

<h3>Heartbeats</h3>

<ul>
{% for replica_id in heartbeats.keys() %}

{% let age = heartbeats[replica_id].elapsed().as_secs_f64() %}
<li class="heartbeat">
{{ replica_id }}: seen {{ age }}s ago
</li>

{% endfor %}
</ul>


Loading