Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
egress ip: conditionally use grpc for health monitoring
When the ovnkube container, in both master and node pods, is started with the newly introduced flag 'egressip-node-healthcheck-port', the egress IP implementation uses gRPC on that port for node health monitoring. Signed-off-by: Flavio Fernandes <flaviof@redhat.com>
- Loading branch information
1 parent
c48e6c5
commit 2773b7f
Showing
5 changed files
with
242 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
166 changes: 166 additions & 0 deletions
166
go-controller/pkg/ovn/healthcheck/egressip_healthcheck.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
package healthcheck | ||
|
||
import ( | ||
"fmt" | ||
"net" | ||
"sync" | ||
|
||
"golang.org/x/net/context" | ||
"google.golang.org/grpc" | ||
"google.golang.org/grpc/credentials/insecure" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
// ServiceEgressIpNode is the service name carried in egress IP health check
// requests. Probe sends it, and Check reports SERVING only for this name.
const ServiceEgressIpNode = "Service_Egress_IP"
|
||
// UnimplementedHealthServer must be embedded to have forward compatible implementations. | ||
type healthServer struct { | ||
UnimplementedHealthServer | ||
} | ||
|
||
func (healthServer) Check(_ context.Context, req *HealthCheckRequest) (*HealthCheckResponse, error) { | ||
response := HealthCheckResponse{} | ||
|
||
if req.GetService() == ServiceEgressIpNode { | ||
response.Status = HealthCheckResponse_SERVING | ||
} else { | ||
response.Status = HealthCheckResponse_NOT_SERVING | ||
} | ||
return &response, nil | ||
} | ||
|
||
// egressIPHealthServer holds the listen parameters of the egress IP health
// check gRPC server started by Run.
type egressIPHealthServer struct {
	// node_mgmt_ip is the management port IP address the server binds to.
	node_mgmt_ip net.IP

	// health_check_port is the EgressIP node reachability gRPC port
	// (0 means it should use dial instead).
	health_check_port int
}
|
||
func NewEgressIPHealthServer(node_mgmt_ip net.IP, health_check_port int) (*egressIPHealthServer, error) { | ||
return &egressIPHealthServer{ | ||
node_mgmt_ip: node_mgmt_ip, | ||
health_check_port: health_check_port, | ||
}, nil | ||
} | ||
|
||
func (ehs *egressIPHealthServer) Run(stopCh <-chan struct{}) { | ||
lis, err := net.Listen("tcp", fmt.Sprintf("%s:%d", ehs.node_mgmt_ip.String(), ehs.health_check_port)) | ||
if err != nil { | ||
klog.Fatalf("Health checking listen failed: %v", err) | ||
return | ||
} | ||
|
||
wg := &sync.WaitGroup{} | ||
|
||
// TODO (FF): add TLS credentials support | ||
grpcServer := grpc.NewServer() | ||
|
||
wg.Add(1) | ||
go func() { | ||
defer wg.Done() | ||
|
||
RegisterHealthServer(grpcServer, &healthServer{}) | ||
klog.Infof("Starting Egress IP Health Server on %s:%d", ehs.node_mgmt_ip.String(), ehs.health_check_port) | ||
if err := grpcServer.Serve(lis); err != nil && err != grpc.ErrServerStopped { | ||
klog.Fatalf("Health checking serve failed: %v", err) | ||
} | ||
klog.Infof("Stopped Egress IP Health Server on %s:%d", ehs.node_mgmt_ip.String(), ehs.health_check_port) | ||
}() | ||
|
||
<-stopCh | ||
|
||
klog.Info("Shutting down Egress IP Health Server") | ||
grpcServer.Stop() | ||
wg.Wait() | ||
klog.Info("Shut down Egress IP Health Server") | ||
} | ||
|
||
// EgressIPHealthClient is the client side of the egress IP node health check.
type EgressIPHealthClient interface {
	// Connect tries to establish a connection to one of mgmtIPs on
	// health_check_port and reports whether it succeeded.
	Connect(dial_ctx context.Context, mgmtIPs []net.IP, health_check_port int) bool
	// IsConnected reports whether the client currently holds a connection.
	IsConnected() bool
	// Probe performs one health check over the current connection and
	// reports whether the node should be considered reachable.
	Probe(dial_ctx context.Context) bool
	// Disconnect closes the current connection, if any.
	Disconnect()
}
|
||
type egressIPHealthClient struct { | ||
nodeName string // debug | ||
nodeAddr string // debug | ||
conn *grpc.ClientConn | ||
// the probe_failed state is used to mitigate situations when | ||
// connection just went down. With that, we do not declare node | ||
// unreachable unless connection could not be re-established | ||
probe_failed bool | ||
} | ||
|
||
func NewEgressIPHealthClient(nodeName string) EgressIPHealthClient { | ||
return &egressIPHealthClient{nodeName: nodeName} | ||
} | ||
|
||
func (ehc *egressIPHealthClient) IsConnected() bool { | ||
return ehc.conn != nil | ||
} | ||
|
||
func (ehc *egressIPHealthClient) Connect(dial_ctx context.Context, mgmtIPs []net.IP, health_check_port int) bool { | ||
|
||
var conn *grpc.ClientConn | ||
var node_addr string | ||
var err error | ||
|
||
for _, node_mgmt_ip := range mgmtIPs { | ||
options := []grpc.DialOption{ | ||
grpc.WithBlock(), | ||
// TODO (FF): replace with TLS credentials | ||
grpc.WithTransportCredentials(insecure.NewCredentials()), | ||
} | ||
node_addr = fmt.Sprintf("%s:%d", node_mgmt_ip.String(), health_check_port) | ||
conn, err = grpc.DialContext(dial_ctx, node_addr, options...) | ||
if err == nil && conn != nil { | ||
break | ||
} | ||
} | ||
if conn == nil { | ||
klog.Warningf("Could not connect to %s (%s): %v", ehc.nodeName, node_addr, err) | ||
return false | ||
} | ||
|
||
klog.Infof("Connected to %s (%s)", ehc.nodeName, node_addr) | ||
ehc.nodeAddr = node_addr | ||
ehc.conn = conn | ||
return true | ||
} | ||
|
||
func (ehc *egressIPHealthClient) Disconnect() { | ||
if ehc.conn != nil { | ||
klog.Infof("Closing connection with %s (%s)", ehc.nodeName, ehc.nodeAddr) | ||
ehc.conn.Close() | ||
ehc.conn = nil | ||
} | ||
} | ||
|
||
func (ehc *egressIPHealthClient) Probe(dial_ctx context.Context) bool { | ||
|
||
if ehc.conn == nil { | ||
// should never happen | ||
klog.Warningf("Unexpected probing before connecting %s", ehc.nodeName) | ||
return false | ||
} | ||
|
||
response, err := NewHealthClient(ehc.conn).Check(dial_ctx, &HealthCheckRequest{Service: ServiceEgressIpNode}) | ||
if err != nil { | ||
// check failed. What we will return here will depend on ehc.probe_failed. If this is the first failure, | ||
// let's tolerate it to account for cases where session went down and we just need it re-established. | ||
// Otherwise, declare it failed. | ||
klog.V(5).Infof("Probe failed %s (%s): %s", ehc.nodeName, ehc.nodeAddr, err) | ||
ehc.Disconnect() | ||
prev_probe_failed := ehc.probe_failed | ||
ehc.probe_failed = true | ||
return !prev_probe_failed | ||
} | ||
|
||
ehc.probe_failed = false | ||
klog.V(5).Infof("Got response from %s (%s): %v", ehc.nodeName, ehc.nodeAddr, response.GetStatus()) | ||
return response.GetStatus() == HealthCheckResponse_SERVING | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters