Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[homematic] Improve (re)connect handling to Homematic gateways #11429

Merged
merged 12 commits into from
Dec 11, 2021
Merged
14 changes: 14 additions & 0 deletions bundles/org.openhab.binding.homematic/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ The port number of the HMIP server (default = 2010)
- **cuxdPort**
The port number of the CUxD daemon (default = 8701)

- **groupPort**
The port number of the Group daemon (default = 9292)

- **callbackRegTimeout**
Maximum time in seconds for callback registration in the Homematic gateway (default = 120s).
For a CCU2, the value may need to be increased to 180s.

- **installModeDuration**
Time in seconds that the controller will be in install mode when a device discovery is initiated (default = 60)

Expand Down Expand Up @@ -676,6 +683,13 @@ Examples: HmIP-BROLL, HmIP-FROLL, HmIP-BBL, HmIP-FBL and HmIP-DRBLI4
| openHAB | 0% | 100% |
| CCU | 100% | 0% |

** The binding does not receive any status changes from the Homematic gateway**

First of all, make sure that none of the ports needed to receive status changes from the gateway are blocked by firewall settings.

If the computer running openHAB has more than one IP address, a wrong one may have been set as receiver for status changes.
In this case change the setting for `callbackHost` to the correct address.

### Debugging and Tracing

If you want to see what's going on in the binding, switch the log level to DEBUG in the Karaf console
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
public class HomematicBindingConstants {

public static final String BINDING_ID = "homematic";
public static final String GATEWAY_POOL_NAME = "homematicGateway";
public static final ThingTypeUID THING_TYPE_BRIDGE = new ThingTypeUID(BINDING_ID, "bridge");
public static final String CONFIG_DESCRIPTION_URI_CHANNEL = "channel-type:homematic:config";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ public class HomematicConfig {
private int bufferSize = 2048;

private HmGatewayInfo gatewayInfo;
private int callbackRegistrationRetries;
private int callbackRegTimeout;

/**
* Returns the Homematic gateway address.
Expand Down Expand Up @@ -115,6 +117,20 @@ public void setBinCallbackPort(int binCallbackPort) {
this.binCallbackPort = binCallbackPort;
}

/**
* Sets timeout for callback registration.
*/
public void setCallbackRegTimeout(int timeout) {
this.callbackRegTimeout = timeout;
}

/**
* Returns timeout for callback registrations.
*/
public int getCallbackRegTimeout() {
return callbackRegTimeout;
}

/**
* Returns the HmGatewayInfo.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
*/
package org.openhab.binding.homematic.internal.communicator;

import static org.openhab.binding.homematic.internal.HomematicBindingConstants.*;
import static org.openhab.binding.homematic.internal.misc.HomematicConstants.*;

import java.io.IOException;
Expand Down Expand Up @@ -87,8 +88,8 @@
public abstract class AbstractHomematicGateway implements RpcEventListener, HomematicGateway, VirtualGateway {
private final Logger logger = LoggerFactory.getLogger(AbstractHomematicGateway.class);
public static final double DEFAULT_DISABLE_DELAY = 2.0;
private static final long RESTART_DELAY = 30;
private static final long CONNECTION_TRACKER_INTERVAL_SECONDS = 15;
private static final String GATEWAY_POOL_NAME = "homematicGateway";

private final Map<TransferMode, RpcClient<?>> rpcClients = new HashMap<>();
private final Map<TransferMode, RpcServer> rpcServers = new HashMap<>();
Expand Down Expand Up @@ -189,6 +190,7 @@ public void initialize() throws IOException {
logger.debug("Used Homematic transfer modes: {}", sb.toString());
startClients();
startServers();
registerCallbacks();

if (!config.getGatewayInfo().isHomegear()) {
// delay the newDevice event handling at startup, reduces some API calls
Expand All @@ -211,7 +213,7 @@ public void dispose() {
stopWatchdogs();
sendDelayedExecutor.stop();
receiveDelayedExecutor.stop();
stopServers();
stopServers(true);
stopClients();
devices.clear();
echoEvents.clear();
Expand Down Expand Up @@ -253,25 +255,29 @@ private synchronized void startServers() throws IOException {
rpcServer.start();
}
}
}

private void registerCallbacks() throws IOException {
for (HmInterface hmInterface : availableInterfaces.keySet()) {
getRpcClient(hmInterface).init(hmInterface, hmInterface.toString() + "-" + id);
getRpcClient(hmInterface).init(hmInterface);
}
}

/**
* Stops the Homematic RPC server.
*/
private synchronized void stopServers() {
for (HmInterface hmInterface : availableInterfaces.keySet()) {
try {
getRpcClient(hmInterface).release(hmInterface);
} catch (IOException ex) {
// recoverable exception, therefore only debug
logger.debug("Unable to release the connection to the gateway with id '{}': {}", id, ex.getMessage(),
ex);
private synchronized void stopServers(boolean releaseConnection) {
if (releaseConnection) {
for (HmInterface hmInterface : availableInterfaces.keySet()) {
try {
getRpcClient(hmInterface).release(hmInterface);
} catch (IOException ex) {
// recoverable exception, therefore only debug
logger.debug("Unable to release the connection to the gateway with id '{}': {}", id,
ex.getMessage(), ex);
}
}
}

for (TransferMode mode : rpcServers.keySet()) {
rpcServers.get(mode).shutdown();
}
Expand Down Expand Up @@ -896,10 +902,14 @@ private int translateFlags(boolean reset, boolean force, boolean defer) {
*/
private class ConnectionTrackerThread implements Runnable {
private boolean connectionLost;
private boolean reStartPending = false;

@Override
public void run() {
try {
if (reStartPending) {
return;
}
ListBidcosInterfacesParser parser = getRpcClient(getDefaultInterface())
.listBidcosInterfaces(getDefaultInterface());
Integer dutyCycleRatio = parser.getDutyCycleRatio();
Expand All @@ -920,6 +930,11 @@ private void connectionConfirmed() {
if (connectionLost) {
connectionLost = false;
logger.info("Connection resumed on gateway '{}'", id);
try {
registerCallbacks();
} catch (IOException e) {
logger.warn("Connection only partially restored. It is recommended to restart the binding");
}
gatewayAdapter.onConnectionResumed();
}
}
Expand All @@ -930,10 +945,19 @@ private void handleInvalidConnection(String cause) throws IOException {
logger.warn("Connection lost on gateway '{}', cause: \"{}\"", id, cause);
gatewayAdapter.onConnectionLost();
}
stopServers();
stopServers(false);
stopClients();
startClients();
startServers();
reStartPending = true;
logger.debug("Waiting {}s until restart attempt", RESTART_DELAY);
scheduler.schedule(() -> {
try {
startClients();
startServers();
} catch (IOException e) {
logger.debug("Restart failed: {}", e.getMessage());
}
reStartPending = false;
}, RESTART_DELAY, TimeUnit.SECONDS);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public BinRpcClient(HomematicConfig config) {

@Override
public void dispose() {
super.dispose();
socketHandler.flush();
}

Expand All @@ -54,8 +55,8 @@ protected String getRpcCallbackUrl() {
}

@Override
public void init(HmInterface hmInterface, String clientId) throws IOException {
super.init(hmInterface, clientId);
public void init(HmInterface hmInterface) throws IOException {
super.init(hmInterface);
socketHandler.removeSocket(config.getRpcPort(hmInterface));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.openhab.binding.homematic.internal.HomematicBindingConstants;
import org.openhab.binding.homematic.internal.common.HomematicConfig;
Expand All @@ -41,6 +48,7 @@
import org.openhab.binding.homematic.internal.model.HmInterface;
import org.openhab.binding.homematic.internal.model.HmParamsetType;
import org.openhab.binding.homematic.internal.model.HmRssiInfo;
import org.openhab.core.common.ThreadPoolManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -53,18 +61,18 @@ public abstract class RpcClient<T> {
private final Logger logger = LoggerFactory.getLogger(RpcClient.class);
protected static final int MAX_RPC_RETRY = 3;
protected static final int RESP_BUFFER_SIZE = 8192;
private static final int INITIAL_CALLBACK_REG_DELAY = 20; // 20 s before first attempt
private static final int CALLBACK_REG_DELAY = 10; // 10 s between two attempts

protected HomematicConfig config;
private String thisUID = UUID.randomUUID().toString();
private ScheduledFuture<?> future = null;
private int attempt;

public RpcClient(HomematicConfig config) {
this.config = config;
}

/**
* Disposes the client.
*/
public abstract void dispose();

/**
* Returns a RpcRequest for this client.
*/
Expand All @@ -83,14 +91,48 @@ public RpcClient(HomematicConfig config) {
/**
* Register a callback for the specified interface where the Homematic gateway can send its events.
*/
public void init(HmInterface hmInterface, String clientId) throws IOException {
public void init(HmInterface hmInterface) throws IOException {
ScheduledExecutorService scheduler = ThreadPoolManager.getScheduledPool(GATEWAY_POOL_NAME);
RpcRequest<T> request = createRpcRequest("init");
request.addArg(getRpcCallbackUrl());
request.addArg(clientId);
request.addArg(thisUID);
if (config.getGatewayInfo().isHomegear()) {
request.addArg(Integer.valueOf(0x22));
}
sendMessage(config.getRpcPort(hmInterface), request);
logger.debug("Register callback for interface {}", hmInterface.getName());
try {
attempt = 1;
sendMessage(config.getRpcPort(hmInterface), request); // first attempt without delay
} catch (IOException e) {
future = scheduler.scheduleWithFixedDelay(() -> {
logger.debug("Register callback for interface {}, attempt {}", hmInterface.getName(), ++attempt);
try {
sendMessage(config.getRpcPort(hmInterface), request);
future.cancel(true);
} catch (IOException ex) {
// Ignore, retry
}
}, INITIAL_CALLBACK_REG_DELAY, CALLBACK_REG_DELAY, TimeUnit.SECONDS);
try {
future.get(config.getCallbackRegTimeout(), TimeUnit.SECONDS);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get() will block forever when called on a future returned by scheduleWithFixedDelay(). Excerpt from the Javadoc:

Returns: a ScheduledFuture representing pending completion ofthe series of repeated tasks. The future's get() method will never return normally,and will throw an exception upon task cancellation orabnormal termination of a task execution.

This is called from the handler's initialize(), thus initialize() will block forever. Why not removing the entire try block?

You need to cancel the task when the handler is disposed.

Are you registering the callbacks repeatedly by intention?

Copy link
Contributor Author

@MHerbst MHerbst Dec 11, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is called from the handler's initialize(), thus initialize() will block forever. Why not removing the entire try block?
I have tried it, but without the blocking get, there can be up to 4 parallel requests to different ports (and services) on the Homematic gateway and the gateway simply does not like this. I have tried it this way and it did not work. In some tests I even had to hard reset the gateway because it was completely stuck.

The current implementation of the binding needs a more or less synchronous execution.
I am not happy with the solution, but with the current implementation, I really see no other way to implement it :-(.

This is called from the handler's initialize(), thus initialize()

As far as I can see the initialize in the Bridge handler (HomematicBridgeHandler#initialize) is not blocked and finishes fast (I have verified it in my tests).

Are you registering the callbacks repeatedly by intention?

The callback registration is executed after the binding was started or after the connection to the HM gateway was lost (e.g. if the gateway was restarted). In the first case, the first registration request will be successful and the scheduler is not used.

In the latter case, the binding performs a complete reconnect to the Homematic gateway. This includes a new registration of the callbacks. For each possible connection type (HM-RF, HmIP, Groups, CuxD) a separate callback must be registered because on the gateway, a service (daemon) is executed for each connection type listening on different ports. The HmIP service e.g. needs up to 2 min. until it is ready for callback registration requests. Therefore, the binding needs to perform these retries. There is no other way to figure out when a service on the gateway is ready to accept requests.

It is even worse. In the first seconds after the gateway restart you even don't get a correct reply to the XML RPC requests. Instead the gateway returns a HTML document with a message "please wait ...".

In my opinion, major parts of the binding's implementation need a complete overhaul. I would also be better to create a bridge for each of the possible connection types (would be a breaking change :-()
But such an overhaul requires time especially for testing.

You need to cancel the task when the handler is disposed.

OK, will check this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, major parts of the binding's implementation need a complete overhaul.

I agree on that one. Also the channel creation stuff that is "fixed" in another PR is kinda broken by design and fixing that properly would require huge changes. So now we seem to have 2 changes/fixes that would ideally be handled with a complete rewrite of most of the binding....

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After looking into the code again, I saw that the Future cancels itself when the connection has been established successfully. So, initialize() will block as long as the remote side is unavailable. The current implementation calls sendMessage() only once (which is also bad in initialize()).

So, if I see correctly your code will block initialize() forever when the remote side is not available.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fwolter A sendMessage during the Bridge initialization it is not blocking the handler initialization, as you can see in HomematicBridgeHandler#initialize:

    public void initialize() {
        synchronized (initDisposeLock) {
            isDisposed = false;
            initializeFuture = scheduler.submit(this::initializeInternal);
        }
    }

You can believe me, I have tested it (several times).

The binding works correctly. If the gateway is not accessible when the binding is started, all things are set to state "ERROR:BRIDGE". After the gateway comes up this is correctly detected and the thing's state changes to ONLINE.

I know, this solution is at least in parts "ugly as hell". But I have invested really hours of testing and trying to find a better solution without having to rewrite larger part of the binding. Now am I am really fed up with these connection problems and I for the next couple of weeks I don't have the time to work on it.

If you can't accept the PR in its current state, it is OK to me. But in this case, it will remain in this state for at least a month. Or someone else continues to work on this problems.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't notice that initialize() wasn't the initialize() from the ThingHandler, but from a custom class. Would be awesome if this code could be re-written sometime!

} catch (CancellationException e1) {
logger.debug("Callback for interface {} successfully registered", hmInterface.getName());
} catch (InterruptedException | ExecutionException e1) {
throw new IOException("Callback reg. thread interrupted", e1);
} catch (TimeoutException e1) {
logger.error("Callback registration for interface {} timed out", hmInterface.getName());
throw new IOException("Unable to reconnect in time");
}
future = null;
}
}

/**
* Disposes the client.
*/
public void dispose() {
if (future != null)
future.cancel(true);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,6 @@ public XmlRpcClient(HomematicConfig config, HttpClient httpClient) throws IOExce
this.httpClient = httpClient;
}

@Override
public void dispose() {
}

@Override
public RpcRequest<String> createRpcRequest(String methodName) {
return new XmlRpcRequest(methodName);
Expand Down Expand Up @@ -87,10 +83,13 @@ protected synchronized Object[] sendMessage(int port, RpcRequest<String> request
throw new IOException(ex);
} catch (IOException ex) {
reason = ex;
if ("init".equals(request.getMethodName())) { // no retries for "init" request
// no retries for "init" request or if connection is refused
if ("init".equals(request.getMethodName())
|| ex.getCause() != null && ex.getCause() instanceof ExecutionException) {
break;
}
logger.debug("XmlRpcMessage failed, sending message again {}/{}", rpcRetryCounter, MAX_RPC_RETRY);
logger.debug("XmlRpcMessage failed({}), sending message again {}/{}", ex.getMessage(), rpcRetryCounter,
MAX_RPC_RETRY);
}
}
throw reason;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@
<advanced>true</advanced>
<default>9292</default>
</parameter>
<parameter name="callbackRegTimeout" type="integer" min="30">
<label>Callback Reg. Timeout</label>
<description>Maximum time in seconds for callback registration in the Homematic gateway.</description>
<advanced>true</advanced>
<default>120</default>
</parameter>
<parameter name="installModeDuration" type="integer" min="10" max="300" unit="s">
<label>Install Mode Duration</label>
<description>Time in seconds that the controller will be in install mode when a device discovery is initiated</description>
Expand Down