In [10]:
import polars as pl
import numpy as np

In [3]:
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLMResults
from statsmodels.genmod.families import family

In [71]:
df = pl.read_csv('../rand-data-1.csv')

In [65]:
#df = df.with_columns(pl.all().name.prefix('data_'))
#df = df.with_columns(glm_intercept=pl.lit(1))

In [102]:
glm_model = sm.GLM(df['Y'].to_pandas(), df[['Z']].to_pandas(), family=family.Gaussian())

In [98]:
glm_model = sm.GLM(df['Y'].to_pandas(), df[['Z', 'X']].to_pandas(), family=family.Binomial())


In [91]:
# Assuming you want to set the beta for 'x' to 0.5
R = np.array([[1, 0], [0, 1]])  # This selects the 'x' coefficient
r = np.array([2, 0.5])     # This is the value you want to set for beta

In [94]:
# Assuming you want to set the beta for 'x' to 0.5
R = np.array([[1]])  # This selects the 'x' coefficient
r = np.array([0.5])     # This is the value you want to set for beta

In [103]:
beta0 = np.array([2])

In [113]:
glm_results = GLMResults(glm_model, beta0, normalized_cov_params=None, scale=None)

In [114]:
glm_results.params

array([2])

In [115]:
glm_results.summary()

AttributeError: 'GLMResults' object has no attribute 'method'

In [116]:
derivative_inverse_link = glm_results.family.link.inverse_deriv
derivative_inverse_link

<bound method Power.inverse_deriv of <statsmodels.genmod.families.links.Identity object at 0x28d614a00>>

In [117]:
eta = glm_results.predict(which='linear')  # Get the linear predictor
dmu_deta = derivative_inverse_link(eta)
dmu_deta

array([1., 1., 1., ..., 1., 1., 1.])

In [118]:
len(dmu_deta)

100000

In [25]:
import numpy as np

In [26]:
eta = np.array([0.1, 0.3, 0.5, 0.7, 0.9]) # X * beta

In [27]:
mu = sm.families.links.Logit().deriv(eta)

array([11.11111111,  4.76190476,  4.        ,  4.76190476, 11.11111111])

In [1]:
import polars as pl
import numpy as np
import pickle
from typing import List
from itertools import chain, combinations

In [2]:
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLMResults
from statsmodels.genmod.families import family

In [3]:
class TestingRound:
    iterations = 0
    last_deviance = None
    deviance = 0
    convergence_threshold = 1e-8
    
    def __init__(self, y_label, X_labels):
        self.y_label = y_label
        self.X_labels = X_labels
        self._init_beta0()
        
    def __eq__(self, t):
        assert type(t) == TestingRound
        return self.y_label == t.y_label and self.X_labels == t.X_labels   
    
    def __repr__(self):
        return f'TestingRound - y: {self.y_label}, X: {self.X_labels}, beta: {self.beta}, current iteration: {self.iterations}, current deviance: {abs(self.deviance)}, relative deviance change: {abs(self.deviance - self.last_deviance) / (0.1 + abs(self.deviance)) if self.last_deviance is not None else "?"}' 
    
    def _init_beta0(self):
        #self.beta = np.random.randn(len(self.X_labels) + 1) # +1 for intercept
        self.beta = np.zeros(len(self.X_labels) + 1) # +1 for intercept
    
    def aggregate_results(self, results):
        results1, results2, deviances = zip(*results)
        #print('---')
        #print(np.linalg.inv(sum(results1)))
        #for r in results2:
        #    print(r)
        #print(sum(results2))
        self.beta = np.linalg.inv(sum(results1)) @ sum(results2)
        self.last_deviance = self.deviance
        self.deviance = sum(deviances)
        self.iterations += 1
        
        return abs(self.deviance - self.last_deviance) / (0.1 + abs(self.deviance)) < self.convergence_threshold
    
class FederatedGLMResult:
    def __init__(self, testing_round: TestingRound):
        self.y_label = testing_round.y_label
        self.X_labels = testing_round.X_labels
        self.beta = testing_round.beta
        self.deviance = testing_round.deviance
        
    def __eq__(self, x):
        assert type(x) == FederatedGLMResult
        return self.y_label == x.y_label and self.X_labels == x.X_labels and self.beta == x.beta and self.deviance == x.deviance
    
class TestingEngine:
    testing_rounds = []
    finished_rounds = []
    
    def __init__(self, available_data, max_regressors=None, max_iterations=25, save_steps=10):
        self.available_data = available_data
        self.max_regressors = max_regressors
        self.max_iterations = max_iterations
        self.save_steps = save_steps
        
        _max_regressors = min(len(self.available_data), self.max_regressors+1) if self.max_regressors is not None else len(self.available_data)
        
        for e in available_data:
            set_of_regressors = available_data - {e}
            powerset_of_regressors = chain.from_iterable(combinations(set_of_regressors, r) for r in range(1,_max_regressors))
            self.testing_rounds.extend([TestingRound(y_label=e, X_labels=list(r)) for r in powerset_of_regressors])
            
        self.testing_rounds = sorted(self.testing_rounds, key=lambda key: len(key.X_labels))
        self.is_finished = len(self.testing_rounds) == 0
            
    def get_current_test_parameters(self):
        curr_testing_round = self.testing_rounds[0]
        return curr_testing_round.y_label, curr_testing_round.X_labels, curr_testing_round.beta
    
    def finish_current_test(self):
        self.finished_rounds.append(self.testing_rounds.pop(0))
        self.is_finished = len(self.testing_rounds) == 0
        
    def aggregate_results(self, results):
        has_converged = self.testing_rounds[0].aggregate_results(results)
        has_reached_max_iterations = self.testing_rounds[0].iterations >= self.max_iterations
        print(self.testing_rounds[0])
        if has_converged or has_reached_max_iterations:
            self.finish_current_test()
        

class Server:
    clients = {}
    
    def __init__(self, clients):
        self.clients = clients
        self.available_data = set.union(*[set(c.data_labels) for c in self.clients.values()])
        self.testing_engine = TestingEngine(self.available_data, max_regressors=1)
        
    def run_tests(self):
        counter = 1
        while not self.testing_engine.is_finished:
            y_label, X_labels, beta = self.testing_engine.get_current_test_parameters()
            selected_clients = {id_: c for id_, c in self.clients.items() if set([y_label] + X_labels).issubset(c.data_labels)}
            # http response, to compute glm results for y regressed on X with beta
            results = [c.compute(y_label, X_labels, beta) for c in selected_clients.values()]
            self.testing_engine.aggregate_results(results)
            if counter % self.testing_engine.save_steps == 0:
                counter = 0
                with open('./testengine.ckp', 'wb') as f:
                    pickle.dump(self.testing_engine, f)
            counter += 1
                
    
class Client:
    def __init__(self, data):
        self.data = data
        self.data_labels = data.columns
        
        
    def compute(self, y_label: str, X_labels: List[str], beta):
        y = self.data[y_label]
        X = self.data[X_labels]
        
        X = X.to_numpy()
        X = sm.tools.add_constant(X)
                
        eta, mu, dmu_deta, deviance = self._init_compute(y,X,beta)
        
        #print('eta', eta)
        #print('mu', mu)
        #print('dmu/deta', dmu_deta)
        
        z = eta + (y - mu)/dmu_deta
        W = np.diag((dmu_deta**2)/max(np.var(mu), 1e-8))
        
        #print(z)
        
        r1 = X.T @ W @ X
        r2 = X.T @ W @ z
        
        
        # TODO: r1 und r2 immer gleich, egal welches beta
        
        #print('r1', r1)
        #print('r2', r2)
        
        return r1, r2, deviance
        
        
    def _init_compute(self, y, X, beta):
        glm_model = sm.GLM(y, X, family=family.Gaussian())
        normalized_cov_params = np.linalg.inv(X.T.dot(X))
        scale = glm_model.fit().scale
        glm_results = GLMResults(glm_model, beta, normalized_cov_params=normalized_cov_params, scale=None)
        
        # GLMResult with correct scale
        scale = glm_model.estimate_scale(glm_results.predict(which='linear'))
        glm_results = GLMResults(glm_model, beta, normalized_cov_params=normalized_cov_params, scale=scale)
        
        eta = glm_results.predict(which='linear')
        
        # g' is inverse of link function
        inverse_link = glm_results.family.link.inverse
        mu = inverse_link(eta)
        
        deviance = glm_results.deviance
        
        # delta g' is derivative of inverse link function
        derivative_inverse_link = glm_results.family.link.inverse_deriv
        dmu_deta = derivative_inverse_link(eta)
        return eta, mu, dmu_deta, deviance
    

In [4]:
df = pl.read_csv('../rand-data-1.csv')
df = df[['Y', 'X']]
c1 = Client(df.to_pandas())

In [5]:
df = pl.read_csv('../rand-data-2.csv')
df = df[['Y', 'X']]
c2 = Client(df.to_pandas())

In [6]:
df = pl.read_csv('../random-data-3.csv')
df = df[['Y', 'X']]
c3 = Client(df.to_pandas())

In [6]:
server = Server({1: c1, 2: c2})#, 3: c3})

In [7]:
server.run_tests()

TestingRound - y: X, X: ['Y'], beta: [ 0.00190917 -0.12199378], current iteration: 1, current deviance: 200243.8786699699, relative deviance change: 0.9999995006092035
TestingRound - y: X, X: ['Y'], beta: [ 0.00191739 -0.12237386], current iteration: 2, current deviance: 197274.86750913225, relative deviance change: 0.015050116080745113
TestingRound - y: X, X: ['Y'], beta: [ 0.00191739 -0.12237386], current iteration: 3, current deviance: 197274.8963357231, relative deviance change: 1.4612389504854098e-07
TestingRound - y: X, X: ['Y'], beta: [ 0.00191739 -0.12237386], current iteration: 4, current deviance: 197274.89633572308, relative deviance change: 1.4752924089378632e-16
TestingRound - y: Y, X: ['X'], beta: [-0.00129733 -0.12150326], current iteration: 1, current deviance: 199438.33130657522, relative deviance change: 0.9999994985921251
TestingRound - y: Y, X: ['X'], beta: [-0.00129665 -0.12142273], current iteration: 2, current deviance: 196481.6534200435, relative deviance change

In [None]:
---
[-9.99927237e+04 -5.24604438e+01]
[-100317.73809537  -24698.07765593]
[-100378.55452192  -12122.88313476]
[-300689.01633569  -36873.42123449]
TestingRound - y: Y, X: ['X'], beta: [-1.00215417 -0.12168587], current iteration: 1, current deviance: 299458.9091783228, relative deviance change: 0.9999996660644799
---
[-200197.14371281  -12136.98947161]
[-200595.22463893  -37401.01196537]
[-200585.66431964  -24208.84103201]
[-601378.03267137  -73746.84246899]
TestingRound - y: Y, X: ['X'], beta: [-2.00430834 -0.24337174], current iteration: 2, current deviance: 595013.2161639454, relative deviance change: 0.4967188110865536
---
[-300401.56370722  -24221.51849942]
[-300872.71118249  -50103.9462748 ]
[-300792.77411735  -36294.79892926]
[-902067.04900706 -110620.26370348]
TestingRound - y: Y, X: ['X'], beta: [-3.00646251 -0.3650576 ], current iteration: 3, current deviance: 1502214.9745210703, relative deviance change: 0.6039093693999542

In [12]:
-5.24604438e+01 - -12136.98947161, -12136.98947161 - -24221.51849942

(12084.52902781, 12084.529027809998)

In [13]:
-24698.07765593 - -37401.01196537, -37401.01196537 - -50103.9462748

(12702.934309440003, 12702.934309429998)

In [14]:
-12122.88313476 - -24208.84103201, -24208.84103201 - -36294.79892926

(12085.95789725, 12085.957897250002)

In [1]:
TestingRound - y: Y, X: ['X'], beta: [-1.00129733 -0.12150326], current iteration: 1, current deviance: 199438.33130657522, relative deviance change: 0.9999994985921251
TestingRound - y: Y, X: ['X'], beta: [-2.00259466 -0.24300653], current iteration: 2, current deviance: 396481.6534200435, relative deviance change: 0.49697954676041617
TestingRound - y: Y, X: ['X'], beta: [-3.00389199 -0.36450979], current iteration: 3, current deviance: 1000680.1785616275, relative deviance change: 0.6037877812582214
TestingRound - y: Y, X: ['X'], beta: [-4.00518932 -0.48601305], current iteration: 4, current deviance: 2012033.9067313261, relative deviance change: 0.5026524028849321
TestingRound - y: Y, X: ['X'], beta: [-5.00648664 -0.60751631], current iteration: 5, current deviance: 3430542.837929135, relative deviance change: 0.4134940028047278

-5.00648665

In [None]:
# value of r2 doubles! WHY?!

[[5.00001208e-06 7.78342262e-09]
 [7.78342262e-09 5.01409338e-06]]
[-199580.29097945  -24020.36726541]
TestingRound - y: X, X: ['Y'], beta: [-0.99809083 -0.12199378], current iteration: 1, current deviance: 200243.8786699699, relative deviance change: 0.9999995006092035
[[5.00001208e-06 7.78342262e-09]
 [7.78342262e-09 5.01409338e-06]]
[-399160.58195889  -48040.73453083]
TestingRound - y: X, X: ['Y'], beta: [-1.99618165 -0.24398756], current iteration: 2, current deviance: 397274.8675091318, relative deviance change: 0.4959562140915232

In [None]:
0       -2.545943
1       -0.361617
2        0.036539
3       -1.585862
4       -2.348629
           ...   
99995    0.094140
99996    0.280835
99997    0.284479
99998   -1.091972
99999   -1.101109
Name: X, Length: 100000, dtype: float64

0        0.928202
1       -1.022850
2       -1.459742
3       -0.898023
4       -1.055092
           ...   
99995    0.066543
99996   -1.232357
99997   -3.123878
99998    0.378372
99999   -1.262961
Name: X, Length: 100000, dtype: float64

In [None]:
0       -3.566477
1       -1.225785
2       -0.811946
3       -2.580938
4       -3.395498
           ...   
99995   -1.013917
99996   -0.561535
99997   -0.891457
99998   -2.152535
99999   -2.143902
Name: X, Length: 100000, dtype: float64

0        0.138013
1       -2.227509
2       -2.629769
3       -1.982297
4       -2.199608
           ...   
99995   -0.746711
99996   -2.236666
99997   -4.337437
99998   -0.635011
99999   -2.552829
Name: X, Length: 100000, dtype: float64

In [None]:
# r2 c1 and c2 vals plus sum

---
[-9.99927237e+04 -5.24604438e+01]
[-100317.73809537  -24698.07765593]
[-200310.46181376  -24750.53809973]
TestingRound - y: Y, X: ['X'], beta: [-1.00129733 -0.12150326], current iteration: 1, current deviance: 199438.33130657522, relative deviance change: 0.9999994985921251
---
[-200111.47617361  -12118.79659865]
[-200509.44745392  -37382.27960081]
[-400620.92362753  -49501.07619946]
TestingRound - y: Y, X: ['X'], beta: [-2.00259466 -0.24300653], current iteration: 2, current deviance: 396481.6534200439, relative deviance change: 0.4969795467604166
---
[-300230.22862883  -24185.13275351]
[-300701.15681246  -50066.48154569]
[-600931.38544129  -74251.6142992 ]
TestingRound - y: Y, X: ['X'], beta: [-3.00389199 -0.36450979], current iteration: 3, current deviance: 1000680.1785616286, relative deviance change: 0.6037877812582214
---
[-400348.98108405  -36251.46890837]
[-400892.866171    -62750.68349056]
[-801241.84725505  -99002.15239893]
TestingRound - y: Y, X: ['X'], beta: [-4.00518932 -0.48601305], current iteration: 4, current deviance: 2012033.9067313308, relative deviance change: 0.5026524028849326

In [1]:
-24750.53809973 - -12118.79659865

-12631.741501080001

In [2]:
-24750.53809973 - -37382.27960081

12631.741501079996

In [None]:
# Differences from C1 and C2 in beta directly cancel out in sum, so they sum up to t*beta1

In [None]:
import polars as pl
import numpy as np

In [None]:
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLMResults
from statsmodels.genmod.families import family

In [None]:
#Differences from c1 and c2 EXACTLY cancel out!

In [2]:
# its just a multiple of the first beta
-1.00129733 + -2.00259466

-3.0038919900000005

In [3]:
3 * -1.00129733

-3.0038919900000005

In [3]:
df = pl.read_csv('../rand-data-1.csv').to_pandas()
X1 = df[['X']].to_numpy()
X1 = sm.tools.add_constant(X1)
y1 = df['Y'].to_numpy()

In [13]:
df1 = pl.read_csv('../rand-data-1.csv')
df2 = pl.read_csv('../rand-data-2.csv')
df = pl.concat([df1, df2], how='diagonal')
X = df[['Y']].to_numpy()
X = sm.tools.add_constant(X)
y = df['X'].to_numpy()

In [14]:
glm_model = sm.GLM(y, X, family=family.Gaussian())

In [15]:
glm_result = glm_model.fit()

In [16]:
glm_result.params

array([ 0.00190917, -0.12199378])

In [17]:
glm_result.deviance

197274.8675091323

In [4]:
beta = np.random.randn(3) # +1 for intercept

In [20]:
scale = glm_model.fit().scale
normalized_cov_params = np.linalg.inv(X.T.dot(X))

In [None]:
scale

0.8633559370405476

In [21]:
glm_model = sm.GLM(y, X, family=family.Gaussian())

In [27]:
scale = None

In [31]:
glm_results = GLMResults(glm_model, beta, normalized_cov_params=normalized_cov_params, scale=scale)

In [30]:
eta = glm_results.predict(which='linear')

scale = glm_model.estimate_scale(eta)

In [25]:
scale

7.329039023691067

In [32]:
glm_results.conf_int()

array([[ 0.13428462,  0.16784324],
       [-2.49771194, -2.46415748],
       [ 0.16704232,  0.20063577]])

In [None]:
a