In [42]:
import numpy as np
import scipy.io as sio
from scipy.linalg import toeplitz
from scipy.stats import multivariate_normal
import time
import random
from scipy.linalg import block_diag
from sklearn.metrics import precision_recall_curve, auc
import scipy.stats
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import random
import scipy.sparse as sp
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pickle
from collections import defaultdict

import sys
import random
import itertools
import os
import matlab.engine
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, ElasticNetCV
from scipy.sparse import csgraph


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [57]:
class Solver:    
    def __init__(self, models, options=None,                  
                 datafile=f'./code_fgfl_aaai14/data_gfl/',
                 resultfile='./code_fgfl_aaai14/result_gfl/',):   
        self.res = defaultdict(list)
        self.models = models
        self.n = None
        self.d = None
        self.k = None
        self.options = options if options else {'maxIter': 500, 'verbose': 0, 'SPGiters': 100}
        self.datafile = os.path.join(os.path.abspath(datafile), f'real_data')
        self.resultfile = os.path.join(os.path.abspath(resultfile), f'real_data') # matlab does not like relative path
        self.best_rho1 = 0.5
        self.best_rho2 = 0.5
        self.best_mu = 1.0
        self.best_rho = None
        self.datafile_pqn = os.path.join(os.path.abspath('./PQN/data/'), f'real_data')
        self.resultfile_pqn = os.path.join(os.path.abspath('./PQN/result/'), f'real_data')
        self._init(self.datafile, self.resultfile)
        self._init(self.datafile_pqn, self.resultfile_pqn)


    def _init(self, datafile, resultfile):
        if not os.path.exists(datafile):
            os.makedirs(datafile)
        if not os.path.exists(resultfile):
            os.makedirs(resultfile)

    def _solver_lasso_sklearn(self, X, y):
        """
        Use sklearn's Lasso implementation to solve the Lasso problem.
        """
        alpha = 0.1
        lasso_model = Lasso(alpha=alpha, max_iter=10000)  # Lasso model with high max_iter
        lasso_model.fit(X, y)  # Fit the model
        u = lasso_model.coef_  # Get the coefficients
        # print(f"Lasso coefficients: {u}")
        return u 

    def _solver_proximal(self, X, y, A, i):
        datafile_name = os.path.join(self.datafile, f'data_{i}.mat')
        resultfile_name = os.path.join(self.resultfile, f'result_{i}.mat')
        self._save_mat(X, y, A, datafile_name)
        self._call_proximal(datafile_name, resultfile_name, self.best_rho1, self.best_rho2)
        u, funcVal = self._read_result(resultfile_name)
        return u.flatten() # the original return a vector with shape (d,1), will not work with recovery_accuracy

    def _solver_gfl(self, X, y, L, i, rho=None, mu=0.01, k=None):
        datafile_pqn = os.path.join(self.datafile_pqn, f'data_{i}.mat')
        resultfile_pqn = os.path.join(self.resultfile_pqn, f'result_{i}.mat')
        self._save_mat_pqn(X, y, L, datafile_pqn)
        # if rho is None or mu is None: # we don't need to store the data when we do Lasso
            # self._save_mat_pqn(X, y, L, datafile_pqn)
        if rho is None or mu is None:
            if i == 0:
                rho_values = [np.sqrt(self.n), 6.8 * np.sqrt(self.n)]
                mu_values = [0.01, 0.1, 1.0]
                self.best_rho, self.best_mu = self._cross_validation_gfl(X, y, L, rho_values, mu_values, k=k)
                print(f"Best rho: {self.best_rho}, Best mu: {self.best_mu}")
            rho = self.best_rho
            mu = self.best_mu

        self._call_gfl(datafile_pqn, resultfile_pqn, rho, mu, k)
        u, _ = self._read_result(resultfile_pqn)
        return u.flatten()
    
    def _save_mat_pqn(self, X, y, L, filename=None):
        # save the data to .mat file so that the matlab code of proxiaml can use it
        # print("X.shape", X.shape)   
        # print("y.shape", y.shape)
        # print("A.shape", A.shape)
        if y.ndim == 1:
            y = y[:, np.newaxis]
        data = {
            "X": X,
            "y": y,
            "L": L.toarray() if sp.issparse(L) else L,  # we store the adjacency matrix as dense matrix
        }
        sio.savemat(filename, data)

    def _read_result(self, resultfile):
        result = sio.loadmat(resultfile)
        beta, funcVal = result['beta'], result['funcVal']
        return beta, funcVal
        

    def _call_gfl(self, datafile, resultfile, rho, mu, k=None):
        eng = matlab.engine.start_matlab()
        try:
            eng.cd(os.path.abspath('./PQN/'))
            eng.addpath(os.path.abspath('./PQN/'))
            eng.addpath(eng.genpath(os.path.abspath('./PQN/')))
            eng.addpath(eng.genpath(os.path.abspath('./PQN/minConF/')))
            if k:
                eng.gfl_pqn(datafile, resultfile, rho, mu, float(k), nargout=0)
            else:
                eng.gfl_pqn(datafile, resultfile, rho, mu, float(self.k), nargout=0)
        finally:
            eng.quit()
    
    def _save_result(self, u, filename):
        sio.savemat(filename, {'beta': u})

    def _call_proximal(self, datafile, resultfile, rho1, rho2):
        eng = matlab.engine.start_matlab()
        try:
            eng.cd(os.path.abspath('./code_fgfl_aaai14/'))
            eng.addpath(os.path.abspath('./code_fgfl_aaai14/GFL/'))
            eng.addpath(eng.genpath(os.path.abspath('./code_fgfl_aaai14/')))
            eng.gfl_proximal(datafile, resultfile, rho1, rho2, nargout=0)
        finally:
            eng.quit()

    def _solver_aGrace(self, X, y, W, lambda1=1.0, lambda2=1.0, max_iter=1000, tol=1e-4):
        # Standardize X and center y
        n, p = X.shape
        X_mean = X.mean(axis=0)
        X_std = X.std(axis=0)
        X_std[X_std == 0] = 1  # avoid division by zero
        X = (X - X_mean) / X_std
        y_mean = y.mean()
        y = y - y_mean

        # Compute initial estimate beta_tilde
        if p < n:
            lr = LinearRegression(fit_intercept=False)
            lr.fit(X, y)
            beta_tilde = lr.coef_
        else:
            enet = ElasticNetCV(l1_ratio=0.5, fit_intercept=False, cv=5, max_iter=10000)
            enet.fit(X, y)
            beta_tilde = enet.coef_

        # Construct modified Laplacian matrix Lstar
        d = W.sum(axis=1).A1 if hasattr(W, 'A1') else W.sum(axis=1)  # handle sparse matrices
        Lstar = np.zeros((p, p))
        rows, cols = W.nonzero()
        for i in range(len(rows)):
            u, v = rows[i], cols[i]
            if u >= v:
                continue  # process each edge once
            if d[u] == 0 or d[v] == 0:
                Lstar[u, v] = Lstar[v, u] = 0
            else:
                sign_u = np.sign(beta_tilde[u]) if beta_tilde[u] != 0 else 0
                sign_v = np.sign(beta_tilde[v]) if beta_tilde[v] != 0 else 0
                weight = W[u, v] if isinstance(W, np.ndarray) else W.data[i]
                Lstar_uv = -sign_u * sign_v * weight / np.sqrt(d[u] * d[v])
                Lstar[u, v] = Lstar_uv
                Lstar[v, u] = Lstar_uv
        np.fill_diagonal(Lstar, 1 * (d > 0))  # set diagonal to 1 if degree > 0

        # Precompute adjacency list
        adjacency_list = [[] for _ in range(p)]
        for u, v in zip(rows, cols):
            if u != v:
                adjacency_list[u].append(v)

        # Initialize beta and residual
        beta = np.zeros(p)
        residual = y.copy()
        prev_beta = np.inf * np.ones(p)
        iter = 0

        # Coordinate descent
        while iter < max_iter and np.linalg.norm(beta - prev_beta) > tol:
            prev_beta = beta.copy()
            for u in range(p):
                xu = X[:, u]
                current_beta_u = beta[u]

                # Compute xuTr and neighbor_sum
                xuTr = xu @ residual
                xuTr_plus = xuTr + n * current_beta_u  # since xu.T @ xu = n

                neighbor_sum = 0
                for v in adjacency_list[u]:
                    neighbor_sum += Lstar[u, v] * beta[v]

                # Update beta_u
                z = (xuTr_plus - lambda2 * neighbor_sum) / (n + lambda2)
                threshold = lambda1 / (2 * (n + lambda2))
                beta_u_new = np.sign(z) * max(abs(z) - threshold, 0)

                # Update residual and beta
                delta = beta_u_new - current_beta_u
                residual -= xu * delta
                beta[u] = beta_u_new

            iter += 1

        return beta

    def _save_mat(self, X, y, A, filename=None):
        # save the data to .mat file so that the matlab code of proxiaml can use it
        # print("X.shape", X.shape)   
        # print("y.shape", y.shape)
        # print("A.shape", A.shape)
        if y.ndim == 1:
            y = y[:, np.newaxis]
        data = {
            "X": X,
            "y": y,
            "AdjMat": A.toarray() if sp.issparse(A) else A,  # we store the adjacency matrix as dense matrix
        }
        sio.savemat(filename, data)

    def solver(self, model, X, y, clusters=None, L=None, A=None, i=None, rho=1, mu=1):
        if model == "Proximal":
            return self._solver_proximal(X, y, A, i)
        elif model == "Lasso":
            rho = np.sqrt(self.n) 
            return self._solver_gfl(X, y, L, i, rho, mu=0.0)
        elif model == "GFL_Matlab":
            return self._solver_gfl(X, y, L, i, rho=np.sqrt(self.n), mu=0.01)
        elif model == "Lasso_Sklearn":
            return self._solver_lasso_sklearn(X, y)
        elif model == "Adaptive_Grace":
            if not isinstance(A, np.ndarray):
                Aa = A.toarray()
                return self._solver_aGrace(X, y, Aa)
            else:
                return self._solver_aGrace(X, y, A)
        else:
            raise ValueError("Model not supported")
    # ------------------------------------------------------------
    #  Evaluate each model on an 80 / 20 train‑test split
    # ------------------------------------------------------------
    def evaluate_out_of_sample(self, X, y, L=None, A=None, k=None,
                               test_size=0.20, random_state=0):
        """
        Fits each model on 80 % of the data and returns the test MSE
        computed on the held‑out 20 %.

        Parameters
        ----------
        X, y     : full design matrix and response
        L, A, k  : graph‑based inputs required by some solvers
        test_size, random_state : passed straight to train_test_split

        Returns
        -------
        mse_dict : {model_name: test_MSE}
        """
        # 1 train‑test split (stratification not required for regression)
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        # 2 update sizes for solvers that rely on self.n, self.k
        self.n, self.d = X_tr.shape
        if k is not None:
            self.k = k

        mse_dict = {}
        for i, model in enumerate(self.models):
            print(f"[{model}] fitting on {len(y_tr)} obs …")
            u = self.solver(model, X_tr, y_tr, L=L, A=A, i=i)
            # if the mode is Lasso and GFL_Matlab, we need to recover the beta from u
            if model == "Lasso" or model == "GFL_Matlab":
                selected_features = np.argsort(np.abs(u))[-self.k:]
                X_tr_sub = X_tr[:, selected_features]
                linear_model = LinearRegression()
                linear_model.fit(X_tr_sub, y_tr)
                beta = np.zeros(self.d)
                beta[selected_features] = linear_model.coef_
            else:
                beta = u

            # 3️⃣ predict & MSE
            y_hat = X_te @ beta
            mse = mean_squared_error(y_te, y_hat)
            mse_dict[model] = mse
            print(f"[{model}] out‑of‑sample MSE = {mse:.4f}")

        return mse_dict, u


    def inference(self, X, y, L, A, k):
        self.n, self.d = X.shape
        self.k = k
        for i, model in enumerate(self.models):
            print(f"Running model {model}")
            u = self.solver(model, X, y, L=L, A=A, i=i)
            self.res[model].append(u)
        return self.res

translate the data into the format which we can feed into the model

In [1]:
import pandas as pd
from collections import Counter

In [None]:
# Load the .txt file as a tab-delimited DataFrame
df_response = pd.read_csv("./Processed_Data/Drug_Screen_predicted_percent_viability_CTRPv2.0.txt", sep=",")

In [5]:
# rows of the DataFrame
rows, columns = df_response.shape
print(f"Number of rows: {rows}, Number of columns: {columns}")

Number of rows: 462784, Number of columns: 3


In [8]:
# Load the .txt file as a tab-delimited DataFrame
df_exp = pd.read_csv("./Processed_Data/CCL_expression_meanleq2_varleq1_consistent_with_ppi_paircode_001.txt", sep=",")
rows, columns = df_exp.shape
print(f"Number of rows: {rows}, Number of columns: {columns}")

Number of rows: 1673, Number of columns: 2830


In [9]:
# drop the missing values and duplicate the rows
df_response = df_response.dropna().drop_duplicates()
df_exp = df_exp.dropna().drop_duplicates()
print(f"Number of rows: {df_response.shape[0]}, Number of columns: {df_response.shape[1]}")
print(f"Number of rows: {df_exp.shape[0]}, Number of columns: {df_exp.shape[1]}")

Number of rows: 372945, Number of columns: 3
Number of rows: 1673, Number of columns: 2830


In [11]:
df_response.head()

Unnamed: 0,ModelID,master_cpd_id,pred_pv_high_conc
0,ACH-000464,1788,0.0192
1,ACH-000464,3588,0.4412
2,ACH-000464,12877,1.1647
3,ACH-000464,19153,0.8505
4,ACH-000464,23256,0.0253


In [12]:
df_exp.head()

Unnamed: 0,ModelID,SNAPC1,CDCP1,MEST,TOM1L1,NAPRT,DDX58,DSG2,XBP1,TMEM106B,...,PYCARD,SREBF1,DIP2C,MRAS,CENPU,EOGT,SH3BP4,GABRE,NUF2,SLC27A3
0,ACH-000873,3.98075,4.104507,4.999966,3.685291,3.755174,0.630862,6.008137,5.61419,4.064907,...,4.976635,6.535609,1.58677,3.968386,3.447982,2.135584,3.394889,4.512461,3.82126,5.095368
1,ACH-000860,4.707645,5.214055,5.912647,4.750175,6.875647,2.66556,5.824374,5.607051,3.708557,...,-0.123067,7.765723,1.665909,0.479346,6.325923,2.77408,4.829941,2.722943,5.980082,3.263898
2,ACH-000439,2.361229,-0.175734,0.662084,-0.138789,5.535352,1.285239,-0.187353,7.967794,2.055993,...,3.871292,4.265387,0.004867,0.008519,3.905927,2.953637,0.041747,-0.020901,2.496649,3.389783
3,ACH-000318,7.128238,3.943137,5.754716,4.42011,6.049343,2.127028,6.667163,5.58421,4.031139,...,1.871205,7.638038,0.97283,1.331984,4.704557,1.344838,4.544907,3.76745,4.702673,3.100831
4,ACH-001142,5.057905,5.475212,5.729837,4.378693,3.511627,6.72581,5.281987,6.576746,4.386153,...,3.41983,7.058021,3.376348,3.120036,3.881749,4.49465,5.930621,3.49413,4.275999,2.698712


In [13]:
# show the number of samples for each compound
print(Counter(df_response['master_cpd_id'].tolist()))

Counter({660086: 843, 62602: 839, 411770: 832, 36599: 830, 609060: 830, 609639: 830, 37190: 829, 362338: 829, 606034: 829, 660238: 829, 52123: 827, 411833: 827, 28172: 826, 660288: 826, 44580: 825, 347813: 825, 609091: 825, 632104: 825, 660410: 825, 61097: 824, 290356: 824, 639759: 824, 660134: 824, 44511: 823, 375395: 823, 385240: 823, 660078: 823, 660206: 823, 660433: 823, 660777: 823, 44554: 821, 56703: 821, 632873: 821, 639390: 821, 347775: 820, 418038: 820, 418209: 820, 606142: 820, 660217: 820, 660245: 820, 30620: 819, 50134: 819, 375264: 819, 411717: 819, 606033: 819, 635576: 819, 640007: 819, 660081: 819, 660341: 818, 28784: 817, 55307: 817, 154846: 817, 411815: 817, 616353: 817, 616354: 817, 660821: 817, 404566: 816, 660207: 816, 52882: 815, 61674: 815, 375564: 815, 411739: 815, 50163: 814, 54210: 814, 63578: 814, 411867: 814, 606035: 814, 632103: 814, 182395: 813, 411733: 813, 411869: 813, 660252: 813, 26979: 812, 411720: 812, 411843: 812, 600054: 812, 660201: 812, 660321: 81

In [14]:
# now we need to segment the dataframe df_response according to the master_cpd_id which have more than 800 samples, and we need a dictionary to store all such master_cpd_id
cpd_ids = df_response['master_cpd_id'].unique()
print(f"Number of unique master_cpd_id: {len(cpd_ids)}")

Number of unique master_cpd_id: 545


In [15]:
# cpd_ids of interest, i.e. has more than 800 samples
cpd_ids_of_interest = [cpd_id for cpd_id in cpd_ids if len(df_response[df_response['master_cpd_id'] == cpd_id]) >= 800]
print(f"Number of unique master_cpd_id with more than 800 samples: {len(cpd_ids_of_interest)}")


Number of unique master_cpd_id with more than 800 samples: 167


In [18]:
# take sample cpd_ids_of_interest[0]
cpd_id = cpd_ids_of_interest[0]
df_cpd_sample = df_response[df_response['master_cpd_id'] == cpd_id]
print(f"Number of samples: {df_cpd_sample.shape[0]}, Number of columns: {df_cpd_sample.shape[1]}")
df_sample.head()

Number of samples: 811, Number of columns: 3


Unnamed: 0,ModelID,master_cpd_id,pred_pv_high_conc
2,ACH-000464,12877,1.1647
425,ACH-000028,12877,1.0641
870,ACH-000351,12877,1.0802
1320,ACH-000026,12877,0.6944
1697,ACH-000802,12877,1.0817


In [19]:
def get_data(cpd_id):
    # get the data for the given cpd_id
    df_cpd_sample = df_response[df_response['master_cpd_id'] == cpd_id]
    df_cpd_sample = df_cpd_sample.drop(columns=['master_cpd_id'])
    merge_df = pd.merge(df_cpd_sample, df_exp, on='ModelID', how='inner')
    return merge_df

In [25]:
merged_df = get_data(cpd_id)
merged_df = merged_df.drop(columns=['ModelID'])
print(f"Number of samples: {merged_df.shape[0]}, Number of columns: {merged_df.shape[1]}")
merged_df.head()

Number of samples: 795, Number of columns: 2830


Unnamed: 0,pred_pv_high_conc,SNAPC1,CDCP1,MEST,TOM1L1,NAPRT,DDX58,DSG2,XBP1,TMEM106B,...,PYCARD,SREBF1,DIP2C,MRAS,CENPU,EOGT,SH3BP4,GABRE,NUF2,SLC27A3
0,1.1647,3.694941,0.020123,4.240311,2.978083,1.910523,2.267594,3.025555,5.662627,5.624064,...,2.115358,5.946474,3.051355,3.780793,3.663726,3.843312,5.198597,3.778196,4.760291,2.852171
1,1.0641,2.393661,-0.120848,5.259663,6.025885,4.246347,2.185479,4.354055,8.65616,4.316104,...,5.921365,7.792774,3.027201,1.315479,4.783864,2.404173,4.66167,-0.086625,4.248483,5.391512
2,1.0802,3.812389,5.793513,7.085047,4.942423,4.240056,2.921996,4.493579,5.333525,4.71613,...,6.306779,7.25969,2.22045,3.895602,4.901146,2.362487,5.686152,0.029463,5.114265,3.714696
3,0.6944,3.841664,4.671314,3.539474,5.994242,3.893794,3.705787,4.255203,6.108499,5.050608,...,5.932452,6.598065,1.521983,2.052854,4.965362,3.182297,3.201513,-0.020901,5.447624,3.746298
4,1.0817,4.395912,5.119089,6.661166,4.934522,6.284833,4.800164,6.535992,6.490499,4.199905,...,3.217075,6.472507,2.128135,0.375536,4.317082,3.291064,4.848344,4.114696,4.24531,3.197742


In [None]:
import networkx as nx
from scipy import sparse          # gives an efficient sparse Laplacian
                                   # but you can `.toarray()` if you need a dense copy

def build_feature_laplacian(
        df: pd.DataFrame,
        edge_file: str,
        y_col: str = "response",   # name of the column that holds y
        sep: str = r"\s+"          # tab/space‑separated edge file; change if it’s comma‑separated
    ):
    """
    Parameters
    ----------
    df         : DataFrame with   [ y | gene_1 | gene_2 | … | gene_d ]
    edge_file  : path to TXT file whose *each line* is  "geneA geneB"
    y_col      : name of the response column
    sep        : delimiter used in the edge file
    
    Returns
    -------
    L          : scipy.sparse.csr_matrix Laplacian  (shape d×d)
    genes      : list of gene names in the order used for L  (so you can map back)
    """
    # 1  Which genes do you need a graph for?
    genes = [c for c in df.columns if c != y_col]

    # 2  Build the full PPI graph
    edges = pd.read_csv(edge_file, header=None, sep=sep, names=["g1", "g2"])
    G = nx.Graph()
    G.add_edges_from(edges.itertuples(index=False, name=None))   # unweighted; add `data=…` if you have weights

    # 3  Restrict to the genes present in the DataFrame
    #     — isolated genes (no edge in the file) are still allowed; they just get 0‑rows/cols.
    G_sub = nx.Graph()
    G_sub.add_nodes_from(genes)                # ensures every feature appears (even if disconnected)
    G_sub.add_edges_from(G.edges(genes))       # edges where *both* ends are in `genes`

    # 4  Laplacian aligned to feature order
    L = nx.laplacian_matrix(G_sub, nodelist=genes)  # csr_matrix, symmetric, shape len(genes)×len(genes)

    # Alternative: normalized version
    # L = nx.normalized_laplacian_matrix(G_sub, nodelist=genes)

    return L, genes


In [27]:
L, genes = build_feature_laplacian(df = merged_df, 
                        edge_file = "./Processed_Data/PPI_network_edgelist_consistent_with_expression_paircode_001.txt", 
                        y_col="pred_pv_high_conc", sep=r"\s+")

In [28]:
L

<2829x2829 sparse array of type '<class 'numpy.int64'>'
	with 70267 stored elements in Compressed Sparse Row format>

In [29]:
genes

['SNAPC1',
 'CDCP1',
 'MEST',
 'TOM1L1',
 'NAPRT',
 'DDX58',
 'DSG2',
 'XBP1',
 'TMEM106B',
 'ITGA5',
 'PCTP',
 'TCTEX1D2',
 'CNPY4',
 'ZNF675',
 'SEC31B',
 'NT5DC2',
 'DYNLT1',
 'CEP41',
 'FOXRED2',
 'EHHADH',
 'NAAA',
 'LITAF',
 'BTN3A3',
 'TEAD2',
 'CXXC5',
 'MYO1D',
 'PCDH1',
 'S100A1',
 'MRC2',
 'PLXNA1',
 'TGFB1',
 'HLA-E',
 'ANKRD13A',
 'POMGNT2',
 'UQCRH',
 'ETV5',
 'WASL',
 'PORCN',
 'SGCE',
 'COL5A1',
 'PDP1',
 'AGTRAP',
 'SDC3',
 'SLC20A1',
 'CLDN15',
 'L1CAM',
 'NEDD4',
 'THEM4',
 'SLC25A20',
 'CIART',
 'ZNF649',
 'TBC1D1',
 'ERMP1',
 'C3orf14',
 'PRELID2',
 'PRR5',
 'GPSM3',
 'DPYD',
 'LBH',
 'RILP',
 'SLCO4A1',
 'KIF7',
 'TWF1',
 'CTSC',
 'PLEC',
 'F2R',
 'PXN',
 'S100A2',
 'IDUA',
 'DLC1',
 'WDR54',
 'CCDC88C',
 'IQCJ-SCHIP1',
 'STARD13',
 'IER3',
 'ARRB1',
 'MEX3A',
 'NRGN',
 'RAB23',
 'SERINC5',
 'IGFBP5',
 'EVA1A',
 'IL6ST',
 'TGFB2',
 'FAM20C',
 'ECE1',
 'FAM171A1',
 'GALNT14',
 'SHISA4',
 'EAF2',
 'TRIM21',
 'TMEM38A',
 'APOL2',
 'OGFRL1',
 'ITGB4',
 'PFKM',
 'CAPN2

In [41]:
def laplacian_to_adjacency(L):
    """
    Convert a Laplacian matrix to an adjacency matrix.
    """
    # The adjacency matrix is given by A = D - L, where D is the degree matrix
    D = np.diag(np.sum(L, axis=1))
    A = D - L
    return A

def get_solution(cpd_id, models, k=10):
    # get the data for the given cpd_id
    merged_df = get_data(cpd_id)
    merged_df = merged_df.drop(columns=['ModelID'])
    X = merged_df.drop(columns=['pred_pv_high_conc']).values
    y = merged_df['pred_pv_high_conc'].values
    A = laplacian_to_adjacency(L)
    n, d = X.shape

    solver = Solver(models=models)
    u = solver.inference(X, y, L, A, k)
    return u


# models = ["Proximal", "Lasso", "GFL_Matlab", "Lasso_Sklearn"]
models = ["Lasso_Sklearn"]
res = get_solution(cpd_id, models, k=50)
print(max(res['Lasso_Sklearn'][0]))

Running model Lasso_Sklearn
0.007486699230804884


In [53]:
def get_omse(cpd_id, models, k=10):
    # get the data for the given cpd_id
    merged_df = get_data(cpd_id)
    merged_df = merged_df.drop(columns=['ModelID'])
    X = merged_df.drop(columns=['pred_pv_high_conc']).values
    y = merged_df['pred_pv_high_conc'].values
    A = laplacian_to_adjacency(L)
    n, d = X.shape

    solver = Solver(models=models)
    mse_dict, u = solver.evaluate_out_of_sample(X, y, L.astype(np.float64), A.astype(np.float64), k=k)
    return mse_dict, u

mse_dict = get_omse(cpd_id, models, k=50)

[Lasso_Sklearn] fitting on 636 obs …
[Lasso_Sklearn] out‑of‑sample MSE = 0.1935
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.0848
[Proximal] fitting on 636 obs …
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
132 134
110 112
108 110
102 104
100 102
97 99
104 106
111 113
102 104
109 111
113 115
113 115
122 124
123 125
127 129
125 127
131 133
128 130
142 144
140 142
149 151
150 152
147 149
148 150
144 146
154 156
162 164
160 162
150 152
153 155
142 144
156 158
157 159
153 155
169 171
166 168
161 163
157 159
154 156
163 165
173 175
172 174
166 168
167 169
162 164
171 173
168 170
171 173
169 171
169 171
167 169
171 173
167 169
166 168
183 185
173 175
172 174
184 186
174 176
179 181
169 171
183 185
177 179
177 179
175 177
170 172
164 166
158 160
174 176
179 181
173 175
174 176
177 179
178 180
175 177
188 190
184 186


In [54]:
mse_dict

({'Lasso_Sklearn': 0.19350406812281928,
  'GFL_Matlab': 0.08480202543764179,
  'Proximal': 0.052373036274105},
 array([ 5.23305170e-04, -2.32169155e-05,  5.23305170e-04, ...,
         8.56914155e-04,  1.12503030e-04,  9.14212333e-05]))

In [49]:
# we don't know k, so we need to try different k
k_values = np.arange(30, 150, 20)
models = ["Lasso_Sklearn", "GFL_Matlab", "Proximal"]
mse_dict = {}
for k in k_values:
    mse_dict[k] = get_omse(cpd_id, models, k=k)
    print(f"mse_dict[{k}]: {mse_dict[k]}")

[Lasso_Sklearn] fitting on 636 obs …
[Lasso_Sklearn] out‑of‑sample MSE = 0.1935
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.0728
[Proximal] fitting on 636 obs …
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
2812 2814
132 134
110 112
108 110
102 104
100 102
97 99
104 106
111 113
102 104
109 111
113 115
113 115
122 124
123 125
127 129
125 127
131 133
128 130
142 144
140 142
149 151
150 152
147 149
148 150
144 146
154 156
162 164
160 162
150 152
153 155
142 144
156 158
157 159
153 155
169 171
166 168
161 163
157 159
154 156
163 165
173 175
172 174
166 168
167 169
162 164
171 173
168 170
171 173
169 171
169 171
167 169
171 173
167 169
166 168
183 185
173 175
172 174
184 186
174 176
179 181
169 171
183 185
177 179
177 179
175 177
170 172
164 166
158 160
174 176
179 181
173 175
174 176
177 179
178 180
175 177
188 190
184 186


In [50]:
mse_dict

{30: {'Lasso_Sklearn': 0.19350406812281928,
  'GFL_Matlab': 0.07282625745939046,
  'Proximal': 0.052373036274105},
 50: {'Lasso_Sklearn': 0.19350406812281928,
  'GFL_Matlab': 0.07641228900240571,
  'Proximal': 0.052373036274105},
 70: {'Lasso_Sklearn': 0.19350406812281928,
  'GFL_Matlab': 0.1786328762731322,
  'Proximal': 0.052373036274105},
 90: {'Lasso_Sklearn': 0.19350406812281928,
  'GFL_Matlab': 0.06197547574945381,
  'Proximal': 0.052373036274105},
 110: {'Lasso_Sklearn': 0.19350406812281928,
  'GFL_Matlab': 0.22007185415800976,
  'Proximal': 0.052373036274105},
 130: {'Lasso_Sklearn': 0.19350406812281928,
  'GFL_Matlab': 0.10640835086002927,
  'Proximal': 0.052373036274105}}

In [60]:
def get_solution(cpd_id, models, k=30):
    # get the data for the given cpd_id
    merged_df = get_data(cpd_id)
    merged_df = merged_df.drop(columns=['ModelID'])
    X = merged_df.drop(columns=['pred_pv_high_conc']).values
    y = merged_df['pred_pv_high_conc'].values
    A = laplacian_to_adjacency(L)
    n, d = X.shape

    solver = Solver(models)
    u = solver.inference(X, y, L.astype(np.float32), 
        A.astype(np.float32), k)
    return 

def get_genes(cpd_id, k):
    models = ["GFL_Matlab"]
    u = get_solution(cpd_id, models, k=k)
    u = u['GFL_Matlab'][0]
    genes = np.array(genes)
    selected_genes = genes[np.argsort(u)[-k:]]
    return selected_genes

results = {} # (cpd_id, k) : [selected_genes]
results_with_best_k = {} # cpd_id: [selected_genes]
k_values = np.arange(20, 60, 10)
models = ["GFL_Matlab"]
for cpd_id in cpd_ids_of_interest[:10]:
    best_mse = float('inf')
    best_k = None
    for k in k_values:
        mse_dict, u = get_omse(cpd_id, models, k=k)
        mes = mse_dict['GFL_Matlab']
        if mes < best_mse:
            best_mse = mes
            best_k = k
        genes = np.array(genes)
        selected_genes = genes[np.argsort(u)[-k:]]
        results[(cpd_id, k)] = selected_genes
        print(f"cpd_id: {cpd_id}, k: {k}, mse: {mes}")
    results_with_best_k[cpd_id] = results[(cpd_id, best_k)]
    print(f"Best k for cpd_id {cpd_id} is {best_k} with mse {best_mse}")


        

[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.2021
cpd_id: 12877, k: 20, mse: 0.20210593660334933
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.0728
cpd_id: 12877, k: 30, mse: 0.07282625745939046
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.0677
cpd_id: 12877, k: 40, mse: 0.0676567641041989
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.0764
cpd_id: 12877, k: 50, mse: 0.07641228900240571
Best k for cpd_id 12877 is 40 with mse 0.0676567641041989
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.4466
cpd_id: 25344, k: 20, mse: 0.44664515817340666
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.5825
cpd_id: 25344, k: 30, mse: 0.5825027384479788
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.3726
cpd_id: 25344, k: 40, mse: 0.37261581729339743
[GFL_Matlab] fitting on 636 obs …
[GFL_Matlab] out‑of‑sample MSE = 0.3512
cpd_id: 25344, k: 50, m

In [61]:
# save the results to a file
with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)
with open('results_with_best_k.pkl', 'wb') as f:
    pickle.dump(results_with_best_k, f)