In [1]:
# Name of the dataset directory under ./data/raw that the loading cell reads from.
DATASET_NAME = 'gse129705'

import matplotlib.pyplot as plt
import numpy as np

from group_lasso import LogisticGroupLasso

# Seed NumPy's global RNG so code that draws from it is repeatable between runs.
np.random.seed(0)
# Class-level switch on the estimator.
# NOTE(review): presumably enables per-iteration loss logging — confirm against the group_lasso docs.
LogisticGroupLasso.LOG_LOSSES = True

In [2]:
import os
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import lmdb
import pickle
from torch_geometric.data import Batch
import torch
import time
import torch.nn as nn
from torch_geometric.nn import GATConv, GCNConv, TAGConv, knn
from torch.nn import Linear, Dropout, Softmax, LeakyReLU
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch_geometric.nn import global_mean_pool
from hashlib import sha1
from sklearn.tree import DecisionTreeClassifier, plot_tree
from natsort import natsorted, ns

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams["font.size"] = 16
from sklearn.compose import ColumnTransformer, make_column_transformer


from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Cohort and split sizes used by the slicing-based split code in later cells.
NUM_PEOPLE = 128  # not referenced by name in the cells visible here
NUM_TRAIN = 105  # the first NUM_TRAIN rows of the feature table form the training split
NUM_VAL = 21  # number of rows per validation fold in cross_validation




In [3]:
# Load CSV From File
# The dataset lives under ./data/raw/<DATASET_NAME>; Patients.csv is read with its
# first column as the row index.
raw_dataset = os.path.join(".", "data", "raw", DATASET_NAME)
patient_data = pd.read_csv(f'{raw_dataset}/Patients.csv', index_col=0)

# Display the loaded frame.
patient_data

Unnamed: 0_level_0,RESPONSE,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
PATIENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1.43.MO3,1,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.80.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.76.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.119.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.52.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2.62.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C2.46.MO3,1,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C2.63.MO3,1,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C2.6.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912


In [4]:
# Every column label of patient_data except the first one, as a set for fast membership tests.
MASTER_GENE_SET = set(patient_data.columns[1:])

In [5]:
# Process Graphs.


class Graph:
    """A single graph: its nodes, directed edges, and the genes it covers.

    Attributes:
        name: identifier supplied by the caller.
        nodes: dict of node names to the genes in that node.
        edges: a 2 x num_edges structure; edges[0] holds the source node index of
            each edge and edges[1] the destination node index.
        genes: a sorted list of genes included in both this graph and the
            provided patient csv.
        mask: a num_nodes x num_genes boolean array. If mask[i][j] is True, gene j
            is included in node i.
    """

    def __init__(self, name, nodes, edges, genes, mask):
        self.name = name  # previously accepted but discarded
        self.nodes = nodes
        self.edges = edges
        self.genes = genes
        self.mask = mask

    def edge_matrix(self):
        """Return the dense boolean adjacency matrix of this graph.

        Returns:
            A square numpy bool array ``m`` where ``m[dst][src]`` is True for every
            edge ``src -> dst``. Its side is the number of nodes, widened if an
            edge refers to an index beyond ``len(self.nodes)`` so that every edge
            fits.
        """
        sources, destinations = self.edges[0], self.edges[1]

        # Size by the node count, but never smaller than the largest index used.
        endpoints = list(sources) + list(destinations)
        num_nodes = max(len(self.nodes), max(endpoints, default=-1) + 1)

        matrix = np.full((num_nodes, num_nodes), False)

        for src, dst in zip(sources, destinations):
            matrix[dst][src] = True

        return matrix


# returns a 2 x num_edges array.
def get_edges(graph_file):
    """Read the edge section from an open graph file.

    The first line read is the number of edges. Each of the following
    ``num_edges`` lines is ``src,dst``; any further comma-separated fields on
    the line are ignored. Exactly ``num_edges + 1`` lines are consumed, leaving
    the handle positioned at whatever follows.

    Args:
        graph_file: an open text file (or any object with ``readline``).

    Returns:
        ``[src, dst]``: two lists of ints of equal length. A graph with zero
        edges yields ``[[], []]``.

    Raises:
        ValueError: if the count or an endpoint is not an integer.
        IndexError: if an edge line has fewer than two fields.
    """
    num_edges = int(graph_file.readline())

    src, dst = [], []

    for _ in range(num_edges):
        parts = graph_file.readline().split(",")

        # int() tolerates the trailing newline left on the last field.
        src.append(int(parts[0]))
        dst.append(int(parts[1]))

    return [src, dst]


# cur_nodes is a dict, genes is a sorted list.
def get_nodes_and_genes(graph_file):
    """Read the node section from an open graph file.

    The first line read is the number of nodes. Each following line is
    ``node_name,gene,gene,...``. Genes that are not in MASTER_GENE_SET are
    dropped. A node is left out only when the empty string survives that
    filter; otherwise it is recorded, even with an empty gene list.

    Returns:
        (node_to_genes, genes): a dict of node name -> kept gene list, and the
        natural-sorted list of all genes kept across the recorded nodes.
    """
    node_count = int(graph_file.readline())

    node_to_genes = {}
    seen_genes = set()

    for _ in range(node_count):
        fields = graph_file.readline().strip().split(",")
        node_name, candidates = fields[0], fields[1:]

        kept = [gene for gene in candidates if gene in MASTER_GENE_SET]

        if "" in kept:
            continue

        node_to_genes[node_name] = kept
        seen_genes.update(kept)

    return node_to_genes, natsorted(list(seen_genes))


# a num_nodes x num_genes array. If mask[i][j] is true, that means node i contains gene j.
def get_mask(cur_nodes, genes):
    """Build the node-by-gene membership mask.

    Rows follow the natural-sorted node names of ``cur_nodes`` and columns the
    natural-sorted ``genes``.

    Args:
        cur_nodes: dict of node name -> list of genes in that node.
        genes: iterable of gene names.

    Returns:
        A numpy array built from one boolean row per node.
    """
    ordered_genes = natsorted(list(genes))
    ordered_nodes = natsorted(list(cur_nodes.keys()))

    rows = [
        [gene in cur_nodes[node] for gene in ordered_genes]
        for node in ordered_nodes
    ]

    return np.array(rows)

In [6]:
# Parse every file in <raw_dataset>/Graphs, in natural-sorted file-name order, into a Graph.
graph_dataset = os.path.join(raw_dataset, "Graphs")

graph_files = natsorted(os.listdir(graph_dataset))

graph_list = []

for graph_name in graph_files:
    # Both parsers consume the same handle in order: edges first, then nodes.
    # The context manager closes the file once both sections are read.
    with open(os.path.join(graph_dataset, graph_name)) as graph_file:
        cur_edges = get_edges(graph_file)
        cur_nodes, genes = get_nodes_and_genes(graph_file)

    mask = get_mask(cur_nodes, genes)

    graph_list.append(Graph(graph_name, cur_nodes, cur_edges, genes, mask))


In [16]:
# making the giant feature matrix for group_lasso: one column block per graph,
# holding that graph's genes in graph.genes order.
# The blocks are collected first and stacked once. The leading zero-width block
# keeps the result defined (n_patients x 0) when graph_list is empty, and takes
# the row count from patient_data instead of a hard-coded 128.
gene_blocks = [np.empty((len(patient_data), 0))]
gene_blocks.extend(patient_data[graph.genes].to_numpy() for graph in graph_list)

lasso_data = np.hstack(gene_blocks)

lasso_data

array([[5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       ...,
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ]],
      shape=(128, 11167))

In [18]:
# Show the row labels of the loaded patient frame.
patient_data.index

Index(['C1.43.MO3', 'C1.80.BL', 'C1.76.BL', 'C1.119.BL', 'C1.52.BL',
       'C1.99.BL', 'C1.101.MO3', 'C1.63.MO3', 'C1.94.MO3', 'C1.119.MO3',
       ...
       'C2.6.MO3', 'C2.33.BL', 'C2.64.BL', 'C2.94.BL', 'C2.46.BL', 'C2.62.BL',
       'C2.46.MO3', 'C2.63.MO3', 'C2.6.BL', 'C2.43.BL'],
      dtype='object', name='PATIENTS', length=128)

In [3]:
# Read the tab-separated feature table; the first column becomes the row index.
# Later cells use its 'label' column as the target and the remaining columns as features.
df = pd.read_csv("PatientsxGenes.tsv", index_col=0, sep='\t')


# Display the table.
df

Unnamed: 0,graph_0_gene_0,graph_0_gene_1,graph_0_gene_2,graph_0_gene_3,graph_0_gene_4,graph_0_gene_5,graph_0_gene_6,graph_0_gene_7,graph_0_gene_8,graph_0_gene_9,...,graph_314_gene_52,graph_314_gene_53,graph_314_gene_54,graph_314_gene_55,graph_314_gene_56,graph_314_gene_57,graph_314_gene_58,graph_314_gene_59,graph_314_gene_60,label
C2.61.BL,5.099864,0.0,6.631111,6.189033,6.374778,0.0,9.581270,4.681807,2.932213,4.617128,...,0.0,4.868588,3.312680,8.227940,5.678660,5.729981,5.851383,0.0,3.043727,1.0
C2.76.MO3,5.250945,0.0,7.368198,5.845266,6.063288,0.0,8.874326,4.540639,4.612912,5.808935,...,0.0,6.727248,3.445305,8.503600,5.676274,4.646769,5.508851,0.0,4.402722,0.0
C2.86.MO3,4.862027,0.0,6.925854,5.804134,5.675796,0.0,9.130763,4.962864,3.793411,5.060705,...,0.0,5.402557,3.992325,8.347508,5.653762,5.217144,6.067571,0.0,4.090609,0.0
C1.52.MO3,5.099594,0.0,6.386469,5.653543,5.210064,0.0,8.892744,6.084492,4.226317,4.657531,...,0.0,5.519244,3.883992,7.782738,6.011998,3.485211,4.921434,0.0,2.619233,0.0
C2.22.MO3,5.238145,0.0,6.965542,5.809111,5.820199,0.0,9.237351,5.158874,4.495957,5.365636,...,0.0,5.171172,3.671807,7.967413,6.011729,4.655333,5.395483,0.0,3.830226,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2.77.MO3,5.386681,0.0,6.440572,6.548325,6.124759,0.0,9.856809,3.535698,2.723816,4.212888,...,0.0,3.481955,3.576822,8.095545,6.027754,4.960769,6.588034,0.0,2.180776,0.0
C1.2.MO3,4.087862,0.0,5.592802,5.377042,4.658430,0.0,8.535875,4.646392,2.630166,3.881644,...,0.0,4.872023,3.305615,7.243700,4.460968,5.029951,5.305479,0.0,1.989442,0.0
C2.43.BL,3.858615,0.0,5.156918,5.137450,4.152762,0.0,8.487134,4.321683,1.363345,3.236162,...,0.0,3.640362,1.733165,6.973764,4.166258,4.891548,5.325630,0.0,1.613504,1.0
C2.121.BL,4.990077,0.0,6.211300,5.937491,5.824173,0.0,9.709085,3.708828,2.517222,4.124257,...,0.0,5.083029,3.093564,8.206971,5.460528,4.970910,5.895818,0.0,2.591436,1.0


In [4]:
# Positional split: the first NUM_TRAIN rows train, the remainder is held out for testing.
train_df = df.iloc[:NUM_TRAIN]
test_df = df.iloc[NUM_TRAIN:]

train_df

Unnamed: 0,graph_0_gene_0,graph_0_gene_1,graph_0_gene_2,graph_0_gene_3,graph_0_gene_4,graph_0_gene_5,graph_0_gene_6,graph_0_gene_7,graph_0_gene_8,graph_0_gene_9,...,graph_314_gene_52,graph_314_gene_53,graph_314_gene_54,graph_314_gene_55,graph_314_gene_56,graph_314_gene_57,graph_314_gene_58,graph_314_gene_59,graph_314_gene_60,label
C2.61.BL,5.099864,0.0,6.631111,6.189033,6.374778,0.0,9.581270,4.681807,2.932213,4.617128,...,0.0,4.868588,3.312680,8.227940,5.678660,5.729981,5.851383,0.0,3.043727,1.0
C2.76.MO3,5.250945,0.0,7.368198,5.845266,6.063288,0.0,8.874326,4.540639,4.612912,5.808935,...,0.0,6.727248,3.445305,8.503600,5.676274,4.646769,5.508851,0.0,4.402722,0.0
C2.86.MO3,4.862027,0.0,6.925854,5.804134,5.675796,0.0,9.130763,4.962864,3.793411,5.060705,...,0.0,5.402557,3.992325,8.347508,5.653762,5.217144,6.067571,0.0,4.090609,0.0
C1.52.MO3,5.099594,0.0,6.386469,5.653543,5.210064,0.0,8.892744,6.084492,4.226317,4.657531,...,0.0,5.519244,3.883992,7.782738,6.011998,3.485211,4.921434,0.0,2.619233,0.0
C2.22.MO3,5.238145,0.0,6.965542,5.809111,5.820199,0.0,9.237351,5.158874,4.495957,5.365636,...,0.0,5.171172,3.671807,7.967413,6.011729,4.655333,5.395483,0.0,3.830226,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1.101.MO3,4.824971,0.0,5.959500,5.194636,5.918340,0.0,9.368523,6.964259,2.628767,4.014282,...,0.0,6.453118,2.046679,8.413776,4.608255,4.957430,5.010219,0.0,2.975822,0.0
C2.171.MO3,5.259408,0.0,7.023045,5.627778,6.037689,0.0,9.118333,6.061628,3.358259,5.134460,...,0.0,4.851371,3.179759,8.480967,4.974258,5.643474,5.430232,0.0,3.311875,0.0
C2.127.BL,3.197020,0.0,5.218371,4.303904,3.716040,0.0,7.136352,6.452500,1.822342,3.376732,...,0.0,3.439244,2.907609,6.320674,3.736869,3.934664,4.196562,0.0,1.832822,1.0
C2.61.MO3,5.328716,0.0,6.461002,6.494040,6.549103,0.0,9.929786,4.170067,3.375894,4.482450,...,0.0,4.607536,3.775202,8.251563,6.189978,4.973026,5.919171,0.0,2.879740,0.0


In [5]:
# Separate the target column from the feature columns of the training split.
# drop() already returns a new frame, so no explicit copy is needed first.
train_y = train_df['label']
train_X = train_df.drop('label', axis=1)


train_y

C2.61.BL      1.0
C2.76.MO3     0.0
C2.86.MO3     0.0
C1.52.MO3     0.0
C2.22.MO3     0.0
             ... 
C1.101.MO3    0.0
C2.171.MO3    0.0
C2.127.BL     1.0
C2.61.MO3     0.0
C2.56.MO3     0.0
Name: label, Length: 105, dtype: float64

In [6]:
# Separate the target column from the feature columns of the held-out split.
# drop() already returns a new frame, so no explicit copy is needed first.
test_y = test_df['label']
test_X = test_df.drop('label', axis=1)

test_X

Unnamed: 0,graph_0_gene_0,graph_0_gene_1,graph_0_gene_2,graph_0_gene_3,graph_0_gene_4,graph_0_gene_5,graph_0_gene_6,graph_0_gene_7,graph_0_gene_8,graph_0_gene_9,...,graph_314_gene_51,graph_314_gene_52,graph_314_gene_53,graph_314_gene_54,graph_314_gene_55,graph_314_gene_56,graph_314_gene_57,graph_314_gene_58,graph_314_gene_59,graph_314_gene_60
C1.115.BL,4.579738,0.0,7.086056,6.170477,5.868474,0.0,8.912133,3.846007,2.723642,4.755634,...,4.190129,0.0,6.215937,3.882316,7.771184,5.548407,4.812465,6.043094,0.0,3.894197
C1.68.MO3,5.496297,0.0,6.58564,6.442819,6.515203,0.0,9.455796,4.254486,3.635362,4.85811,...,5.317296,0.0,4.306818,3.938557,7.896287,6.4567,4.353505,5.614793,0.0,3.422321
C1.55.BL,4.50178,0.0,5.62819,4.884413,4.926449,0.0,8.111503,7.01807,2.736666,4.173879,...,3.389166,0.0,3.498189,3.85463,7.27677,5.732089,4.276269,4.436545,0.0,2.0725
C2.97.MO3,4.684398,0.0,6.892597,5.684683,4.687801,0.0,8.535487,5.287411,2.904564,4.510106,...,3.804701,0.0,5.681827,3.795862,8.238235,5.058021,4.869885,5.605889,0.0,3.468707
C1.43.MO3,3.065627,0.0,7.243805,6.495665,5.437992,0.0,8.473976,4.662108,3.796077,3.721006,...,3.053024,0.0,4.836336,3.158451,7.223731,4.438744,5.454349,6.421616,0.0,1.893416
C1.45.BL,4.693142,0.0,6.953202,6.030978,5.773766,0.0,8.848165,4.350963,3.444499,4.776649,...,4.150358,0.0,6.353143,4.039446,8.470705,5.13635,4.546404,5.228992,0.0,3.76851
C1.45.MO3,4.920704,0.0,6.635975,5.802205,5.023377,0.0,8.802582,5.113038,3.304601,4.574266,...,2.833619,0.0,5.836796,3.175561,8.41408,4.42042,5.330072,5.77302,0.0,3.764148
C2.94.BL,4.267309,0.0,6.672972,5.967642,4.802401,0.0,8.579089,4.732249,3.800739,5.155296,...,3.208666,0.0,4.731055,4.079794,7.612175,5.314546,5.695622,6.413218,0.0,3.47253
C1.46.MO3,4.893523,0.0,5.483575,5.96333,5.442413,0.0,9.659391,5.175251,2.411041,3.407292,...,4.087238,0.0,4.337027,2.069919,7.476551,5.326955,5.666058,6.10425,0.0,1.336775
C2.8.MO3,5.206864,0.0,6.317996,6.14434,5.734025,0.0,9.621038,4.461648,2.253042,4.083625,...,3.787924,0.0,4.948714,3.284752,8.66993,5.456843,5.077451,6.308996,0.0,2.765316


In [7]:
# Read the comma-separated group sizes and expand them into one group id per
# feature column: id i is repeated group_sizes[i] times.
# The context manager closes the file handle, which was previously left open.
with open("num_genes.csv") as sizes_file:
    group_sizes = [int(size) for size in sizes_file.read().split(",")]

# np.repeat keeps an integer dtype even when a group has size 0.
groups = np.repeat(np.arange(len(group_sizes)), group_sizes)

In [8]:
# Show the per-feature group ids built in the previous cell.
groups

array([  0,   0,   0, ..., 314, 314, 314], shape=(12132,))

In [9]:
# Baseline fit on the whole training split with both penalties set to zero
# (group_reg=0.00, l1_reg=0).
# NOTE(review): confirm in the group_lasso docs how subsampling_scheme=1 is interpreted.
# `supress_warning` is spelled this way in the call; it is a runtime keyword, do not "fix" it here.
gl = LogisticGroupLasso(
    groups=groups,
    group_reg=0.00,
    l1_reg=0,
    scale_reg="inverse_group_size",
    subsampling_scheme=1,
    supress_warning=True,
)

# Last expression of the cell, so the fitted estimator is also displayed.
gl.fit(train_X, train_y)

You used subsampling then this is expected, otherwise, try increasing the number of iterations or decreasing the tolerance.


0,1,2
,groups,"array([ 0, ...hape=(12132,))"
,group_reg,0.0
,l1_reg,0
,n_iter,100
,tol,1e-05
,scale_reg,'inverse_group_size'
,subsampling_scheme,1
,fit_intercept,True
,random_state,
,warm_start,False


In [10]:
# Pull coefficients, sparsity mask and training-set predictions from the fitted estimator.
w_hat = gl.coef_
sparsity_mask = gl.sparsity_mask_
pred_c = gl.predict(train_X)

# Mean of the element-wise match between predictions and training labels.
accuracy = (pred_c == train_y).mean()

# Report in one call: We are hoping to overfit at this point.
print(
    f"Number variables: {len(sparsity_mask)}",
    f"Number of chosen variables: {sparsity_mask.sum()}",
    f"Accuracy: {accuracy}",
    sep="\n",
)

Number variables: 12132
Number of chosen variables: 8423
Accuracy: 1.0


In [11]:
# NOTE(review): this cell repeats the preceding training-set report verbatim and
# produces the same output; consider deleting one of the two.
# Extract info from estimator
pred_c = gl.predict(train_X)
sparsity_mask = gl.sparsity_mask_
w_hat = gl.coef_

# Compute performance metrics
accuracy = (pred_c == train_y).mean()

# Print results: We are hoping to overfit at this point.
print(f"Number variables: {len(sparsity_mask)}")
print(f"Number of chosen variables: {sparsity_mask.sum()}")
print(f"Accuracy: {accuracy}")

Number variables: 12132
Number of chosen variables: 8423
Accuracy: 1.0


In [12]:
def cross_validation(X_train, y_train, i, num_val=None, num_folds=5):
    """Split training data into the i-th train/validation fold by row position.

    Folds are contiguous blocks of ``num_val`` rows. The last fold
    (``i == num_folds - 1``) is always the final ``num_val`` rows, so with a
    row count that is not an exact multiple of ``num_val`` the rows between
    ``(num_folds - 1) * num_val`` and ``len - num_val`` are never validated.

    Args:
        X_train: row-sliceable features (numpy array or pandas DataFrame).
        y_train: row-sliceable labels aligned with ``X_train``.
        i: zero-based fold index.
        num_val: rows per validation fold; defaults to the notebook constant
            ``NUM_VAL``.
        num_folds: total number of folds; defaults to 5, the previously
            hard-coded value.

    Returns:
        ``(X_train, y_train, X_val, y_val)``. For the last fold all four are
        plain slices of the inputs. For earlier folds the training parts are
        produced by ``np.concatenate`` while the validation parts remain slices.
    """
    if num_val is None:
        num_val = NUM_VAL

    if i == num_folds - 1:
        X_val = X_train[-num_val:]
        y_val = y_train[-num_val:]

        X_train = X_train[0:-num_val]
        y_train = y_train[0:-num_val]
    else:
        start, stop = i * num_val, (i + 1) * num_val

        X_val = X_train[start:stop]
        y_val = y_train[start:stop]

        # Everything outside [start, stop) becomes the training part.
        X_train = np.concatenate((X_train[:start], X_train[stop:]))
        y_train = np.concatenate((y_train[:start], y_train[stop:]))

    return X_train, y_train, X_val, y_val

In [13]:
from sklearn.metrics import accuracy_score, roc_auc_score

def predict(X, true_y, gl):
    """Return the accuracy of ``gl`` on ``X`` against ``true_y``.

    Args:
        X: feature rows passed to ``gl.predict``.
        true_y: labels aligned with the rows of ``X``.
        gl: an object exposing ``predict(X)``.

    Returns:
        The mean of the element-wise comparison ``gl.predict(X) == true_y``.
    """
    pred_y = gl.predict(X)

    return (pred_y == true_y).mean()

def predict_auc(X, true_y, gl):
    """Return the ROC AUC of ``gl`` on ``X`` against ``true_y``.

    The score passed to ``roc_auc_score`` is column 1 of ``gl.predict_proba(X)``.
    """
    positive_scores = gl.predict_proba(X)[:, 1]
    return roc_auc_score(true_y, positive_scores)

In [14]:
# Keep handles on the full training split; the fold variables below reuse the
# names X_train / y_train.
original_X_train = train_X
orig_y_train = train_y


# Fold 0 of the validation split. The estimator for it is built in the next
# cell, so none is constructed here.
X_train, y_train, X_val, y_val = cross_validation(original_X_train, orig_y_train, 0)

X_train



array([[3.7898796, 0.       , 5.53182  , ..., 5.7420473, 0.       ,
        3.0537667],
       [5.490886 , 0.       , 5.4886785, ..., 6.152848 , 0.       ,
        0.       ],
       [5.4704638, 0.       , 7.261301 , ..., 4.9246135, 0.       ,
        4.697723 ],
       ...,
       [3.19702  , 0.       , 5.218371 , ..., 4.196562 , 0.       ,
        1.8328221],
       [5.328716 , 0.       , 6.4610023, ..., 5.919171 , 0.       ,
        2.8797402],
       [4.4044757, 0.       , 5.444377 , ..., 5.897336 , 0.       ,
        2.0483356]], shape=(84, 12132))

In [15]:
# Fit on the current X_train / y_train with both penalties set to zero
# (group_reg=0.00, l1_reg=0).
# NOTE(review): confirm in the group_lasso docs how subsampling_scheme=1 is interpreted.
gl = LogisticGroupLasso(
    groups=groups,
    group_reg=0.00,
    l1_reg=0,
    scale_reg="inverse_group_size",
    subsampling_scheme=1,
    supress_warning=True,
)

# Last expression of the cell, so the fitted estimator is also displayed.
gl.fit(X_train, y_train)

You used subsampling then this is expected, otherwise, try increasing the number of iterations or decreasing the tolerance.


0,1,2
,groups,"array([ 0, ...hape=(12132,))"
,group_reg,0.0
,l1_reg,0
,n_iter,100
,tol,1e-05
,scale_reg,'inverse_group_size'
,subsampling_scheme,1
,fit_intercept,True
,random_state,
,warm_start,False


In [16]:
# predict_auc returns ROC AUC, so the numbers are labelled AUC rather than accuracy.
print("Train AUC: {}".format(predict_auc(X_train, y_train, gl)))
print("Val AUC: {}".format(predict_auc(X_val, y_val, gl)))

Train Acc: 1.0
Val Acc: 0.8272727272727273


In [17]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not just this cell.
warnings.filterwarnings('ignore')


# Grid search over group_reg x l1_reg, scored by mean ROC AUC (predict_auc) over 5 folds.
# round() keeps the grid values, and therefore the dict keys, free of float noise
# such as 0.6000000000000001.
group_reg = [round(0.2 * i, 1) for i in range(6)]
l1_reg = [round(0.2 * i, 1) for i in range(6)]
# The names say "accuracies" but the stored values are mean AUCs keyed by (group_reg, l1_reg).
train_accuracies = dict()
val_accuracies = dict()


for gr in group_reg:
    for lreg in l1_reg:
        print("{}: {}".format(gr, lreg))
        cur_val_acc = []
        cur_train_acc = []
        for k in range(5):
            print(k)
            X_train, y_train, X_val, y_val = cross_validation(original_X_train, orig_y_train, k)

            # A fresh estimator per fold so no state carries over between fits.
            gl = LogisticGroupLasso(
                groups=groups,
                group_reg=gr,
                l1_reg=lreg,
                scale_reg="inverse_group_size",
                subsampling_scheme=1,
                supress_warning=True,
            )

            gl.fit(X_train, y_train)
            cur_train_acc.append(predict_auc(X_train, y_train, gl))
            cur_val_acc.append(predict_auc(X_val, y_val, gl))
        # Each mean divides by the length of its own list.
        total_train_acc = sum(cur_train_acc) / len(cur_train_acc)
        total_val_acc = sum(cur_val_acc) / len(cur_val_acc)

        train_accuracies[(gr, lreg)] = total_train_acc
        val_accuracies[(gr, lreg)] = total_val_acc
        print("Train AUC: {}".format(total_train_acc))
        print("Val AUC: {}".format(total_val_acc))



0.0: 0.0
0
1
2
3
4
Train ACC: 1.0
Val ACC: 0.7507757057757057
0.0: 0.2
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 0.4
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 0.6000000000000001
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 0.8
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 1.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.0
0
1
2
3
4
Train ACC: 1.0
Val ACC: 0.7676936026936027
0.2: 0.2
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.4
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.6000000000000001
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.8
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 1.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.0
0
1
2
3
4
Train ACC: 0.9987462861696894
Val ACC: 0.7751800051800052
0.4: 0.2
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.4
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.6000000000000001
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.8
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 1.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.6000000000000001: 0.0
0
1
2
3
4
Train ACC: 0.98

In [19]:
# Rows per test fold used by cross_validation_test.
NUM_TEST = 23

# Stack the training split on top of the held-out split, features and labels alike.
full_X = pd.concat([original_X_train, test_X])
full_y = pd.concat([orig_y_train, test_y])

full_X

Unnamed: 0,graph_0_gene_0,graph_0_gene_1,graph_0_gene_2,graph_0_gene_3,graph_0_gene_4,graph_0_gene_5,graph_0_gene_6,graph_0_gene_7,graph_0_gene_8,graph_0_gene_9,...,graph_314_gene_51,graph_314_gene_52,graph_314_gene_53,graph_314_gene_54,graph_314_gene_55,graph_314_gene_56,graph_314_gene_57,graph_314_gene_58,graph_314_gene_59,graph_314_gene_60
C2.61.BL,5.099864,0.0,6.631111,6.189033,6.374778,0.0,9.581270,4.681807,2.932213,4.617128,...,3.833158,0.0,4.868588,3.312680,8.227940,5.678660,5.729981,5.851383,0.0,3.043727
C2.76.MO3,5.250945,0.0,7.368198,5.845266,6.063288,0.0,8.874326,4.540639,4.612912,5.808935,...,4.654890,0.0,6.727248,3.445305,8.503600,5.676274,4.646769,5.508851,0.0,4.402722
C2.86.MO3,4.862027,0.0,6.925854,5.804134,5.675796,0.0,9.130763,4.962864,3.793411,5.060705,...,4.492044,0.0,5.402557,3.992325,8.347508,5.653762,5.217144,6.067571,0.0,4.090609
C1.52.MO3,5.099594,0.0,6.386469,5.653543,5.210064,0.0,8.892744,6.084492,4.226317,4.657531,...,3.669328,0.0,5.519244,3.883992,7.782738,6.011998,3.485211,4.921434,0.0,2.619233
C2.22.MO3,5.238145,0.0,6.965542,5.809111,5.820199,0.0,9.237351,5.158874,4.495957,5.365636,...,4.206443,0.0,5.171172,3.671807,7.967413,6.011729,4.655333,5.395483,0.0,3.830226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2.77.MO3,5.386681,0.0,6.440572,6.548325,6.124759,0.0,9.856809,3.535698,2.723816,4.212888,...,5.207813,0.0,3.481955,3.576822,8.095545,6.027754,4.960769,6.588034,0.0,2.180776
C1.2.MO3,4.087862,0.0,5.592802,5.377042,4.658430,0.0,8.535875,4.646392,2.630166,3.881644,...,3.322396,0.0,4.872023,3.305615,7.243700,4.460968,5.029951,5.305479,0.0,1.989442
C2.43.BL,3.858615,0.0,5.156918,5.137450,4.152762,0.0,8.487134,4.321683,1.363345,3.236162,...,2.680716,0.0,3.640362,1.733165,6.973764,4.166258,4.891548,5.325630,0.0,1.613504
C2.121.BL,4.990077,0.0,6.211300,5.937491,5.824173,0.0,9.709085,3.708828,2.517222,4.124257,...,4.147527,0.0,5.083029,3.093564,8.206971,5.460528,4.970910,5.895818,0.0,2.591436


In [20]:
def cross_validation_test(i, X=None, y=None, num_test=None, num_folds=5):
    """Split the full data set into the i-th train/test fold by row position.

    Folds are contiguous blocks of ``num_test`` rows. The last fold
    (``i == num_folds - 1``) is always the final ``num_test`` rows.

    NOTE(review): when the row count is not an exact multiple of ``num_test``,
    the rows between ``(num_folds - 1) * num_test`` and ``len - num_test`` never
    appear in any test fold. Confirm that this is intended.

    Args:
        i: zero-based fold index.
        X: row-sliceable features; defaults to the notebook global ``full_X``.
        y: row-sliceable labels; defaults to the notebook global ``full_y``.
        num_test: rows per test fold; defaults to the notebook constant
            ``NUM_TEST``.
        num_folds: total number of folds; defaults to 5, the previously
            hard-coded value.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. For the last fold all four are
        plain slices of the inputs. For earlier folds the training parts are
        produced by ``np.concatenate`` while the test parts remain slices.
    """
    if X is None:
        X = full_X
    if y is None:
        y = full_y
    if num_test is None:
        num_test = NUM_TEST

    if i == num_folds - 1:
        X_test = X[-num_test:]
        y_test = y[-num_test:]

        X_train = X[0:-num_test]
        y_train = y[0:-num_test]
    else:
        start, stop = i * num_test, (i + 1) * num_test

        X_test = X[start:stop]
        y_test = y[start:stop]

        # Everything outside [start, stop) becomes the training part.
        X_train = np.concatenate((X[:start], X[stop:]))
        y_train = np.concatenate((y[:start], y[stop:]))

    return X_train, y_train, X_test, y_test

In [21]:
# Fold index 4 of cross_validation_test: train on everything except the last
# NUM_TEST rows and score those held-out rows.
X_train, y_train, X_test, y_test = cross_validation_test(4)
# group_reg=0.4 with no l1 penalty.
gl = LogisticGroupLasso(
    groups=groups,
    group_reg=0.4,
    l1_reg=0,
    scale_reg="inverse_group_size",
    subsampling_scheme=1,
    supress_warning=True,
)

gl.fit(X_train, y_train)
# NOTE: despite its name, `accuracy` holds the ROC AUC returned by predict_auc.
accuracy = predict_auc(X_test, y_test, gl)
accuracy

You used subsampling then this is expected, otherwise, try increasing the number of iterations or decreasing the tolerance.


0.8560606060606061

In [30]:
# Group ids selected by the fitted estimator, as plain Python values in ascending order.
chosen = sorted(group_id.item() for group_id in gl.chosen_groups_)

chosen

[16,
 24,
 89,
 92,
 94,
 95,
 96,
 97,
 99,
 101,
 105,
 106,
 107,
 108,
 109,
 110,
 114,
 119,
 120,
 121,
 123,
 128,
 129,
 133,
 137,
 140,
 141,
 145,
 147,
 150,
 151,
 153,
 155,
 156,
 159,
 161,
 162,
 166,
 177,
 189,
 214,
 216,
 233,
 234,
 235,
 239,
 247,
 248,
 249,
 250,
 252,
 253,
 254,
 257,
 260,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 268,
 269,
 270,
 271,
 272,
 273,
 277,
 278,
 279,
 313,
 314]

In [32]:
a = [13,
 26,
 56,
 94,
 96,
 108,
 127,
 138,
 147,
 150,
 159,
 180,
 184,
 199,
 205,
 216,
 219,
 235,
 238,
 272,
 273,
 278,
 304]

both = [num for num in a if num in chosen]

In [33]:
# How many reference indices were also chosen by the estimator.
len(both)

11

In [34]:
# Size of the reference list, for comparison with the overlap count.
len(a)

23

In [36]:
# Total number of groups the fitted estimator selected.
len(gl.chosen_groups_)

73

In [None]:
#Stopped running here

In [53]:
# Score the selected hyperparameters on each of the 5 folds produced by
# cross_validation_test, then average.
best_gr = 0.2
best_lreg = 0

# Per-fold ROC AUC values (predict_auc); the list name is kept as before.
accs = []

for i in range(5):
    X_train, y_train, X_test, y_test = cross_validation_test(i)
    # A fresh estimator per fold, configured from best_gr / best_lreg rather than
    # repeating the literals.
    gl = LogisticGroupLasso(
        groups=groups,
        group_reg=best_gr,
        l1_reg=best_lreg,
        scale_reg="inverse_group_size",
        subsampling_scheme=1,
        supress_warning=True,
    )

    gl.fit(X_train, y_train)
    accuracy = predict_auc(X_test, y_test, gl)
    print("AUC {}: {}".format(i, accuracy))
    accs.append(accuracy)

print(sum(accs) / len(accs))

ACC 0: 0.782608695652174
ACC 1: 0.7391304347826086
ACC 2: 0.8260869565217391
ACC 3: 0.6086956521739131
ACC 4: 0.8260869565217391
0.7565217391304349


In [25]:
# Scratch check: a per-fold score from the run above multiplied by 23.
# NOTE(review): 23 presumably stands for NUM_TEST (rows per test fold) — confirm, or delete this cell.
0.8260869565217391 * 23

19.0