In [1]:
DATASET_NAME = 'gse129705'

import matplotlib.pyplot as plt
import numpy as np

from group_lasso import LogisticGroupLasso


# Fixed seed: applied to NumPy's global RNG here and reused for the DataFrame shuffle later on.
seed_value = 314159
np.random.seed(seed_value)

# Class-level switch on the estimator; presumably enables loss logging during fitting — TODO confirm.
LogisticGroupLasso.LOG_LOSSES = True

In [2]:
import os
import math
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import lmdb
import pickle
from torch_geometric.data import Batch
import torch
import time
import torch.nn as nn
from torch_geometric.nn import GATConv, GCNConv, TAGConv, knn
from torch.nn import Linear, Dropout, Softmax, LeakyReLU
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch_geometric.nn import global_mean_pool
from hashlib import sha1
from sklearn.tree import DecisionTreeClassifier, plot_tree
from natsort import natsorted, ns

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams["font.size"] = 16
from sklearn.compose import ColumnTransformer, make_column_transformer


from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier


# Proportion of examples assigned to the training split (also used to size the validation fold).
TRAIN_PROP = 0.8



In [3]:
# Load CSV From File
# The dataset directory is ./data/raw/<DATASET_NAME>; Patients.csv is read with its
# first column as the row index.
raw_dataset = os.path.join(".", "data", "raw", DATASET_NAME)
patient_data = pd.read_csv(f'{raw_dataset}/Patients.csv', index_col=0)

# NOTE(review): the preview rendered for this cell shows the same expression values on
# every row — confirm that Patients.csv really varies across patients.
patient_data

Unnamed: 0_level_0,RESPONSE,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
PATIENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1.43.MO3,1,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.80.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.76.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.119.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C1.52.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2.62.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C2.46.MO3,1,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C2.63.MO3,1,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912
C2.6.BL,0,3.21886,2.92638,0.0,-0.0017,-0.11964,0.0,0.0,1.21708,-0.01241,...,0.4844,0.67825,1.27458,2.56664,3.65988,0.0,2.08857,8.1522,4.08362,2.54912


In [4]:
# Split sizes, all derived from the number of patient rows and TRAIN_PROP.
NUM_EXAMPLES = patient_data.shape[0]
NUM_TRAIN = int(NUM_EXAMPLES * TRAIN_PROP)
# Validation fold size: the held-out share of the training split, rounded up.
NUM_VAL = math.ceil((1 - TRAIN_PROP) * NUM_TRAIN)
NUM_TEST = NUM_EXAMPLES - NUM_TRAIN

NUM_VAL

21

In [5]:
# Every column of the patient table except the first one (the label column).
MASTER_GENE_SET = set(patient_data.columns[1:])

In [6]:
# Process Graphs.


class Graph:
    """Container for one graph: its nodes, edges, gene list and node/gene mask.

    All constructor arguments are stored unchanged as attributes of the same name.
    """

    def __init__(self, name, nodes, edges, genes, mask):
        self.name = name  # a string to identify this graph
        self.nodes = nodes  # a dict of node names to the genes in that node
        # a 2 x num_edges array, where the first row is source and the second is dest
        self.edges = edges
        # a sorted list of genes included in both this graph and the provided patient csv
        self.genes = genes
        # a num_nodes * num_genes boolean array; mask[i][j] is True when gene j is in node i
        self.mask = mask

    def edge_matrix(self):
        """Return a dense boolean adjacency matrix.

        ``matrix[dst][src]`` is True for every ``(src, dst)`` pair in
        ``self.edges``. The matrix is square and large enough to hold every
        node in ``self.nodes`` as well as every index referenced by an edge.

        Returns:
            numpy.ndarray of dtype bool with shape ``(size, size)``.
        """
        sources, targets = self.edges[0], self.edges[1]

        # Size by whichever is larger: the declared node count or the highest
        # edge endpoint, so indexing below can never run out of bounds.
        size = max(
            [len(self.nodes or ())]
            + [index + 1 for index in sources]
            + [index + 1 for index in targets]
        )

        matrix = np.full((size, size), False)

        for src, dst in zip(sources, targets):
            matrix[dst][src] = True

        return matrix


def get_edges(graph_file):
    """Read the edge section from an open graph file.

    The first line read is the number of edges; each of the following
    ``num_edges`` lines is ``src,dst`` (any further comma-separated fields
    are ignored).

    Args:
        graph_file: a text file object positioned at the edge-count line.

    Returns:
        A 2 x num_edges list ``[sources, destinations]`` of ints. A file
        declaring zero edges yields ``[[], []]``.
    """
    num_edges = int(graph_file.readline())

    sources = []
    destinations = []

    for _ in range(num_edges):
        parts = graph_file.readline().split(",")

        sources.append(int(parts[0]))
        destinations.append(int(parts[1]))

    # Building the two rows directly avoids unpacking zip(*[]) which fails
    # when there are no edges.
    return [sources, destinations]


def get_nodes_and_genes(graph_file):
    """Read the node section from an open graph file.

    The first line read is the number of nodes; each following line is
    ``node_name,gene,gene,...``. Only genes present in MASTER_GENE_SET are kept.

    Returns:
        (nodes, genes): ``nodes`` maps node name -> list of kept genes;
        ``genes`` is the naturally sorted list of every kept gene.
    """
    node_count = int(graph_file.readline())

    node_to_genes = {}
    seen_genes = set()

    for _ in range(node_count):
        node_name, *candidates = graph_file.readline().strip().split(",")

        kept = [gene for gene in candidates if gene in MASTER_GENE_SET]

        # Nodes whose kept genes contain an empty name are skipped entirely.
        if "" in kept:
            continue

        node_to_genes[node_name] = kept
        seen_genes.update(kept)

    return node_to_genes, natsorted(list(seen_genes))


def get_mask(cur_nodes, genes):
    """Build the node-by-gene membership mask.

    Rows follow the naturally sorted node names, columns the naturally sorted
    genes; entry [i][j] is True when node i contains gene j.

    Returns:
        numpy array built from the list of per-node boolean rows.
    """
    ordered_genes = natsorted(list(genes))
    ordered_nodes = natsorted(list(cur_nodes.keys()))

    rows = [
        [gene in cur_nodes[node_name] for gene in ordered_genes]
        for node_name in ordered_nodes
    ]

    return np.array(rows)

In [7]:
graph_dataset = os.path.join(raw_dataset, "Graphs")

# Sort the directory listing so graphs are always processed in the same order.
graph_files = natsorted(os.listdir(graph_dataset))

# The position of a graph in graph_list is later used as its group id when the
# lasso_groups list is built (one id per gene column of that graph).
graph_list = []

for graph_name in graph_files:
    # Each file is read as an edge section followed by a node section; the
    # context manager closes the handle once both have been parsed.
    with open(os.path.join(graph_dataset, graph_name)) as graph_file:
        cur_edges = get_edges(graph_file)
        cur_nodes, genes = get_nodes_and_genes(graph_file)

    mask = get_mask(cur_nodes, genes)

    graph_list.append(Graph(graph_name, cur_nodes, cur_edges, genes, mask))


In [8]:
# making the giant dataframe for group_lasso:
# one label column followed by one block of gene columns per graph.

label_column = np.expand_dims(np.array(patient_data['RESPONSE']), axis=1)

col_names = ["label"]
# lasso_groups[k] is the index of the graph that the k-th feature column belongs to.
lasso_groups = []
column_blocks = [label_column]

for i, graph in enumerate(graph_list):
    genes = graph.genes
    name = graph.name
    lasso_groups.extend([i] * len(genes))

    col_names.extend([f"{name}_{gene}" for gene in genes])
    column_blocks.append(np.array(patient_data[genes]))

# Stack once at the end: calling hstack inside the loop re-copies the whole
# growing matrix on every iteration.
lasso_data = np.hstack(column_blocks)

In [9]:
# Wrap the stacked matrix in a frame: rows keyed by patient, columns by "<graph>_<gene>".
df = pd.DataFrame(
    data=lasso_data,
    index=patient_data.index,
    columns=col_names,
)

df

Unnamed: 0_level_0,label,AGE-RAGE signaling pathway in diabetic complications.txt_AGER,AGE-RAGE signaling pathway in diabetic complications.txt_AGT,AGE-RAGE signaling pathway in diabetic complications.txt_AGTR1,AGE-RAGE signaling pathway in diabetic complications.txt_AKT3,AGE-RAGE signaling pathway in diabetic complications.txt_BAX,AGE-RAGE signaling pathway in diabetic complications.txt_BCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CASP3,AGE-RAGE signaling pathway in diabetic complications.txt_CCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CCND1,...,p53 signaling pathway.txt_STEAP3,p53 signaling pathway.txt_TEP1,p53 signaling pathway.txt_THBS1,p53 signaling pathway.txt_TNFRSF10B,p53 signaling pathway.txt_TP53,p53 signaling pathway.txt_TP53AIP1,p53 signaling pathway.txt_TP73,p53 signaling pathway.txt_TSC2,p53 signaling pathway.txt_ZMAT3,p53 signaling pathway.txt_ZNF385A
PATIENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1.43.MO3,1.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.80.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.76.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.119.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.52.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2.62.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.46.MO3,1.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.63.MO3,1.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.6.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296


In [10]:
# Shuffle all rows reproducibly, then cut the shuffled frame at NUM_TRAIN.
shuffled_df = df.sample(frac=1, random_state=seed_value)

train_df = shuffled_df.iloc[:NUM_TRAIN]
test_df = shuffled_df.iloc[NUM_TRAIN:]

train_df

Unnamed: 0_level_0,label,AGE-RAGE signaling pathway in diabetic complications.txt_AGER,AGE-RAGE signaling pathway in diabetic complications.txt_AGT,AGE-RAGE signaling pathway in diabetic complications.txt_AGTR1,AGE-RAGE signaling pathway in diabetic complications.txt_AKT3,AGE-RAGE signaling pathway in diabetic complications.txt_BAX,AGE-RAGE signaling pathway in diabetic complications.txt_BCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CASP3,AGE-RAGE signaling pathway in diabetic complications.txt_CCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CCND1,...,p53 signaling pathway.txt_STEAP3,p53 signaling pathway.txt_TEP1,p53 signaling pathway.txt_THBS1,p53 signaling pathway.txt_TNFRSF10B,p53 signaling pathway.txt_TP53,p53 signaling pathway.txt_TP53AIP1,p53 signaling pathway.txt_TP73,p53 signaling pathway.txt_TSC2,p53 signaling pathway.txt_ZMAT3,p53 signaling pathway.txt_ZNF385A
PATIENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1.36.MO3,1.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.71.MO3,1.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.42.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.28.MO3,1.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.55.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1.52.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.6.MO3,1.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.23.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.94.BL,0.0,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296


In [11]:
# Separate the target column from the feature columns of the training split.
train_y = train_df['label']

# drop() returns a new frame, so no explicit copy of train_df is needed first.
train_X = train_df.drop('label', axis=1)


train_y

PATIENTS
C1.36.MO3    1.0
C1.71.MO3    1.0
C1.42.BL     0.0
C2.28.MO3    1.0
C1.55.BL     0.0
            ... 
C1.52.BL     0.0
C2.6.MO3     1.0
C1.23.BL     0.0
C2.94.BL     0.0
C2.6.BL      0.0
Name: label, Length: 102, dtype: float64

In [12]:
# Separate the target column from the feature columns of the test split.
test_y = test_df['label']

# drop() returns a new frame, so no explicit copy of test_df is needed first.
test_X = test_df.drop('label', axis=1)

test_X

Unnamed: 0_level_0,AGE-RAGE signaling pathway in diabetic complications.txt_AGER,AGE-RAGE signaling pathway in diabetic complications.txt_AGT,AGE-RAGE signaling pathway in diabetic complications.txt_AGTR1,AGE-RAGE signaling pathway in diabetic complications.txt_AKT3,AGE-RAGE signaling pathway in diabetic complications.txt_BAX,AGE-RAGE signaling pathway in diabetic complications.txt_BCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CASP3,AGE-RAGE signaling pathway in diabetic complications.txt_CCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CCND1,AGE-RAGE signaling pathway in diabetic complications.txt_COL1A1,...,p53 signaling pathway.txt_STEAP3,p53 signaling pathway.txt_TEP1,p53 signaling pathway.txt_THBS1,p53 signaling pathway.txt_TNFRSF10B,p53 signaling pathway.txt_TP53,p53 signaling pathway.txt_TP53AIP1,p53 signaling pathway.txt_TP73,p53 signaling pathway.txt_TSC2,p53 signaling pathway.txt_ZMAT3,p53 signaling pathway.txt_ZNF385A
PATIENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1.12.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.6.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.96.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.61.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.100.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.21.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.98.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.50.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.26.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.12.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296


In [13]:
# Alias passed as the `groups=` argument of the estimators constructed in later cells.
groups = lasso_groups

In [14]:
# Baseline fit on the whole training split with both penalties set to zero.
# NOTE(review): `supress_warning` is spelled exactly as passed here; presumably that is
# the library's own keyword name — confirm before "correcting" it.
gl = LogisticGroupLasso(
    groups=groups,
    group_reg=0.00,
    l1_reg=0,
    scale_reg="inverse_group_size",
    subsampling_scheme=1,
    supress_warning=True,
)

gl.fit(train_X, train_y)



0,1,2
,groups,"[0, 0, ...]"
,group_reg,0.0
,l1_reg,0
,n_iter,100
,tol,1e-05
,scale_reg,'inverse_group_size'
,subsampling_scheme,1
,fit_intercept,True
,random_state,
,warm_start,False


In [15]:
# Evaluate the fitted estimator on the same data it was trained on.
sparsity_mask = gl.sparsity_mask_
w_hat = gl.coef_
pred_c = gl.predict(train_X)

# Fraction of training rows whose predicted class equals the label.
accuracy = (pred_c == train_y).mean()

# Print results: We are hoping to overfit at this point.
print(
    f"Number variables: {len(sparsity_mask)}",
    f"Number of chosen variables: {sparsity_mask.sum()}",
    f"Accuracy: {accuracy}",
    sep="\n",
)

Number variables: 11167
Number of chosen variables: 7369
Accuracy: 0.5196078431372549


In [16]:
# NOTE(review): this cell repeats the evaluation cell directly above it line for line
# and prints the same numbers; it can probably be deleted.
# Extract info from estimator
pred_c = gl.predict(train_X)
sparsity_mask = gl.sparsity_mask_
w_hat = gl.coef_

# Compute performance metrics
accuracy = (pred_c == train_y).mean()

# Print results: We are hoping to overfit at this point.
print(f"Number variables: {len(sparsity_mask)}")
print(f"Number of chosen variables: {sparsity_mask.sum()}")
print(f"Accuracy: {accuracy}")

Number variables: 11167
Number of chosen variables: 7369
Accuracy: 0.5196078431372549


In [17]:
def cross_validation(X_train, y_train, i, num_val=None, num_folds=5):
    """Split training data into the i-th train/validation fold.

    Folds ``0 .. num_folds - 2`` hold out consecutive windows of ``num_val``
    rows counted from the front. The final fold holds out the last ``num_val``
    rows, so it may overlap the previous window when the data does not divide
    evenly.

    Args:
        X_train: features (numpy array or pandas DataFrame), rows are examples.
        y_train: labels aligned with ``X_train``.
        i: zero-based fold index.
        num_val: rows per validation window; defaults to the module-level NUM_VAL.
        num_folds: total number of folds (default 5).

    Returns:
        ``(X_train, y_train, X_val, y_val)``. The training parts are always
        numpy arrays, for every fold; the validation parts are slices of the inputs.
    """
    if num_val is None:
        num_val = NUM_VAL

    total = len(X_train)

    if i == num_folds - 1:
        start, stop = max(0, total - num_val), total
    else:
        start, stop = i * num_val, (i + 1) * num_val

    X_val = X_train[start:stop]
    y_val = y_train[start:stop]

    # Concatenating on every fold (including the last) keeps the return type
    # of the training part the same regardless of which fold is requested.
    X_rest = np.concatenate((X_train[:start], X_train[stop:]))
    y_rest = np.concatenate((y_train[:start], y_train[stop:]))

    return X_rest, y_rest, X_val, y_val

In [18]:
from sklearn.metrics import accuracy_score, roc_auc_score

def predict(X, true_y, gl):
    """Return the accuracy of estimator ``gl`` on ``(X, true_y)``.

    Args:
        X: feature matrix passed to ``gl.predict``.
        true_y: ground-truth labels, compared element-wise with the predictions.
        gl: an estimator exposing ``predict``.

    Returns:
        The mean of ``gl.predict(X) == true_y``, i.e. the fraction of matches.
    """
    pred_y = gl.predict(X)

    return (pred_y == true_y).mean()

def predict_auc(X, true_y, gl):
    """Return the ROC AUC of estimator ``gl`` on ``(X, true_y)``.

    The score handed to ``roc_auc_score`` is column 1 of ``gl.predict_proba(X)``.
    """
    positive_scores = gl.predict_proba(X)[:, 1]
    return roc_auc_score(true_y, positive_scores)

In [19]:
# Keep handles on the full training split; X_train / y_train are rebound per fold below.
original_X_train = train_X
orig_y_train = train_y


# Fold 0 of the manual cross-validation split.
X_train, y_train, X_val, y_val = cross_validation(original_X_train, orig_y_train, 0)

# NOTE(review): this estimator is never fitted in this cell; the following cell builds
# an identically configured one before calling fit.
gl = LogisticGroupLasso(
    groups=groups,
    group_reg=0.00,
    l1_reg=0,
    scale_reg="inverse_group_size",
    subsampling_scheme=1,
    supress_warning=True,
)

X_train



array([[5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       ...,
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ],
       [5.99755, 0.     , 0.     , ..., 5.05085, 4.23904, 7.5296 ]],
      shape=(81, 11167))

In [20]:
# Unregularised fit (both penalties zero) on the training part of the current fold.
gl = LogisticGroupLasso(
    groups=groups,
    group_reg=0.00,
    l1_reg=0,
    scale_reg="inverse_group_size",
    subsampling_scheme=1,
    supress_warning=True,
)

gl.fit(X_train, y_train)

0,1,2
,groups,"[0, 0, ...]"
,group_reg,0.0
,l1_reg,0
,n_iter,100
,tol,1e-05
,scale_reg,'inverse_group_size'
,subsampling_scheme,1
,fit_intercept,True
,random_state,
,warm_start,False


In [21]:
# predict_auc returns ROC AUC (not accuracy), so label the numbers accordingly.
print("Train AUC: {}".format(predict_auc(X_train, y_train, gl)))
print("Val AUC: {}".format(predict_auc(X_val, y_val, gl)))

Train Acc: 0.5
Val Acc: 0.5


In [22]:
import warnings
# NOTE: this silences every warning for the rest of the session, not just this cell.
warnings.filterwarnings('ignore')


# i / 5 yields the exact grid 0.0, 0.2, ..., 1.0; 0.2 * i would produce
# 0.6000000000000001, which is awkward as a dictionary key.
group_reg = [i / 5 for i in range(6)]
l1_reg = [i / 5 for i in range(6)]
# Keyed by (group_reg, l1_reg); the stored values are mean ROC AUC over the folds.
train_accuracies = dict()
val_accuracies = dict()


for gr in group_reg:
    for lreg in l1_reg:
        print("\nGroup Reg: {}\n L1 Reg: {}".format(gr, lreg))
        cur_val_acc = []
        cur_train_acc = []
        for k in range(5):
            print(k)
            X_train, y_train, X_val, y_val = cross_validation(original_X_train, orig_y_train, k)

            gl = LogisticGroupLasso(
                groups=groups,
                group_reg=gr,
                l1_reg=lreg,
                scale_reg="inverse_group_size",
                subsampling_scheme=1,
                supress_warning=True,
            )

            gl.fit(X_train, y_train)
            cur_train_acc.append(predict_auc(X_train, y_train, gl))
            cur_val_acc.append(predict_auc(X_val, y_val, gl))
        # Each mean is divided by the length of its own list.
        total_train_acc = sum(cur_train_acc) / len(cur_train_acc)
        total_val_acc = sum(cur_val_acc) / len(cur_val_acc)

        train_accuracies[(gr, lreg)] = total_train_acc
        val_accuracies[(gr, lreg)] = total_val_acc
        print("Train AUC: {}".format(total_train_acc))
        print("Val AUC: {}".format(total_val_acc))



0.0: 0.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 0.2
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 0.4
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 0.6000000000000001
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 0.8
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.0: 1.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.2
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.4
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.6000000000000001
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 0.8
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.2: 1.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.2
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.4
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.6000000000000001
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 0.8
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.4: 1.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.6000000000000001: 0.0
0
1
2
3
4
Train ACC: 0.5
Val ACC: 0.5
0.6000000000000001: 0.2
0
1
2
3
4
Train ACC: 0.

In [23]:
# Recombine the train and test splits so test folds can be drawn from all examples.
full_X = pd.concat((original_X_train, test_X))
full_y = pd.concat((orig_y_train, test_y))

# Derive the test-fold size from the data instead of a hard-coded count, so the
# five folds used by cross_validation_test cover every row of this dataset.
NUM_TEST = math.ceil(len(full_X) / 5)

full_X

Unnamed: 0_level_0,AGE-RAGE signaling pathway in diabetic complications.txt_AGER,AGE-RAGE signaling pathway in diabetic complications.txt_AGT,AGE-RAGE signaling pathway in diabetic complications.txt_AGTR1,AGE-RAGE signaling pathway in diabetic complications.txt_AKT3,AGE-RAGE signaling pathway in diabetic complications.txt_BAX,AGE-RAGE signaling pathway in diabetic complications.txt_BCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CASP3,AGE-RAGE signaling pathway in diabetic complications.txt_CCL2,AGE-RAGE signaling pathway in diabetic complications.txt_CCND1,AGE-RAGE signaling pathway in diabetic complications.txt_COL1A1,...,p53 signaling pathway.txt_STEAP3,p53 signaling pathway.txt_TEP1,p53 signaling pathway.txt_THBS1,p53 signaling pathway.txt_TNFRSF10B,p53 signaling pathway.txt_TP53,p53 signaling pathway.txt_TP53AIP1,p53 signaling pathway.txt_TP73,p53 signaling pathway.txt_TSC2,p53 signaling pathway.txt_ZMAT3,p53 signaling pathway.txt_ZNF385A
PATIENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1.36.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.71.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.42.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.28.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.55.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2.2.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.28.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C2.9.BL,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296
C1.100.MO3,5.99755,0.0,0.0,2.73219,6.74621,3.70124,5.07138,-0.00871,-0.00447,0.0,...,1.09594,5.31689,4.76415,6.19625,4.43874,0.0,0.0,5.05085,4.23904,7.5296


In [24]:
def cross_validation_test(i, num_folds=5):
    """Split the module-level ``full_X`` / ``full_y`` into the i-th train/test fold.

    Folds ``0 .. num_folds - 2`` hold out consecutive windows of ``NUM_TEST``
    rows counted from the front; the final fold holds out the last
    ``NUM_TEST`` rows.

    Args:
        i: zero-based fold index.
        num_folds: total number of folds (default 5).

    Returns:
        ``(X_train, y_train, X_test, y_test)``. The training parts are always
        numpy arrays, for every fold; the test parts are slices of
        ``full_X`` / ``full_y``.
    """
    total = len(full_X)

    if i == num_folds - 1:
        start, stop = max(0, total - NUM_TEST), total
    else:
        start, stop = i * NUM_TEST, (i + 1) * NUM_TEST

    X_test = full_X[start:stop]
    y_test = full_y[start:stop]

    # Concatenating on every fold (including the last) keeps the return type
    # of the training part the same regardless of which fold is requested.
    X_train = np.concatenate((full_X[:start], full_X[stop:]))
    y_train = np.concatenate((full_y[:start], full_y[stop:]))

    return X_train, y_train, X_test, y_test

In [25]:
# Fit on everything except the last test fold, then score that fold.
X_train, y_train, X_test, y_test = cross_validation_test(4)
gl = LogisticGroupLasso(
    groups=groups,
    group_reg=0.4,
    l1_reg=0,
    scale_reg="inverse_group_size",
    subsampling_scheme=1,
    supress_warning=True,
)

gl.fit(X_train, y_train)
# Despite its name, `accuracy` holds the ROC AUC returned by predict_auc.
accuracy = predict_auc(X_test, y_test, gl)
accuracy

0.5

In [26]:
# Sorted plain-Python values of the estimator's chosen group ids.
chosen = sorted(group.item() for group in gl.chosen_groups_)

chosen

[]

In [27]:
# Hard-coded reference indices to compare against `chosen`.
# NOTE(review): presumably group ids selected in another run — TODO confirm the source.
a = [
    13, 26, 56, 94, 96, 108, 127, 138,
    147, 150, 159, 180, 184, 199, 205, 216,
    219, 235, 238, 272, 273, 278, 304,
]

# Entries of `a` that also appear in `chosen`, in the order they occur in `a`.
both = list(filter(lambda num: num in chosen, a))

In [28]:
# Number of entries of `a` that are also present in `chosen`.
len(both)

0

In [29]:
# Size of the hard-coded reference list.
len(a)

23

In [30]:
# Number of entries in the fitted estimator's chosen_groups_ attribute.
len(gl.chosen_groups_)

0

In [31]:
#Stopped running here

In [32]:
# Hyper-parameters used for the final evaluation over the test folds.
best_gr = 0.2
best_lreg = 0

# One ROC AUC value per test fold.
accs = []

for i in range(5):
    X_train, y_train, X_test, y_test = cross_validation_test(i)
    # A fresh estimator per fold, configured from best_gr / best_lreg so the
    # values above are the single place to change them.
    gl = LogisticGroupLasso(
        groups=groups,
        group_reg=best_gr,
        l1_reg=best_lreg,
        scale_reg="inverse_group_size",
        subsampling_scheme=1,
        supress_warning=True,
    )

    gl.fit(X_train, y_train)
    accuracy = predict_auc(X_test, y_test, gl)
    # predict_auc returns ROC AUC, so label it as such.
    print("AUC {}: {}".format(i, accuracy))
    accs.append(accuracy)

# Mean AUC across the folds.
print(sum(accs) / len(accs))

ACC 0: 0.5
ACC 1: 0.5
ACC 2: 0.5
ACC 3: 0.5
ACC 4: 0.5
0.5


In [33]:
# Scratch arithmetic. NOTE(review): presumably converts a score measured on 23 examples
# back into a count — TODO confirm, or delete this cell.
0.8260869565217391 * 23

19.0