In [1]:
import pandas as pd
import numpy as np

import stellargraph as sg
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import GCNSupervisedGraphClassification
from stellargraph import StellarGraph

from stellargraph import datasets

from sklearn import model_selection
from IPython.display import display, HTML

from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import matplotlib.pyplot as plt

import os

In [2]:
sg.__version__

'1.2.1'

In [3]:
mydf = pd.read_csv("/Users/morganoneka/Box/My Stuff/GraphClassification/GiottoOutput_Binarized/Chronic Pancreatitis/318_1.csv")

In [4]:
mydf['weight'] = [abs(x) for x in mydf['weight']]

In [5]:
mydf

Unnamed: 0,source,target,weight,type
0,CD4+ Epithelial+,FoxP3+ Treg+ Tcell+,2.432959,Depletion
1,CD4+,CD4+ Epithelial+,1.870178,Depletion
2,CD4+ Epithelial+,CD4+ Tcell+,1.4725,Depletion
3,CD4+ APC+,CD4+ Epithelial+,1.28391,Depletion
4,CD4+,CD8+ Tcell+,0.936591,Depletion
5,CD4+,FoxP3+ Treg+ Tcell+,0.83996,Depletion
6,CD4+ Epithelial+,CD8+ Tcell+,0.569074,Depletion
7,CD4+ APC+,CD4+ APC+,2.400274,Enrichment
8,CD4+ Tcell+,CD4+ Tcell+,2.214389,Enrichment
9,CD4+ APC+,CD4+ Tcell+,1.563853,Enrichment


In [6]:
weighted_features = StellarGraph(edges=mydf, edge_type_column='type')

In [7]:
print(weighted_features.info())

StellarGraph: Undirected multigraph
 Nodes: 6, Edges: 14

 Node types:
  default: [6]
    Features: none
    Edge types: default-Depletion->default, default-Enrichment->default

 Edge types:
    default-Enrichment->default: [7]
        Weights: range=[0.244165, 2.40027], mean=1.22261, std=0.900037
        Features: none
    default-Depletion->default: [7]
        Weights: range=[0.569074, 2.43296], mean=1.3436, std=0.645932
        Features: none


In [8]:
file_dirs = ["/Users/morganoneka/Box/My Stuff/GraphClassification/GiottoOutput_Binarized/" + x for x in ['Chronic Pancreatitis', 'IPMN', 'MCN', 'PanIN', 'PDAC']]

In [9]:
file_dirs

['/Users/morganoneka/Box/My Stuff/GraphClassification/GiottoOutput_Binarized/Chronic Pancreatitis',
 '/Users/morganoneka/Box/My Stuff/GraphClassification/GiottoOutput_Binarized/IPMN',
 '/Users/morganoneka/Box/My Stuff/GraphClassification/GiottoOutput_Binarized/MCN',
 '/Users/morganoneka/Box/My Stuff/GraphClassification/GiottoOutput_Binarized/PanIN',
 '/Users/morganoneka/Box/My Stuff/GraphClassification/GiottoOutput_Binarized/PDAC']

In [10]:
idx = [len(os.listdir(x)) for x in file_dirs]

In [11]:
idx

[110, 176, 34, 82, 230]

In [29]:
celltypes = []

for dirname in file_dirs:
    files = os.listdir(dirname)
    csvs = [x for x in files if x.endswith(".csv")]
    for f in csvs:
        mydf = pd.read_csv(dirname + "/" + f)
#         mydf['weight'] = [abs(x) for x in mydf['weight']]
        celltypes = celltypes + (list(set(mydf['source'])))
        celltypes = celltypes + (list(set(mydf['target'])))
        celltypes = list(set(celltypes))

In [30]:
celltypes

['PD.L1+ CD4+',
 'CD8+ PD.L1+ Tcell+',
 'FoxP3+ PD.L1+ Treg+',
 'CD8+ Tcell+',
 'CD8+ FoxP3+ PD.L1+',
 'FoxP3+ PD.L1+ Epithelial+',
 'CD8+ PD.L1+ APC+',
 'CD8+ PD.L1+ Epithelial+',
 'CD8+ FoxP3+ Epithelial+',
 'FoxP3+ PD.L1+ Treg+ Tcell+',
 'PD.L1+ CD4+ Tcell+',
 'CD8+ FoxP3+ PD.L1+ Epithelial+',
 'PD.L1+ CD4+ APC+',
 'FoxP3+ Treg+ APC+',
 'CD8+ FoxP3+ PD.L1+ APC+',
 'CD8+',
 'CD4+ Tcell+',
 'FoxP3+ Treg+ Tcell+',
 'CD8+ FoxP3+ APC+',
 'CD8+ Epithelial+',
 'CD4+ Epithelial+',
 'FoxP3+ Treg+',
 'CD8+ FoxP3+ Tcell+',
 'FoxP3+',
 'FoxP3+ PD.L1+ Treg+ Epithelial+',
 'PD.L1+ CD4+ Epithelial+',
 'CD8+ APC+',
 'CD8+ FoxP3+ PD.L1+ Tcell+',
 'CD8+ FoxP3+',
 'CD8+ PD.L1+',
 'CD4+',
 'CD4+ APC+',
 'FoxP3+ Treg+ Epithelial+',
 'FoxP3+ PD.L1+ Treg+ APC+']

In [31]:
node_data = pd.DataFrame(
            {"celltype": [str(x) for x in range(len(celltypes))]}, index=celltypes
        )

In [32]:
node_data

Unnamed: 0,celltype
PD.L1+ CD4+,0
CD8+ PD.L1+ Tcell+,1
FoxP3+ PD.L1+ Treg+,2
CD8+ Tcell+,3
CD8+ FoxP3+ PD.L1+,4
FoxP3+ PD.L1+ Epithelial+,5
CD8+ PD.L1+ APC+,6
CD8+ PD.L1+ Epithelial+,7
CD8+ FoxP3+ Epithelial+,8
FoxP3+ PD.L1+ Treg+ Tcell+,9


In [33]:
def get_graphs(dirname):
    my_graphs=[]
    files = os.listdir(dirname)
    csvs = [x for x in files if x.endswith(".csv")]
    for f in csvs:
        mydf = pd.read_csv(dirname + "/" + f)
        mydf['weight'] = [abs(x) for x in mydf['weight']]
        # 1 = tumor
        # 2 = immune 
        # 3 = other
#         node_data = pd.DataFrame(
#             {"celltype": ["1", "2", "2", "2", "2", "3"]}, index=["Tumor", "APC", "Helper T-Cell", "CTL", "T-reg", "Other"]
#         )
        
        weighted_features = StellarGraph(nodes=node_data, edges=mydf, edge_type_column='type')
        my_graphs.append(weighted_features)
    return(my_graphs)

In [34]:
cp_graphs = get_graphs(file_dirs[0])
ipmn_graphs = get_graphs(file_dirs[1])
mcn_graphs = get_graphs(file_dirs[2])
panin_graphs = get_graphs(file_dirs[3])
pdac_graphs = get_graphs(file_dirs[4])

In [14]:
# def remove_problematic_graphs(grp):
#     return ([g for g in grp if g.number_of_edges() > 0])

# print(len(cp_graphs))
# print(len(remove_problematic_graphs(cp_graphs)))

# print(len(ipmn_graphs))
# print(len(remove_problematic_graphs(ipmn_graphs)))

# print(len(mcn_graphs))
# print(len(remove_problematic_graphs(mcn_graphs)))

# print(len(panin_graphs))
# print(len(remove_problematic_graphs(panin_graphs)))

# print(len(pdac_graphs))
# print(len(remove_problematic_graphs(pdac_graphs)))

In [51]:
# all_graphs = cp_graphs + ipmn_graphs + mcn_graphs + panin_graphs + pdac_graphs
all_graphs = cp_graphs + pdac_graphs

In [52]:
all_graphs[0].info()

'StellarGraph: Undirected multigraph\n Nodes: 34, Edges: 20\n\n Node types:\n  default: [34]\n    Features: float32 vector, length 1\n    Edge types: default-Depletion->default, default-Enrichment->default\n\n Edge types:\n    default-Depletion->default: [13]\n        Weights: range=[0.115187, 4.38543], mean=2.59378, std=1.40721\n        Features: none\n    default-Enrichment->default: [7]\n        Weights: range=[0.484224, 1.95889], mean=1.12074, std=0.57666\n        Features: none'

In [53]:
# labels = ["CP" for x in range(idx[0])] + ["IPMN" for x in range(idx[1])] + ["MCN" for x in range(idx[2])] + ["PanIN" for x in range(idx[3])] + ["PDAC" for x in range(idx[4])]
# labels = [0 for x in range(len(cp_graphs))] + [1 for x in range(len(ipmn_graphs))] + [2 for x in range(len(mcn_graphs))] + [3 for x in range(len(panin_graphs))] + [4 for x in range(len(pdac_graphs))]

In [54]:
# CP = non-cancer
# IPMN = pre-cancer
# MCN = pre-cancer
# PanIN = pre-cancer
# PDAC = cancer
# labels = [0 for x in range(len(cp_graphs))] + [1 for x in range(len(ipmn_graphs))] + [1 for x in range(len(mcn_graphs))] + [1 for x in range(len(panin_graphs))] + [2 for x in range(len(pdac_graphs))]
labels = [0 for x in range(len(cp_graphs))] + [1 for x in range(len(pdac_graphs))]

In [55]:
graph_labels = pd.Series(labels)
print(graph_labels)

0      0
1      0
2      0
3      0
4      0
      ..
165    1
166    1
167    1
168    1
169    1
Length: 170, dtype: int64


In [56]:
len(all_graphs)

170

In [57]:
generator = PaddedGraphGenerator(graphs=all_graphs)

In [58]:
def create_graph_classification_model(generator):
    gc_model = GCNSupervisedGraphClassification(
        layer_sizes=[64, 64],
        activations=["relu", "relu"],
        generator=generator,
        dropout=0.5,
    )
    x_inp, x_out = gc_model.in_out_tensors()
    predictions = Dense(units=32, activation="relu")(x_out)
    predictions = Dense(units=16, activation="relu")(predictions)
    predictions = Dense(units=1, activation="sigmoid")(predictions)

    # Let's create the Keras model and prepare it for training
    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(optimizer=Adam(0.005), loss=binary_crossentropy, metrics=["acc"])

    return model

In [63]:
epochs = 500  # maximum number of training epochs
folds = 10  # the number of folds for k-fold cross validation (default was 10)
n_repeats = 10  # the number of repeats for repeated k-fold cross validation (default was 5)
es = EarlyStopping(
    monitor="val_loss", min_delta=0, patience=25, restore_best_weights=True
)

In [64]:
def train_fold(model, train_gen, test_gen, es, epochs):
    history = model.fit(
        train_gen, epochs=epochs, validation_data=test_gen, verbose=0, callbacks=[es],
    )
    # calculate performance on the test data and return along with history
    test_metrics = model.evaluate(test_gen, verbose=0)
    test_acc = test_metrics[model.metrics_names.index("acc")]

    return history, test_acc

def get_generators(train_index, test_index, graph_labels, batch_size):
    train_gen = generator.flow(
        train_index, targets=graph_labels.iloc[train_index].values, batch_size=batch_size
    )
    test_gen = generator.flow(
        test_index, targets=graph_labels.iloc[test_index].values, batch_size=batch_size
    )

    return train_gen, test_gen



In [65]:
test_accs = []

stratified_folds = model_selection.RepeatedStratifiedKFold(
    n_splits=folds, n_repeats=n_repeats
).split(graph_labels, graph_labels)

for i, (train_index, test_index) in enumerate(stratified_folds):
    print(f"Training and evaluating on fold {i+1} out of {folds * n_repeats}...")
    train_gen, test_gen = get_generators(
        train_index, test_index, graph_labels, batch_size=30
    )

    model = create_graph_classification_model(generator)

    history, acc = train_fold(model, train_gen, test_gen, es, epochs)

    test_accs.append(acc)

Training and evaluating on fold 1 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and evaluating on fold 2 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and evaluating on fold 3 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and evaluating on fold 4 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and evaluating on fold 5 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and evaluating on fold 6 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and evaluating on fold 7 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and evaluating on fold 8 out of 100...
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Training and eva

In [66]:
test_accs

[0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.64705884,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
 0.7058824,
