## Graph Data Generation (Done on Google Colab)
### Load Packages (Mount Google Drive First)
### Define Data Paths (Paths are based on the Google Colab Drive path, not the local filesystem path)
### Compress Train Data With AAE
### Cluster Train Representation Data With KMeans
### Generate Complete Graph Data
### Map Train Data With Complete Graph Data For Final Training Data (train.tfr for final model training)

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive; the AAE checkpoint
# path used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. Load Packages

In [1]:
"""Install NSL package"""
!pip install --upgrade neural_structured_learning

Collecting neural_structured_learning
[?25l  Downloading https://files.pythonhosted.org/packages/8a/23/179e6b7555000de51d9a317e9e47db84cda0180c941cfbf14775925af611/neural_structured_learning-1.3.1-py2.py3-none-any.whl (120kB)
[K     |██▊                             | 10kB 25.1MB/s eta 0:00:01[K     |█████▍                          | 20kB 31.5MB/s eta 0:00:01[K     |████████▏                       | 30kB 18.2MB/s eta 0:00:01[K     |██████████▉                     | 40kB 21.2MB/s eta 0:00:01[K     |█████████████▋                  | 51kB 24.2MB/s eta 0:00:01[K     |████████████████▎               | 61kB 27.1MB/s eta 0:00:01[K     |███████████████████             | 71kB 18.8MB/s eta 0:00:01[K     |█████████████████████▊          | 81kB 19.4MB/s eta 0:00:01[K     |████████████████████████▌       | 92kB 18.6MB/s eta 0:00:01[K     |███████████████████████████▏    | 102kB 18.8MB/s eta 0:00:01[K     |██████████████████████████████  | 112kB 18.8MB/s eta 0:00:01[K     |██

In [3]:
# Third-party libraries referenced by the cells below as `plt`, `nsl` and `tf`.
import matplotlib.pyplot as plt
import neural_structured_learning as nsl
import tensorflow as tf

# Project-local modules.
# NOTE(review): the recorded run of this cell raised ModuleNotFoundError; presumably the
# `python_files` package has to be importable from the working directory — confirm.
from python_files.graph_data_processing import GraphDataProcess
from python_files.AAE_model import AAE
from python_files.nsl_data_processing import GenerateTrainTestDict, NSLDataFormat
from python_files.Kmeans import KMeans, KMeansModels

ModuleNotFoundError: ignored

### 2. Define Data Paths

In [None]:
'''Class labels and dataset locations.

Label definition: NonDemented - 0, VeryMildDemented - 1, MildDemented - 2, ModerateDemented - 3
(the integer id of a label is its position in `label_list`).
'''
label_list = ['NonDemented', 'VeryMildDemented', 'MildDemented', 'ModerateDemented']

# Dataset roots; every directory path keeps its trailing slash.
root_path = './project_dataset/graph_images/'
train_root_path = root_path + 'train/'
test_root_path = root_path + 'test/'

# Per-label folders and per-label .tfr files for the TRAIN split, in `label_list` order.
train_path_list = [train_root_path + name + '/' for name in label_list]
train_tfr_list = [train_root_path + name + '.tfr' for name in label_list]

# Same layout for the TEST split.
test_path_list = [test_root_path + name + '/' for name in label_list]
test_tfr_list = [test_root_path + name + '.tfr' for name in label_list]

### 3. Compress TRAIN data with AAE

In [None]:
'''Set the AAE hyper-parameters, load the TRAIN images and build the AAE model'''
# Hyper-parameters. `real_mean`, `real_std` and `epochs` are only passed to the
# (currently commented-out) `aae.fit` call.
latent_dim, learning_rate = 128, 0.0001
channels = 3
real_mean, real_std = 0, 1
batch = 1
size = (100, 100)
epochs = 400
checkpoint_path = '/content/drive/MyDrive/AD Expriment II/AAE_model_checkpoints/aae_model_checkpoints'

# Shuffled image dataset built from the per-label TRAIN .tfr files.
image_dataset = GraphDataProcess.parse_tfr_to_image_tensor(
    path_list=train_tfr_list,
    batch_size=batch,
    size=size,
    channels=channels,
    shuffle=True,
)

# Adam (AMSGrad variant) is handed to the AAE as its optimizer.
aae_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, amsgrad=True)
aae = AAE(latent_dim=latent_dim, image_channels=channels, aae_optimizer=aae_optimizer)
# aae.load_weights(checkpoint_path)           # NOTE: checkpoints are too big to uploaded to the git repo, so this line could be commented out in the first run

In [None]:
# Train the AAE on `image_dataset`, passing `checkpoint_path` as `checkpoints_path`.
# NOTE(review): the call is left commented out, presumably so the notebook can be run
# without retraining the model — confirm before re-enabling.
# aae.fit(image_dataset=image_dataset,
#         checkpoints_path=checkpoint_path,
#         batch_size=1,
#         real_mean=real_mean,
#         real_std=real_std,
#         epochs=epochs,)

In [None]:
'''Plot real images vs reconstructed images (this cell: one shuffled batch of 10 real TRAIN images)'''
# Per-label .tfr paths directly under `root_path`, kept as a notebook-level variable.
# NOTE(review): these are built from `root_path`, not `train_root_path` — confirm intended.
tfr_list = [root_path + label_name + '.tfr' for label_name in label_list]

image_dataset = GraphDataProcess.parse_tfr_to_image_tensor(
    path_list=train_tfr_list, batch_size=10, size=(100, 100), channels=3, shuffle=True)

# Draw a single batch; `data` is read via its 'image_tensor' and 'label' entries.
data = iter(image_dataset).get_next()

plt.figure(figsize=(40, 50))
class_ids = data['label'].numpy()
# One row of 10 panels, each titled with the class name of the image.
for column, real_image in enumerate(data['image_tensor'], start=1):
    plt.subplot(1, 10, column)
    plt.imshow(real_image)
    plt.title(label_list[class_ids[column - 1]])

In [None]:
# Plot the AAE reconstruction of every image in the previously drawn batch `data`.
plt.figure(figsize=(40, 50))
batch_labels = data['label'].numpy()
for position, original in enumerate(data['image_tensor']):
    # Encoder and decoder are called on one image at a time, so add a leading axis.
    single_image = original[tf.newaxis, ...]
    # The encoder returns three values; only the third is fed to the decoder.
    _first, _second, latent_code = aae.encoder(single_image)
    # Drop the leading axis again before plotting.
    reconstruction = tf.squeeze(aae.decoder(latent_code), axis=0)
    plt.subplot(1, 10, position + 1)
    # Element-wise absolute value, presumably to avoid negative pixel values in imshow.
    plt.imshow(tf.abs(reconstruction))
    plt.title('AAE-' + label_list[batch_labels[position]])

In [None]:
'''generate TRAIN image represents tfr files'''
# Build one un-shuffled, batch-size-1 image dataset per TRAIN label.
# The paths come from `train_tfr_list` (path-definition cell). The previous code read
# `tfr_list`, which is only defined inside the plotting cell and is built from
# `root_path` rather than `train_root_path`.
# NOTE(review): assumes the per-label TRAIN .tfr files live under `train_root_path` — confirm.
img_data_list = [
    GraphDataProcess.parse_tfr_to_image_tensor(path_list=[tfr_path],
                                               batch_size=1,
                                               size=(100, 100),
                                               channels=3,
                                               shuffle=False)
    for tfr_path in train_tfr_list
]
print(*img_data_list, sep='\n')
# NOTE(review): `tfr_rep_list` is not defined anywhere in the visible notebook, although this
# line and the clustering / graph cells use it; it must be defined before they can run.
# GraphDataProcess.generate_tfr_aae_represent(image_dataset_list=img_data_list, aae_model=aae, tfr_rep_path=tfr_rep_list)         # NOTE: AAE rep tfr files are uploaded to git repo, so could be comment out 

### 4. Cluster AAE Representations

In [None]:
"""Generate the K-means models for each label's representation data"""
batch = 5000
rep_dim = 128
epoch = 50
# One K per entry; the original note says the values apply to both train and test data.
K_value_list = [10] * 4

Kmeans_model_list = KMeansModels.generate_model_list(
    path_list=tfr_rep_list,
    rep_dim=rep_dim,
    K_list=K_value_list,
    epoch=epoch,
    batch=batch,
)

### 5. Generate TRAIN graph data based on clustering

In [None]:
'''Generate the complete graph with clustering (TRAIN data only) and write it to a .tsv file'''
threshold = 0.90

# Arguments collected in one place; the graph file is written under `root_path`.
graph_kwargs = dict(
    tfr_rep_path_list=tfr_rep_list,
    prefix_list=[0, 1, 2, 3],
    model_list=Kmeans_model_list,
    represent_dim=128,
    file_output_path=root_path + 'AD_graph_AAE_KMeans.tsv',
    similarity_threshold=threshold,
)
complete_graph = GraphDataProcess.generate_complete_graph_with_cluster_kmeans(**graph_kwargs)
complete_graph

In [None]:
"""Count the seed nodes of each label in the complete graph"""
seed_count = {}
for node_id in complete_graph.keys():
    # The text before the first '_' is the node prefix.
    prefix = node_id.split('_')[0]
    # Prefixes containing 'c' are skipped (presumably cluster nodes — confirm).
    if 'c' in prefix:
        continue
    label_id = int(prefix)
    # Only the four known label ids are counted.
    if label_id not in (0, 1, 2, 3):
        continue
    seed_count[label_id] = seed_count.get(label_id, 0) + 1
print(seed_count)

### 6. Generate NSL-compatible training data (train.tfr)

In [None]:
'''Generate the TRAIN dictionary with shuffled MRI images'''
# `path_list` was never defined in this notebook; the per-label TRAIN folders from the
# path-definition cell are `train_path_list`, so that is what is passed here.
# With train_percentage=1 the second returned dict is presumably empty; it is bound to `_`.
train_examples, _ = GenerateTrainTestDict.get_train_test_dict(path_list=train_path_list, train_percentage=1)
print('train examples: ', len(train_examples.keys()))

In [None]:
'''generate "train_data.tfr" by merging TRAIN examples with the complete AAE graph'''
# The commented-out code below builds an NSLDataFormat from `train_examples`, the second
# dict (`_`) and the graph .tsv file, keeping at most 5 neighbours, then writes the result
# to `train_tfr_path`.
# NOTE(review): `train_tfr_path` is not defined anywhere in the visible notebook; it must be
# defined before this cell (or the parsing cell that follows) can run.
# nsl_train= NSLDataFormat(seed_dict_examples= train_examples,
#                          rem_dict_examples= _, 
#                          graph= nsl.tools.read_tsv_graph(f'{root_path}AD_graph_AAE_KMeans.tsv'), 
#                          max_nbrs= 5)
# nsl_train.generate_node_nbrs_tfr(output_file_path=train_tfr_path)              #NOTE: train_data.tfr is uploaded to git repo, so this cell could be commented out

In [None]:
'''parse train_graph_examples with clusters'''
parsed_image_size = (100, 100)
parsed_image_channel = 3
batch = 1
max_seed_node_neighbours = 5
# NOTE(review): `train_tfr_path` is not defined anywhere in the visible notebook; define it
# (the path of the generated train .tfr file) before running this cell.
train_image_dataset = NSLDataFormat.parse_tfr_to_dataset(file_path_list=[train_tfr_path],
                                                   batch_size=batch,
                                                   max_neighbor_number=max_seed_node_neighbours,
                                                   image_size=parsed_image_size,
                                                   image_channels=parsed_image_channel,
                                                   shuffle=True)
# One batch: element 0 is the feature dict, element 1 the label tensor.
data_sample = iter(train_image_dataset).get_next()

# plot parsed train_graph examples with clusters
sample = data_sample[0]
# Reduce over the class axis (last), not the batch axis: with a leading batch axis of
# size 1, axis=0 compares a single row with itself and always yields class 0.
# Assumes labels are one-hot with shape (batch, num_classes) — TODO confirm.
label = tf.argmax(data_sample[1], axis=-1)
class_name = label_list[label.numpy()[0]]

# Only features whose key ends in '_tensor' are plotted; one panel for the seed image
# plus one per neighbour slot.
image_items = [(key, value) for key, value in sample.items() if key.split('_')[-1] == 'tensor']
plt.figure(figsize=(30, 10))
for img_index, (key, value) in enumerate(image_items):
    plt.subplot(1, max_seed_node_neighbours + 1, img_index + 1)
    # Drop the leading batch axis so imshow receives a single image.
    img_tensor = tf.reshape(value, shape=value.shape[1:])
    plt.imshow(img_tensor)
    plt.title(f'{class_name}:{key}')