In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import pandas as pd
import numpy as np
import random
import collections
import csv
from sklearn.model_selection import train_test_split, KFold
from torch.optim import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_ancestors_dict(path_to_ancestors, path_to_weights=None):
    """Build, for every MeSH, its ancestor list and a sampling distribution over it.

    Each ancestor gets an unnormalised score of ``1 / (count + 1)``, where
    ``count`` is its ``TOTAL_PMID_MESH`` value in the weights file (0 when the
    ancestor is absent from that file, or when no weights file is given).
    Scores are normalised to sum to 1 per MeSH, so ancestors with a low count
    get a higher probability and the distribution is uniform without weights.

    Args:
        path_to_ancestors (str): path to a CSV file with the columns ``MESH``
            and ``MESH_ANCESTOR``. The ancestor list is taken verbatim from the
            file: a MeSH only appears among its own ancestors if the file
            lists it there.
        path_to_weights (str, optional): path to a CSV file with the columns
            ``MESH`` and ``TOTAL_PMID_MESH``. Defaults to None (uniform).

    Returns:
        collections.defaultdict: ``d[mesh]["ancestors"]`` is the list of
        ancestors and ``d[mesh]["proba"]`` the matching list of probabilities.
    """
    ancestors_dict = collections.defaultdict(dict)

    # From ancestors csv file to {mesh: [ancestor, ...]}
    ancestors_df = pd.read_csv(path_to_ancestors)
    pre_ancestors_dict = (
        ancestors_df.groupby("MESH")["MESH_ANCESTOR"].apply(list).to_dict()
    )

    # Create weight dict {mesh: count}. Aggregating with sum() avoids calling
    # int() on a whole Series and tolerates a MeSH listed on several rows.
    weights = {}
    if path_to_weights:
        counts = pd.read_csv(path_to_weights).groupby("MESH")["TOTAL_PMID_MESH"].sum()
        weights = {mesh: int(count) for mesh, count in counts.items()}

    # Browse ancestors and turn inverse counts into probabilities
    for mesh, mesh_ancestors in pre_ancestors_dict.items():
        inverse_weights = [1 / (weights.get(ancestor, 0) + 1) for ancestor in mesh_ancestors]
        total = sum(inverse_weights)  # computed once per MeSH, not once per ancestor
        ancestors_dict[mesh]["ancestors"] = mesh_ancestors
        ancestors_dict[mesh]["proba"] = [weight / total for weight in inverse_weights]

    return ancestors_dict

In [3]:
# Build the ancestor lookup with count-based sampling probabilities,
# then inspect a single MeSH entry as a sanity check.
ancestors_dict = create_ancestors_dict(
    path_to_ancestors="data/mesh_ancestors.csv",
    path_to_weights="data/mesh_pmids_count.csv",
)
print(ancestors_dict["D010084"])

{'ancestors': ['D010084', 'D004734', 'D008660', 'D055598'], 'proba': [0.5175741161665298, 0.3766489956427783, 0.06788972626217674, 0.037887161928515004]}


In [None]:
def create_samples(n, data, w, ancestors,
                   context_path="data/context.csv", target_path="data/target.csv"):
    """Generate (context, target) samples and write them to two CSV files.

    For each sample a PMID is drawn at random (with replacement) among the
    PMIDs annotated with more than ``w`` MeSH; ``w + 1`` distinct annotations
    of that PMID are drawn, one becoming the target and the other ``w`` the
    context. Every drawn MeSH is then replaced by one of its ancestors, sampled
    according to ``ancestors[mesh]["proba"]``.

    Args:
        n (int): number of samples to generate. Capped (with a printed
            warning) at the number of distinct PMIDs in ``data``.
        data (str): path to a CSV file with the columns ``PMID`` and ``MESH``.
        w (int): window size, i.e. number of context MeSH per sample.
        ancestors (dict): ``{mesh: {"ancestors": [...], "proba": [...]}}``.
        context_path (str, optional): output CSV, one row of ``w`` MeSH per sample.
        target_path (str, optional): output CSV, one MeSH per row, aligned
            with ``context_path``.

    Raises:
        ValueError: if ``w`` is negative, or if samples are requested but no
            PMID has more than ``w`` MeSH (sampling could never succeed).
        KeyError: if a drawn MeSH has no usable entry in ``ancestors``.
    """
    if w < 0:
        raise ValueError("w must be non-negative")

    df = pd.read_csv(data)
    # Group once instead of filtering the whole frame for every drawn PMID
    meshes_by_pmid = df.groupby("PMID")["MESH"].apply(list)

    if n > len(meshes_by_pmid):
        print("Warning : n > number of pmids in data")
        n = len(meshes_by_pmid)

    # A sample needs 1 target + w context MeSH; drawing only among eligible
    # PMIDs is equivalent to drawing among all PMIDs and rejecting short ones,
    # but cannot spin forever when none qualifies.
    eligible = [meshes for meshes in meshes_by_pmid if len(meshes) > w]
    if n > 0 and not eligible:
        raise ValueError(f"No PMID in {data} is annotated with more than {w} MeSH")

    def draw_ancestor(mesh):
        # Replace a MeSH by one of its ancestors, following the stored probabilities
        entry = ancestors[mesh]
        return random.choices(entry["ancestors"], weights=entry["proba"], k=1)[0]

    context = []
    target = []
    for _ in range(n):
        picked = random.sample(random.choice(eligible), w + 1)
        target.append([draw_ancestor(picked[0])])  # one-element row for csv.writer
        context.append([draw_ancestor(mesh) for mesh in picked[1:]])

    # newline="" lets the csv module control line endings (no blank rows on Windows)
    with open(context_path, "w", newline="") as f:
        csv.writer(f).writerows(context)
    with open(target_path, "w", newline="") as f:
        csv.writer(f).writerows(target)

In [None]:
# Generate 1000 samples with a context window of 6 MeSH
create_samples(
    n=1000,
    data="data/sample_pmid_mesh.csv",
    w=6,
    ancestors=ancestors_dict,
)

In [None]:
class MyDataset(Dataset):
    """Dataset returning vocabulary-sized count vectors for context and target."""

    def __init__(self, context, target, vocab):
        """
        Args:
            context (list of list): context MeSH of each sample.
            target (list): target of each sample; each entry is indexed with
                ``[0]`` to obtain the target MeSH (e.g. a one-element list).
            vocab (dict {mesh: index}): position of each MeSH in the
                vocabulary-sized vectors.
        """
        self.context = context
        self.target = target
        self.vocab = vocab

    def __len__(self):
        # Number of samples (not the vocabulary size)
        return len(self.target)

    def __getitem__(self, idx):
        """Return ``(context_vector, target_vector)`` for sample ``idx``.

        ``target_vector`` is one-hot; ``context_vector`` counts how many times
        each MeSH occurs in the context. Both are float numpy arrays of length
        ``len(vocab)``.

        Raises:
            IndexError: if ``idx`` is out of range. Raising (rather than
                returning None) is what lets ``for item in dataset`` stop.
        """
        size = len(self)
        if not -size <= idx < size:
            raise IndexError(f"index {idx} is out of bounds for dataset of size {size}")

        vector_target = np.zeros(len(self.vocab))
        vector_target[self.vocab[self.target[idx][0]]] = 1

        vector_context = np.zeros(len(self.vocab))
        for mesh in self.context[idx]:
            vector_context[self.vocab[mesh]] += 1

        return vector_context, vector_target
        

In [21]:

class MeSH2VecDataset(Dataset):
    """
    Dataset of (context, target) MeSH samples encoded as vocabulary ids.

    Args:
        f_input (str): path to the input file. A comma separated csv file where each line corresponds to a context.
        f_target (str): path to the target file. A csv file where each line corresponds to a target MeSH (first column). Must have the same number of lines as the context file.
        f_vocab (str): path to the vocabulary file. A csv file, one MeSH per line (first column). The line number gives the id of the MeSH in the vocab.
        f_labels (str): path to the labels file. A two columns csv file, with the MeSH identifier in the first column and the corresponding label in the second.

    Items: ``(x, y)`` where ``x`` is an int32 tensor of length w holding the context ids
    and ``y`` is a scalar (0-dim) long tensor holding the id of the target MeSH.
    None of the files is expected to have a header row.
    """

    def __init__(self, f_input, f_target, f_vocab, f_labels):
        self.input = self.read(f_input).tolist()
        # Take the first column explicitly: squeeze() would collapse a
        # single-row file into a bare string instead of a one-element list.
        self.target = self.read(f_target)[:, 0].tolist()
        # read_vocab() checks self.input / self.target, so they must be set first
        self.vocab = self.read_vocab(f_vocab)
        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        self.vocab_size = len(self.vocab)
        self.labels = self.mesh2label_dict(f_labels)
        self.n = len(self.input)
        self.w = len(self.input[0])

    def __len__(self):
        return self.n

    def __getitem__(self, index):
        """Return the context ids and the target id of sample ``index``.

        Raises KeyError if a MeSH of the sample is missing from the vocabulary.
        """
        x = torch.tensor([self.vocab[m] for m in self.input[index]], dtype=torch.int32)
        y = torch.tensor(self.vocab[self.target[index]], dtype=torch.long)
        return x, y

    def read(self, path):
        """Read a header-less csv file as a 2-D numpy array of strings."""
        f = pd.read_csv(path, header=None, dtype=str)
        return f.values

    def read_vocab(self, f_vocab):
        """Return ``{mesh: id}`` from the vocabulary file, the id being the line number.

        Prints a warning when a MeSH used in the contexts or targets is missing
        from the vocabulary (such samples fail later in ``__getitem__``).
        """
        v = self.read(f_vocab)[:, 0].tolist()
        known = set(v)  # set membership instead of scanning the list for every MeSH

        # Check that all input are in vocab:
        if any(m not in known for row in self.input for m in row):
            print("[WARNING] Some MeSH in input are not in the vocabulary")

        if any(m not in known for m in self.target):
            print("[WARNING] Some MeSH in target are not in the vocabulary")

        return {mesh: i for i, mesh in enumerate(v)}

    def mesh2label_dict(self, f_labels):
        """Return ``{mesh: label}`` from the first two columns of the labels file."""
        labels = self.read(f_labels)
        return dict(zip(labels[:, 0].tolist(), labels[:, 1].tolist()))

    def tensor2labels(self, t):
        """Map an iterable of vocabulary ids to labels ("No label Found" when unknown)."""
        return [self.labels.get(self.inverse_vocab.get(int(i), "NaN"), "No label Found") for i in t]


In [22]:
# Load the samples, the vocabulary and the human-readable labels
dataset = MeSH2VecDataset(
    f_input="data/weighted-samping/context.csv",
    f_target="data/weighted-samping/target.csv",
    f_vocab="data/mesh_vocab.csv",
    f_labels="data/mesh_labels.csv",
)

In [26]:
# Hold out 10% of the samples as a test set
train, test = train_test_split(dataset, train_size=0.9, test_size=0.1, random_state=42)
print(f"Train : {len(train)}")
print(f"Test : {len(test)}")

# Inspect the batches produced for each of the 5 folds. Both loaders wrap the
# same `train` split; the samplers restrict them to the fold's index subsets.
kfold = KFold(5, shuffle=True, random_state=42)
for n, (train_i, test_i) in enumerate(kfold.split(train)):
    print(f"Folder {n}")
    print(f"\tLen train : {len(train_i)}")
    print(f"\tLen test : {len(test_i)}")

    train_sampler = SubsetRandomSampler(train_i)
    train_loader = DataLoader(train, batch_size=100, sampler=train_sampler)
    for i, j in enumerate(train_loader):
        print(f"\tTrain DataLoader {i}")
        print(f"\t\t[{j[0].shape}  {j[1].shape}]")

    test_sampler = SubsetRandomSampler(test_i)
    test_loader = DataLoader(train, batch_size=100, sampler=test_sampler)
    for i, j in enumerate(test_loader):
        print(f"\tTest DataLoader {i}")
        print(f"\t\t[{j[0].shape}  {j[1].shape}]")

Train : 900
Test : 100
Folder 0
	Len train : 720
	Len test : 180
	Train DataLoader 0
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 1
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 2
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 3
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 4
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 5
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 6
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 7
		[torch.Size([20, 6])  torch.Size([20])]
	Test DataLoader 0
		[torch.Size([100, 6])  torch.Size([100])]
	Test DataLoader 1
		[torch.Size([80, 6])  torch.Size([80])]
Folder 1
	Len train : 720
	Len test : 180
	Train DataLoader 0
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 1
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 2
		[torch.Size([100, 6])  torch.Size([100])]
	Train DataLoader 3
		[torch.Size([100, 6])  torch.Size([100])]
	Tr

In [42]:
class NeuralNetwork(nn.Module):
    """CBOW-like model: the mean of the context embeddings is projected onto vocabulary scores."""

    def __init__(self, n, v):
        """
        Args:
            n (int): embedding size.
            v (int): vocabulary size.
        """
        super().__init__()
        self.n = n
        self.v = v
        # max_norm is a float threshold; 1.0 is the value the former `True` coerced to
        self.hidden1 = nn.Embedding(self.v, self.n, max_norm=1.0)
        self.hidden2 = nn.Linear(self.n, self.v, bias=False)

    def forward(self, input):
        """Return one score per vocabulary entry.

        Args:
            input: integer tensor of vocabulary ids, either a batch of contexts
                ``(batch, w)`` or a single context ``(w,)``.

        Returns:
            Tensor of shape ``(batch, v)``, or ``(v,)`` for a single context.
        """
        x = self.hidden1(input)
        # Average over the context axis, which is the second-to-last one of the
        # embedded tensor whether or not a batch axis is present.
        x = torch.mean(x, dim=-2)
        x = self.hidden2(x)
        return x
        

In [43]:
# 50-dimensional embeddings over the whole dataset vocabulary
vocab_size = dataset.vocab_size
model = NeuralNetwork(n=50, v=vocab_size)


In [45]:
# Smoke test: push one batch through the untrained model.
# The loader is built here so that the cell runs on a fresh kernel; the batch
# size of 100 matches the recorded output shape of the next cell.
dataloader = DataLoader(dataset, batch_size=100)
# Named sample_* so the builtin input() is not shadowed at notebook level
sample_input, sample_target = next(iter(dataloader))
x = model(sample_input)

In [46]:
# Expect (batch size, vocabulary size): one score per vocabulary entry for each sample
print(x.shape)

torch.Size([100, 12761])


In [44]:
def training_loop(model, dataset_loader, validation_loader, optimizer, loss_fn, nepochs, device):
    """Train ``model`` for ``nepochs`` epochs and track the loss after each one.

    Args:
        model (nn.Module): model to train; it is moved to ``device`` in place.
        dataset_loader (iterable): training batches; ``batch[0]`` is the model
            input and ``batch[1]`` the target passed to ``loss_fn``.
        validation_loader (iterable or None): validation batches in the same
            format. When None, no validation pass is run.
        optimizer (torch.optim.Optimizer): optimizer over the model parameters.
        loss_fn (callable): ``loss_fn(model_output, target)`` returning a scalar tensor.
        nepochs (int): number of passes over ``dataset_loader``.
        device (torch.device): device on which the computation runs.

    Returns:
        tuple(list of float, list of float): ``(train_errors, validation_errors)``,
        the mean batch loss of each epoch (nan for an epoch without batches).
        ``validation_errors`` is empty when ``validation_loader`` is None.
    """
    train_errors = []
    validation_errors = []

    # The batches are moved to `device` below, so the model must live there too
    model.to(device)

    def mean_or_nan(values):
        return sum(values) / len(values) if values else float("nan")

    for i in range(nepochs):
        print("Start epoch " + str(i + 1))

        # training mode
        model.train()
        batch_losses = []

        for batch in dataset_loader:
            inputs = batch[0].to(device)
            targets = batch[1].to(device)

            # Reinitialize gradients
            model.zero_grad()

            # Forward pass and loss
            loss = loss_fn(model(inputs), targets)
            batch_losses.append(loss.item())

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters
            optimizer.step()

        train_errors.append(mean_or_nan(batch_losses))
        summary = "\ttrain loss: " + str(train_errors[-1])

        if validation_loader is not None:
            # Evaluation mode, no gradient tracking: the weights are not updated
            model.eval()
            validation_losses = []
            with torch.no_grad():
                for batch in validation_loader:
                    inputs = batch[0].to(device)
                    targets = batch[1].to(device)
                    validation_losses.append(loss_fn(model(inputs), targets).item())
            validation_errors.append(mean_or_nan(validation_losses))
            summary += " - validation loss: " + str(validation_errors[-1])

        print(summary)

    return train_errors, validation_errors



In [45]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior: seeds ``random``, ``numpy`` and ``torch``
    (CPU generator, plus every CUDA device when CUDA is available).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Seed torch unconditionally: CPU-only runs draw from the torch generator too
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [46]:
set_seed(1024)

# Hold out 10% of the samples as a test set
train, test = train_test_split(dataset, train_size=0.9, test_size=0.1, random_state=42)

# 5-fold split of the training part; only the first fold is used here
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
train_index, validation_index = next(iter(kfold.split(train)))

# Both loaders wrap `train`; the samplers restrict them to the fold's indices
train_sampler = SubsetRandomSampler(train_index)
validation_sampler = SubsetRandomSampler(validation_index)
dataset_loader = DataLoader(train, batch_size=128, sampler=train_sampler)
validation_loader = DataLoader(train, batch_size=128, sampler=validation_sampler)

# lr=1e-2 is higher than AdamW's default of 1e-3; eps=1e-8 is the default value
optimizer = AdamW(model.parameters(), lr=1e-2, eps=1e-8)

loss_fn = nn.CrossEntropyLoss()
nepochs = 10

# Use the GPU when one is available, the CPU otherwise
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [None]:
# Launch training; the trailing semicolon keeps the notebook from echoing any return value
training_loop(
    model=model,
    dataset_loader=dataset_loader,
    validation_loader=validation_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    nepochs=nepochs,
    device=DEVICE,
);