In [1]:
#Bibliotecas

import torch 
import pandas as pd 
import torch.nn as nn 
from torch.utils.data import random_split, DataLoader, TensorDataset 
import torch.nn.functional as F 
import numpy as np 
import torch.optim as optim 
from torch.optim import Adam
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

In [2]:
#Importação dos dados

civic = pd.read_csv(r'civic')


In [3]:
y = civic[['evidence_type']]

X = civic.drop(['entrez_id', 'evidence_type', 'gene_id', 'evidence_level', 'citation_id', 'source_type', 'citation', 'evidence_id', 'representative_transcript',
                    'ensembl_version', 'allele_registry_id'], axis=1).apply(lambda x: pd.factorize(x)[0])

In [4]:
#Variaveis de treino e teste

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)

(y_train.value_counts()/y_train.shape[0])*100

evidence_type
Predictive       62.873563
Predisposing     16.513410
Prognostic       13.103448
Diagnostic        4.980843
Oncogenic         1.724138
Functional        0.804598
dtype: float64

In [5]:
y_train.value_counts()

evidence_type
Predictive       1641
Predisposing      431
Prognostic        342
Diagnostic        130
Oncogenic          45
Functional         21
dtype: int64

In [6]:
(y_test.value_counts()/y_test.shape[0])*100

evidence_type
Predictive       62.937063
Predisposing     16.472416
Prognostic       13.131313
Diagnostic        4.972805
Oncogenic         1.709402
Functional        0.777001
dtype: float64

In [7]:
#Balanceando os dados

rus = RandomUnderSampler(random_state=42)

X_res, y_res = rus.fit_resample(X_train, y_train)

y_res.value_counts()

evidence_type
Prognostic       21
Predisposing     21
Predictive       21
Oncogenic        21
Functional       21
Diagnostic       21
dtype: int64

In [8]:
#Transformando as colunas em números

labels = {'Predictive':0, 'Predisposing':1, 'Prognostic':2, 'Diagnostic':3, 'Oncogenic':4, 'Functional':5}

In [9]:
#Cria a coluna evidence_type 

y['evidence_type_num'] = y['evidence_type']

#Convertendos os valores em números

y.evidence_type_num = [labels[item] for item in y.evidence_type_num]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['evidence_type_num'] = y['evidence_type']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [10]:
#Define as entradas e saidas do DataSet

input = civic.iloc[:, 1:-2]         
print('Input values are: ') 
print(input.head())   
output = y.loc[:, 'evidence_type_num']         
print('Output value is: ') 
print(output.head()) 
  

Input values are: 
   entrez_id variant                         disease    doid phenotypes drugs  \
0       3717   V617F               Lymphoid Leukemia  1037.0        NaN   NaN   
1       5156   D842V  Gastrointestinal Stromal Tumor  9253.0        NaN   NaN   
2       1788    R882          Acute Myeloid Leukemia  9119.0        NaN   NaN   
3       1788    R882          Acute Myeloid Leukemia  9119.0        NaN   NaN   
4       3717   V617F        Chronic Myeloid Leukemia  8552.0        NaN   NaN   

  drug_interaction_type evidence_type evidence_direction evidence_level  ...  \
0                   NaN    Diagnostic           Supports              B  ...   
1                   NaN    Diagnostic           Supports              B  ...   
2                   NaN    Diagnostic           Supports              B  ...   
3                   NaN    Diagnostic           Supports              B  ...   
4                   NaN    Diagnostic           Supports              B  ...   

  ensembl_ver

In [11]:
#Convertendo entradas e saidas em tensores e criando o DatasetTensor

input = torch.Tensor(X.to_numpy())      
print('\nInput format: ', input.shape, input.dtype)      
output = torch.tensor(output.to_numpy())         
print('Output format: ', output.shape, output.dtype)   
data = TensorDataset(input, output) 


Input format:  torch.Size([3897, 28]) torch.float32
Output format:  torch.Size([3897]) torch.int64


In [12]:
#Dividir para treinar, validar e testar conjuntos usando random_split

train_batch_size = 10        
number_rows = len(input)    
test_split = int(number_rows*0.3)  
validate_split = int(number_rows*0.2) 
train_split = number_rows - test_split - validate_split     
train_set, validate_set, test_set = random_split( 
    data, [train_split, validate_split, test_split])

In [13]:
#Criando Dataloader para ler dados em tamanhos de lote e colocá-los na memória

train_loader = DataLoader(train_set, batch_size = train_batch_size, shuffle = True) 
validate_loader = DataLoader(validate_set, batch_size = 1) 
test_loader = DataLoader(test_set, batch_size = 1)

In [14]:
#Define parâmetros do modelo

input_size = list(input.shape)[1]    
learning_rate = 0.01 
output_size = len(labels)

In [15]:
#Definindo a rede neural

class Network(nn.Module): 
   def __init__(self, input_size, output_size): 
       super(Network, self).__init__() 
        
       self.layer1 = nn.Linear(input_size, 24) 
       self.layer2 = nn.Linear(24, 24) 
       self.layer3 = nn.Linear(24, output_size) 


   def forward(self, x): 
       x1 = F.relu(self.layer1(x)) 
       x2 = F.relu(self.layer2(x1)) 
       x3 = self.layer3(x2) 
       return x3 

In [16]:
#Instanciar o modelo

model = Network(input_size, output_size)

In [17]:
#Define o dispositivo de execução

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
print("The model will be running on", device, "device\n") 

#Converte parâmetros do modelo e buffers para CPU ou Cuda

model.to(device)

The model will be running on cpu device



Network(
  (layer1): Linear(in_features=28, out_features=24, bias=True)
  (layer2): Linear(in_features=24, out_features=24, bias=True)
  (layer3): Linear(in_features=24, out_features=6, bias=True)
)

In [18]:
#Função para salvar o modelo

def saveModel(): 
    path = "./Resultado-do-Civic-Modelo.pth" 
    torch.save(model.state_dict(), path)

In [19]:
#Define a função de perda com perda de entropia cruzada de classificação e um otimizador com Adam optimizer

loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

In [24]:
#Função de treinamento

def train(num_epochs): 
    best_accuracy = 0.0 
     
    print("Begin training...") 
    for epoch in range(1, num_epochs+1): 
        running_train_loss = 0.0 
        running_accuracy = 0.0 
        running_vall_loss = 0.0 
        total = 0 
        

In [26]:
#Loop de treinamento

for data in train_loader: 
        
            inputs, outputs = data   
            optimizer.zero_grad()            
            predicted_outputs = model(inputs)   
            train_loss = loss_fn(predicted_outputs, outputs)   
            train_loss.backward()    
            optimizer.step()
            running_train_loss = 0.0 
            running_train_loss +=train_loss.item()  

In [32]:
#Calcular o valor da perda de treinamento

train_loss_value = running_train_loss/len(train_loader)

In [71]:
#Validação do loop

with torch.no_grad(): 
    model.eval() 
    for data in validate_loader: 
        inputs, outputs = data 
        predicted_outputs = model(inputs) 
        val_loss = loss_fn(predicted_outputs, outputs)

#Previsão com o maior rotulo

_, predicted = torch.max(predicted_outputs, 1)
running_vall_loss = 0.0 
running_vall_loss += val_loss.item()
total = 0
total += outputs.size(0)
running_accuracy = 0.0
running_accuracy += (predicted == outputs).sum().item() 

#Validação do valor de perdaa

val_loss_value = running_vall_loss/len(validate_loader) 

#Calculo da acuracia com o numero correto de previsões

accuracy = (100 * running_accuracy / total)

#Salva o modelo caso a acuracia seja a melhor

best_accuracy = 0.0
if accuracy > best_accuracy: 
    saveModel() 
    best_accuracy = accuracy
    
#Mostra as estatísticas da época

#print('Completed training batch', epoch, 'Training Loss is: %.4f' %train_loss_value, 'Validation Loss is: %.4f' %val_loss_value, 'Accuracy is %d %%' % (accuracy))

In [66]:
#Função para testar o modelo

def test():
    #Carrega o modelo que salvamos no final do loop de treinamento
    model = Network(input_size, output_size) 
    path = "NetModel.pth" 
    model.load_state_dict(torch.load(path)) 
     
    running_accuracy = 0 
    total = 0 
    
with torch.no_grad(): 
    for data in test_loader: 
        inputs, outputs = data 
        outputs = outputs.to(torch.float32) 
        predicted_outputs = model(inputs) 
        _, predicted = torch.max(predicted_outputs, 1) 
        total += outputs.size(0) 
        running_accuracy += (predicted == outputs).sum().item()
        
print('Accuracy of the model based on the test set of', test_split ,'inputs is: %d %%' % (100 * running_accuracy / total))

Accuracy of the model based on the test set of 1169 inputs is: 76 %
