In [1]:
#Bibliotecas

import torch 
import pandas as pd 
import torch.nn as nn 
from torch.utils.data import random_split, DataLoader, TensorDataset 
import torch.nn.functional as F 
import numpy as np 
import torch.optim as optim 
from torch.optim import Adam

In [2]:
#Importação dos dados

civic = pd.read_csv(r'civic') 

#print(civic.head()) 


In [3]:
civic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3897 entries, 0 to 3896
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gene                          3897 non-null   object 
 1   entrez_id                     3897 non-null   int64  
 2   variant                       3897 non-null   object 
 3   disease                       3874 non-null   object 
 4   doid                          3865 non-null   float64
 5   phenotypes                    508 non-null    object 
 6   drugs                         2449 non-null   object 
 7   drug_interaction_type         619 non-null    object 
 8   evidence_type                 3897 non-null   object 
 9   evidence_direction            3718 non-null   object 
 10  evidence_level                3897 non-null   object 
 11  clinical_significance         3675 non-null   object 
 12  evidence_statement            3897 non-null   object 
 13  cit

In [4]:
#Tranformando as colunas em números

civic = civic[['gene', 'entrez_id', 'variant', 'disease', 'doid', 'phenotypes', 'drugs', 'drug_interaction_type', 'evidence_type', 'evidence_direction', 'evidence_level', 'clinical_significance', 'evidence_statement', 'citation_id', 'source_type', 'citation', 'rating', 'evidence_status', 'evidence_id', 'variant_id', 'gene_id', 'chromosome', 'start', 'stop', 'reference_bases', 'variant_bases', 'representative_transcript', 'ensembl_version', 'reference_build', 'variant_summary', 'variant_origin', 'is_flagged', 'variant_types', 'hgvs_expressions', 'civic_variant_evidence_score', 'allele_registry_id', 'clinvar_ids', 'variant_aliases', 'description']].apply(lambda x: pd.factorize(x)[0])

In [5]:
#Conversão em type

label = {'gene': 0, 'disease': 1, 'clinical_significance': 2, }

#Define as entradas e saidas do DataSet

input = civic.iloc[:, 1:-2]         
print('Input values are: ') 
print(input.head())   
output = civic.loc[:, 'gene']         
print('Output value is: ') 
print(output.head()) 
  

Input values are: 
   entrez_id  variant  disease  doid  phenotypes  drugs  \
0          0        0        0     0          -1     -1   
1          1        1        1     1          -1     -1   
2          2        2        2     2          -1     -1   
3          2        2        2     2          -1     -1   
4          0        0        3     3          -1     -1   

   drug_interaction_type  evidence_type  evidence_direction  evidence_level  \
0                     -1              0                   0               0   
1                     -1              0                   0               0   
2                     -1              0                   0               0   
3                     -1              0                   0               0   
4                     -1              0                   0               0   

   ...  ensembl_version  reference_build  variant_summary  variant_origin  \
0  ...                0                0                0               0 

In [6]:
#Convertendo entradas e saidas em tensores e criando o DatasetTensor

input = torch.Tensor(input.to_numpy())      
print('\nInput format: ', input.shape, input.dtype)      
output = torch.tensor(output.to_numpy())         
print('Output format: ', output.shape, output.dtype)   
data = TensorDataset(input, output) 


Input format:  torch.Size([3897, 36]) torch.float32
Output format:  torch.Size([3897]) torch.int64


In [7]:
#Dividir para treinar, validar e testar conjuntos usando random_split

train_batch_size = 10        
number_rows = len(input)    
test_split = int(number_rows*0.3)  
validate_split = int(number_rows*0.2) 
train_split = number_rows - test_split - validate_split     
train_set, validate_set, test_set = random_split( 
    data, [train_split, validate_split, test_split])

In [8]:
#Criando Dataloader para ler dados em tamanhos de lote e colocá-los na memória

train_loader = DataLoader(train_set, batch_size = train_batch_size, shuffle = True) 
validate_loader = DataLoader(validate_set, batch_size = 1) 
test_loader = DataLoader(test_set, batch_size = 1)

In [9]:
#Define parâmetros do modelo

input_size = list(input.shape)[1]    
learning_rate = 0.01 
output_size = len(label)

In [10]:
#Definindo a rede neural

class Network(nn.Module): 
   def __init__(self, input_size, output_size): 
       super(Network, self).__init__() 
        
       self.layer1 = nn.Linear(input_size, 24) 
       self.layer2 = nn.Linear(24, 24) 
       self.layer3 = nn.Linear(24, output_size) 


   def forward(self, x): 
       x1 = F.relu(self.layer1(x)) 
       x2 = F.relu(self.layer2(x1)) 
       x3 = self.layer3(x2) 
       return x3 

In [11]:
#Instanciar o modelo

model = Network(input_size, output_size)

In [12]:
#Define o dispositivo de execução

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
print("The model will be running on", device, "device\n") 

#Converte parâmetros do modelo e buffers para CPU ou Cuda

model.to(device)

The model will be running on cpu device



Network(
  (layer1): Linear(in_features=36, out_features=24, bias=True)
  (layer2): Linear(in_features=24, out_features=24, bias=True)
  (layer3): Linear(in_features=24, out_features=3, bias=True)
)

In [13]:
#Função para salvar o modelo

def saveModel(): 
    path = "./NetModel.pth" 
    torch.save(model.state_dict(), path)

In [14]:
#Define a função de perda com perda de entropia cruzada de classificação e um otimizador com Adam optimizer

loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

In [15]:
#Função de treinamento

def train(num_epochs): 
    best_accuracy = 0.0 
     
    print("Begin training...") 
    for epoch in range(1, num_epochs+1): 
        running_train_loss = 0.0 
        running_accuracy = 0.0 
        running_vall_loss = 0.0 
        total = 0 

In [18]:
#Loop de treinamento

for data in train_loader:
    
    inputs, outputs = data   
    optimizer.zero_grad()             
    predicted_outputs = model(inputs)    
    train_loss = loss_fn(predicted_outputs, outputs)    
    train_loss.backward()    
    optimizer.step()        
    running_train_loss +=train_loss.item()
    
    

IndexError: Target 7 is out of bounds.