In [1]:
import torch
import torchvision
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
dataset = torchvision.datasets.ImageFolder("image_dataset/")
df = pd.read_csv('final_dataset.csv',index_col=0)
df = df[df['species_name']!= 'Agabus sturmii']
df['species_name'].nunique()

1049

In [2]:
def one_hot_encoding(nucleotide: str, seq_len=658) -> np.ndarray:
    # Cutting the sequence if it is longer than a pre-defined value seq_len
    if len(nucleotide) > seq_len:
        nucleotide = nucleotide[:seq_len]
    # Encoding
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    sequence = [mapping[i] if i in mapping else 4 for i in nucleotide]
    encoded_sequence = np.eye(5)[sequence]
    # Padding if the sequence is smaller than a pre-defined value seq_len
    if len(encoded_sequence) < seq_len:
        padding = np.zeros((seq_len - len(encoded_sequence), 5))
        encoded_sequence = np.concatenate((encoded_sequence, padding))
    
    return encoded_sequence

In [3]:
nucleotides = df[['nucleotide','species_name','genus_name']]
colonna_dna = df.loc[:,"nucleotide"]
nucleotides.loc[:,'nucleotide'] = colonna_dna.apply(one_hot_encoding)


In [4]:
print(nucleotides)

                                            nucleotide  \
0    [[0.0, 0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0....   
1    [[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0....   
2    [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0....   
3    [[0.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0....   
4    [[0.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0....   
..                                                 ...   
418  [[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0....   
419  [[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0....   
420  [[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0....   
421  [[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0....   
423  [[0.0, 0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 0....   

               species_name  genus_name  
0         Leucania cruegeri    Leucania  
1             Lestica alata     Lestica  
2    Liotryphon punctulatus  Liotryphon  
3        Lesmone formularis     Lesmone  
4        Lesmone formularis     Lesmone  
..                      ...         ...  
418      

In [5]:
def data_split(df, test_ratio):
    test = []
    genus_count = df.groupby('genus_name')['species_name'].nunique()
    
    for genus_name in genus_count.index:
        number_undescribed_species = genus_count[genus_name]//3
        species = list(df.loc[df['genus_name']==genus_name]['species_name'].unique())
        undescribed_species = random.sample(species,number_undescribed_species)
        test = test+undescribed_species

    df_remaining = df.loc[~df.species_name.isin(test)]
    df_undescribed = df.loc[df.species_name.isin(test)]
    
    y = df_remaining['species_name']
    X = df_remaining.drop(columns=['species_name'])
    
    y_undescribed = df_undescribed['species_name']
    X_undescribed = df_undescribed.drop(columns=['species_name'])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)
    
    y_test = pd.concat([y_test,y_undescribed])
    X_test = pd.concat([X_test,X_undescribed])

    return X_train, X_test, y_train, y_test

In [6]:
import random
from sklearn.model_selection import train_test_split

random.seed(42)
X_train_1, X_test, y_train_1, y_test = data_split(nucleotides,0.3)

train_data = X_train_1
train_data['species_name'] = y_train_1

X_train, X_validation, y_train, y_validation = data_split(train_data,0.2)
X_train = X_train.drop(columns=['genus_name'])
X_validation= X_validation.drop(columns=['genus_name'])
X_test = X_test.drop(columns=['genus_name'])

In [7]:
y_train = y_train.apply(lambda x: dataset.class_to_idx[x.replace(' ','_')])
y_test = y_test.apply(lambda x: dataset.class_to_idx[x.replace(' ','_')])
y_validation= y_validation.apply(lambda x: dataset.class_to_idx[x.replace(' ','_')])

# ResNet DNA

In [8]:
from torch.utils.data import Dataset, DataLoader
class DNAdataset(Dataset):
    def __init__(self, data, targets, transform=None):
        self.data = data
        self.targets = torch.tensor(targets)
        #self.transform = transform
        
    def __getitem__(self, index):
        x = torch.tensor(np.float32(self.data[index][0]))
        y = self.targets[index]
        
        #if self.transform:
        #    x = Image.fromarray(self.data[index].astype(np.uint8).transpose(1,2,0))
        #    x = self.transform(x)
        
        return x, y
    
    def __len__(self):
        return len(self.data)

In [9]:
d_train = DNAdataset(X_train.values, y_train.values)
d_val = DNAdataset(X_validation.values, y_validation.values)

In [10]:
dataloader_train = DataLoader(d_train, batch_size=1)
dataloader_val = DataLoader(d_val, batch_size=1)
dataloaders = {'train':dataloader_train,'val':dataloader_val}
dataset_sizes = {'train': d_train.data.shape[0], 'val':d_val.data.shape[0]}

In [11]:
 #inputs, classes = next(iter(dataloader))   
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
import time
from tempfile import TemporaryDirectory
import os
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    #print(inputs.shape)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # deep copy the model
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model

In [13]:
class TinyModel(torch.nn.Module):

    def __init__(self):
        super(TinyModel, self).__init__()

#        self.linear1 = torch.nn.Linear(658, 200)
        self.conv1 = torch.nn.Conv2d(1,5,3)
        self.activation = torch.nn.LeakyReLU()
        #self.conv2 = torch.nn.Conv2d(2, 2,1)
        self.conv2 = torch.nn.Conv2d(5,1,(5,1))
        self.activation2 = torch.nn.LeakyReLU()
        self.flat = torch.nn.Flatten()
        self.linear = torch.nn.Linear(652*3,1049)
        self.softmax = torch.nn.Softmax()

    def forward(self, x):
        #print(x.shape)
        x = self.conv1(x)
        x = self.activation(x)
        x = self.conv2(x)
        x = self.activation2(x)
        x = self.flat(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x

tinymodel = TinyModel()
tinymodel.cuda()
optimizer = torch.optim.SGD(tinymodel.parameters())
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,max_lr=0.01,epochs= 25, steps_per_epoch= 10) 
#torch.optim.lr_scheduler.StepLR(optimizer,10)
#

In [14]:
train_model(tinymodel,torch.nn.CrossEntropyLoss(),optimizer,scheduler)

Epoch 0/24
----------


  return self._call_impl(*args, **kwargs)


train Loss: 6.9556 Acc: 0.0020
val Loss: 6.9556 Acc: 0.0004

Epoch 1/24
----------
train Loss: 6.9556 Acc: 0.0022
val Loss: 6.9556 Acc: 0.0004

Epoch 2/24
----------
train Loss: 6.9556 Acc: 0.0024
val Loss: 6.9556 Acc: 0.0006

Epoch 3/24
----------
train Loss: 6.9556 Acc: 0.0024
val Loss: 6.9556 Acc: 0.0008

Epoch 4/24
----------
train Loss: 6.9556 Acc: 0.0024
val Loss: 6.9556 Acc: 0.0008

Epoch 5/24
----------
train Loss: 6.9555 Acc: 0.0024
val Loss: 6.9556 Acc: 0.0008

Epoch 6/24
----------
train Loss: 6.9555 Acc: 0.0044
val Loss: 6.9556 Acc: 0.0083

Epoch 7/24
----------
train Loss: 6.9538 Acc: 0.0181
val Loss: 6.9505 Acc: 0.0067

Epoch 8/24
----------
train Loss: 6.9435 Acc: 0.0137
val Loss: 6.9505 Acc: 0.0067

Epoch 9/24
----------
train Loss: 6.9435 Acc: 0.0137
val Loss: 6.9505 Acc: 0.0067

Epoch 10/24
----------
train Loss: 6.9435 Acc: 0.0137
val Loss: 6.9505 Acc: 0.0067

Epoch 11/24
----------
train Loss: 6.9435 Acc: 0.0137
val Loss: 6.9505 Acc: 0.0067

Epoch 12/24
----------
t

KeyboardInterrupt: 

In [None]:
X_train.values[0][0].shape