In [1]:
import torch
import torchvision
from torch import nn 
#from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image 
import numpy as np
import random
import dataset_utils
from torch.utils.data import Dataset, DataLoader

In [3]:
image_dataset = torchvision.datasets.ImageFolder("image_dataset/")
df = pd.read_csv('final_dataset.csv',index_col=0)

nucleotides = df[['nucleotide','species_name','genus_name','processid','image_urls']]
dna_column = df.loc[:,"nucleotide"]
nucleotides.loc[:,'nucleotide'] = dna_column.apply(dataset_utils.one_hot_encoding)
random.seed(42)

X_train_val, X_test, y_train_val, y_test = dataset_utils.data_split(nucleotides,0.2,random_state=42)
print(y_test)
train_data = X_train_val
train_data['species_name'] = y_train_val

X_train, X_validation, y_train, y_validation = dataset_utils.data_split(train_data,0.2,drop_labels=False,random_state=42)

y_train = y_train.apply(lambda x: image_dataset.class_to_idx[x.replace(' ','_')])
y_test = y_test.apply(lambda x: image_dataset.class_to_idx[x.replace(' ','_')])
y_validation= y_validation.apply(lambda x: image_dataset.class_to_idx[x.replace(' ','_')])

365    Bembidion normannum
292       Bledius gallicus
321       Praxis edwardsii
352        Andrena pilipes
18     Automeris managuana
              ...         
412         Hemiceras losa
413         Hemiceras losa
417     Hemiceras punctata
418         Hemiceras losa
421     Hemiceras punctata
Name: species_name, Length: 9991, dtype: object


In [4]:
class DNAdataset(Dataset):
    def __init__(self, data, targets, transform=None):
        self.data = data
        self.targets = torch.tensor(targets)
        #self.transform = transform
        
    def __getitem__(self, index):
        x = torch.tensor(np.float32(self.data[index][0]))
        y = self.targets[index]
        
        #if self.transform:
        #    x = Image.fromarray(self.data[index].astype(np.uint8).transpose(1,2,0))
        #    x = self.transform(x)
        
        return x, y
    
    def __len__(self):
        return len(self.data)
d_train = DNAdataset(X_train.values, y_train.values)
d_val = DNAdataset(X_validation.values, y_validation.values)

In [5]:
dataloader_train = DataLoader(d_train, batch_size=32)
dataloader_val = DataLoader(d_val, batch_size=32)
dataloaders = {'train':dataloader_train,'val':dataloader_val}
dataset_sizes = {'train': d_train.data.shape[0], 'val':d_val.data.shape[0]}

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
import time
from tempfile import TemporaryDirectory
import os
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs[:,None,:,:]
                    inputs = inputs.to(device)
                    #print(inputs.shape)
                    labels = labels.to(device)
                    

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # deep copy the model
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model

In [10]:
class TinyModel(torch.nn.Module):

    def __init__(self):
        super(TinyModel, self).__init__()

        #self.linear1 = torch.nn.Linear(658, 200)
        self.conv1 = torch.nn.Conv2d(1,8,(5,1))
        self.activation1 = torch.nn.LeakyReLU()
        self.norm1 = torch.nn.BatchNorm2d(8)
        self.conv2 = torch.nn.Conv2d(8,1,(5,1))
        self.activation2 = torch.nn.LeakyReLU()
        self.norm2 = torch.nn.BatchNorm2d(1)
        #self.conv2 = torch.nn.Conv2d(2, 2,1)
        #self.conv2 = torch.nn.Conv2d(5,1,(3,1))
        #self.activation2 = torch.nn.LeakyReLU()
        #self.norm2 = torch.nn.BatchNorm2d(1)
        self.flat = torch.nn.Flatten()
        self.linear = torch.nn.Linear(3250,1500)
        self.dropout= torch.nn.Dropout(0.30)
        self.activation3 = torch.nn.LeakyReLU()
        self.linear2 = torch.nn.Linear(1500,1050)
        #self.softmax = torch.nn.Softmax()
    def forward(self, x):
        #print(x.shape)
        x = self.conv1(x)
        x = self.activation1(x)
        x = self.norm1(x)
        x = self.conv2(x)
        x = self.activation2(x)
        x = self.norm2(x)
        #x = self.conv2(x)
        #x = self.activation2(x)
        #x = self.norm2(x)
        x = self.flat(x)
        x = self.linear(x)
        x = self.dropout(x)
        x = self.activation3(x)
        x = self.linear2(x)
        #x = self.softmax(x)
        return x
'''    
    def __init__(self):
        super(TinyModel, self).__init__()
        self.flat = torch.nn.Flatten()
        self.linear1 = torch.nn.Linear(658*5,658*2)
        self.dropout1= torch.nn.Dropout(0.2)
        self.activation1 = torch.nn.LeakyReLU()
        self.linear2 = torch.nn.Linear(658*2,1500)
        self.dropout2= torch.nn.Dropout(0.2)
        self.activation2 = torch.nn.LeakyReLU()
        self.linear3 = torch.nn.Linear(1500,1049)
    def forward(self, x):
        x = self.flat(x)
        x = self.linear1(x)
        x = self.dropout1(x)
        x = self.activation1(x)
        x = self.linear2(x)
        x = self.dropout2(x)
        x = self.activation2(x)
        x = self.linear3(x)
        return x
 '''   
tinymodel = TinyModel()
tinymodel.cuda()
optimizer = torch.optim.Adam(tinymodel.parameters(),weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,max_lr=0.005,epochs= 25, steps_per_epoch= 10) 

In [11]:
model_parameters = filter(lambda p: p.requires_grad, tinymodel.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(params)

6452657


In [None]:
train_model(tinymodel,torch.nn.CrossEntropyLoss(),optimizer,scheduler)

Epoch 0/24
----------
train Loss: 3.5189 Acc: 0.5011
val Loss: 8.6392 Acc: 0.4305

Epoch 1/24
----------
train Loss: 0.2940 Acc: 0.9606
val Loss: 8.3719 Acc: 0.4657

Epoch 2/24
----------
train Loss: 0.0789 Acc: 0.9856
val Loss: 8.1426 Acc: 0.4686

Epoch 3/24
----------
train Loss: 0.0381 Acc: 0.9931
val Loss: 7.7495 Acc: 0.4693

Epoch 4/24
----------
train Loss: 0.0256 Acc: 0.9954
val Loss: 7.1228 Acc: 0.4702

Epoch 5/24
----------
train Loss: 0.0199 Acc: 0.9969
val Loss: 6.6781 Acc: 0.4702

Epoch 6/24
----------
train Loss: 0.0254 Acc: 0.9961
val Loss: 6.4112 Acc: 0.4696

Epoch 7/24
----------
train Loss: 0.0548 Acc: 0.9907
val Loss: 6.9089 Acc: 0.4668

Epoch 8/24
----------
train Loss: 0.0543 Acc: 0.9906
val Loss: 6.4920 Acc: 0.4700

Epoch 9/24
----------
train Loss: 0.0541 Acc: 0.9910
val Loss: 6.4759 Acc: 0.4675

Epoch 10/24
----------
train Loss: 0.0542 Acc: 0.9914
val Loss: 7.0440 Acc: 0.4684

Epoch 11/24
----------
train Loss: 0.0988 Acc: 0.9849
val Loss: 6.1734 Acc: 0.4671

Ep

In [13]:
torch.save({
            'epoch':24,
            'model_state_dict': tinymodel.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, "checkpoints/firstTinyModel")