In [1]:
import torch
import numpy as np
import torch.nn as nn
import pandas as pd
import re
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
import torchvision.transforms as transforms
from torchvision.io import read_image
from torch.utils.data import Dataset
from sklearn.model_selection import KFold
import random

In [2]:
class CDS_1D_Dataset(Dataset):
    def __init__(self, annotations_file, spec_dir, transform=None, target_transform=None, sep = ","):
        self.spec_labels = pd.read_csv(annotations_file, sep=',').iloc[:,1:]
        self.spec_number = pd.read_csv(annotations_file, sep=',').iloc[:,0]
        self.spec_dir = spec_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.spec_labels)

    def __getitem__(self, idx):
        label = self.spec_labels.iloc[idx]
        spec = torch.from_numpy(np.array(pd.read_csv(self.spec_dir + str(self.spec_number[idx])+'.csv').iloc[1:,1:-1], dtype='float32')).unsqueeze(0)
        spec = torch.reshape(spec, (41,500))
        if self.transform:
            spec = self.transform(spec)
        if self.target_transform:
            label = self.target_transform(label)
        return spec, torch.from_numpy(np.array(label, dtype='float32'))

In [None]:
path_to_data = 'Desktop/Gala_Salts/full_data/' #Where to find data
path_to_out = 'Desktop/Gala_Salts/split_3/' #Where to write out files

In [3]:
#Data import. Used to calculate mean and std
training_data = CDS_1D_Dataset(path_to_out+'Y_trn.csv', path_to_data)
validation_data = CDS_1D_Dataset(path_to_out+'Y_vld.csv',path_to_data)
test_data = CDS_1D_Dataset(path_to_out+'Y_tst.csv',path_to_data)

from torch.utils.data import DataLoader

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
validation_dataloader = DataLoader(validation_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [5]:
#Calculate mean and std
mean = 0.0
for specs, _ in train_dataloader:
    batch_samples = specs.size(0) 
    mean += specs.mean(2).sum(0)
mean = mean / len(train_dataloader.dataset)

var = 0.0
for specs, _ in train_dataloader:
    batch_samples = specs.size(0)
    var += ((specs - mean.unsqueeze(1))**2).sum([0,2])
std = torch.sqrt(var / (len(train_dataloader.dataset)*500))

In [6]:
#Custom dataset to perform normalization
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)

class Normalize1D:
    def __init__(self,mean,std):
        assert mean.shape == std.shape
        self.mean = mean
        self.std = std

    def __call__(self,x):
        # x of shape [channel, value]
        assert x.shape[0] == self.mean.shape[0]
        assert len(x.shape) == 2
        mean = self.mean.unsqueeze(-1)
        std = self.std.unsqueeze(-1)
        return (x - std) / mean

In [7]:
#Data import and normalization

training_data = CDS_1D_Dataset(path_to_out+'Y_trn.csv', path_to_data, transform= transforms.Compose([Normalize1D(mean,std)]))
validation_data = CDS_1D_Dataset(path_to_out+'Y_vld.csv', path_to_data, transform= transforms.Compose([Normalize1D(mean,std)]))
test_data = CDS_1D_Dataset(path_to_out+'Y_tst.csv', path_to_data, transform= transforms.Compose([Normalize1D(mean,std)]))

from torch.utils.data import DataLoader

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
validation_dataloader = DataLoader(validation_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [9]:
#1D CNN Class

class OneDCNN(nn.Module):

    def __init__(self):
      
        super().__init__() # since Python 3.0
        

        self.conv1 = nn.Conv1d(in_channels=41, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(in_channels=6, out_channels=16, kernel_size=5)

        self.fc1 = nn.Linear(16 * 244, 160)
        self.fc2 = nn.Linear(160, 4)

    def forward(self, x):

        x = self.pool(F.relu(self.conv1(x)))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x



In [18]:
#Model instance

N1 = OneDCNN()
print(N1)

OneDCNN(
  (conv1): Conv1d(41, 6, kernel_size=(5,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(6, 16, kernel_size=(5,), stride=(1,))
  (fc1): Linear(in_features=3904, out_features=160, bias=True)
  (fc2): Linear(in_features=160, out_features=4, bias=True)
)


In [19]:
import torch
import torch.optim as optim
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)


# Training

import torch.optim as optim

loss_function = torch.nn.MSELoss().cuda()
optimizer = optim.Adam(N1.parameters(), lr=0.001)

test_stop = 100
max_val_loss = 10000.0

for epoch_step in range(0, 1000, test_stop):
    if epoch_step!=0:
        N1 = OneDCNN()
        optimizer = optim.Adam(N1.parameters(), lr=0.001)

        checkpoint = torch.load(path_to_out + 'model.pth')
        N1.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        last_best_epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        N1.train()
        
        if last_best_epoch + test_stop > ep:
            for ep in range(epoch_step, epoch_step+test_stop):
                for _, data in enumerate(train_dataloader, 0): # get bacth
                    inputs, labels = data # parse batch
                    optimizer.zero_grad() # sets the gradients of all optimized tensors to zero.
                    outputs = N1(inputs) # get outputs
                    loss = loss_function(outputs, labels) # calculate loss
                    loss.backward() # calculate gradients
                    optimizer.step() # performs a single optimization step (parameter update).

                for specs, labels in validation_dataloader: val_loss = loss_function(N1(specs),labels)
                if val_loss.item() <= max_val_loss:
                    torch.save({'epoch': ep,
                                'model_state_dict': N1.state_dict(),
                                'optimizer_state_dict': optimizer.state_dict(),
                                'loss': loss}, path_to_out + 'model.pth')
                    max_val_loss = val_loss.item()
                print(f"Epoch={ep} loss={loss:.4f} val_loss={val_loss:.4f}")
        else: 
            continue 

    if epoch_step==0:
        for ep in range(epoch_step, test_stop):
            for _, data in enumerate(train_dataloader, 0): # get bacth
                inputs, labels = data # parse batch
                optimizer.zero_grad() # sets the gradients of all optimized tensors to zero.
                outputs = N1(inputs) # get outputs
                loss = loss_function(outputs, labels) # calculate loss
                loss.backward() # calculate gradients
                optimizer.step() # performs a single optimization step (parameter update).
            
            for specs, labels in validation_dataloader: val_loss = loss_function(N1(specs),labels)
            if val_loss.item() <= max_val_loss:
                torch.save({'epoch': ep,
                            'model_state_dict': N1.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': loss}, path_to_out + 'model.pth')
                max_val_loss = val_loss.item()
            print(f"Epoch={ep} loss={loss:.4f}")
    
   

Using device: cpu
Epoch=0 loss=18.2404
Epoch=1 loss=4.8844
Epoch=2 loss=4.5483
Epoch=3 loss=3.7751
Epoch=4 loss=3.7361
Epoch=5 loss=3.3820
Epoch=6 loss=3.2469
Epoch=7 loss=3.0071
Epoch=8 loss=2.8015
Epoch=9 loss=3.5761
Epoch=10 loss=2.7376
Epoch=11 loss=3.0870
Epoch=12 loss=3.0535
Epoch=13 loss=3.3154
Epoch=14 loss=2.6902
Epoch=15 loss=2.9982
Epoch=16 loss=2.4546
Epoch=17 loss=2.9554
Epoch=18 loss=2.0079
Epoch=19 loss=3.0068
Epoch=20 loss=2.1827
Epoch=21 loss=1.8325
Epoch=22 loss=2.3236
Epoch=23 loss=1.8462
Epoch=24 loss=1.3774
Epoch=25 loss=2.4826
Epoch=26 loss=2.4654
Epoch=27 loss=2.1671
Epoch=28 loss=2.0775
Epoch=29 loss=1.9810
Epoch=30 loss=1.9230
Epoch=31 loss=1.9293
Epoch=32 loss=2.3593
Epoch=33 loss=1.7750
Epoch=34 loss=2.0365
Epoch=35 loss=1.7648
Epoch=36 loss=1.7031
Epoch=37 loss=1.9871
Epoch=38 loss=1.3250
Epoch=39 loss=1.7071
Epoch=40 loss=1.7176
Epoch=41 loss=1.5975
Epoch=42 loss=1.2700
Epoch=43 loss=1.6827
Epoch=44 loss=1.6320
Epoch=45 loss=1.2951
Epoch=46 loss=1.1593
Epoc

In [20]:
#Get statistics
N = OneDCNN()
optimizer = optim.Adam(N.parameters(), lr=0.001)

checkpoint = torch.load(path_to_out + 'model.pth')
N.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
last_best_epoch = checkpoint['epoch']
loss = checkpoint['loss']
N.eval()

y_ae_test = np.zeros((1,4))
for specs, labels in test_dataloader:
    outputs = N(specs)
    outputs[outputs<0]=0
    ae = torch.abs(outputs-labels).detach().numpy()
    np.concatenate((y_ae_test, ae), axis=0)
    y_ae_test = np.concatenate((y_ae_test, ae), axis=0)
    
y_ae_trn = np.zeros((1,4))
for specs, labels in train_dataloader:
    outputs = N(specs)
    outputs[outputs<0]=0
    ae = torch.abs(outputs-labels).detach().numpy()
    np.concatenate((y_ae_trn, ae), axis=0)
    y_ae_trn = np.concatenate((y_ae_trn, ae), axis=0)
    
y_ae_vld = np.zeros((1,4))
for specs, labels in validation_dataloader:
    outputs = N(specs)
    outputs[outputs<0]=0
    ae = torch.abs(outputs-labels).detach().numpy()
    np.concatenate((y_ae_vld, ae), axis=0)
    y_ae_vld = np.concatenate((y_ae_vld, ae), axis=0)

In [21]:
a = ['Cu','Ni','Cr','NO3']
pd.DataFrame(y_ae_test).to_csv(path_to_out + 'Y_out_tst.csv',sep=',', header = a)
pd.DataFrame(y_ae_trn).to_csv(path_to_out + 'Y_out_trn.csv',sep=',', header = a)
pd.DataFrame(y_ae_vld).to_csv(path_to_out + 'Y_out_vld.csv',sep=',', header = a)