In [1]:
import pandas as pd
import numpy as np 

import torch
from torch import nn

import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from torch.utils.data import DataLoader,TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.metrics import *


In [2]:
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

# device in which the model will be trained
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [86]:
df = pd.read_csv("/kaggle/input/milk-dataset/milk_data/all_data_ivium_new_data.csv")
df = df.drop([df.columns[0], 'path'], axis=1)

In [87]:
classif_data = df.drop('concentration', axis=1)
antibiotics = np.unique(df.antibiotic.tolist())

In [88]:
input_features = classif_data.columns[:-1].tolist()
target = "antibiotic"

In [89]:
# import some utilities to transform/preprocess our data:

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encoding categorical features
le = LabelEncoder()

In [90]:
X = classif_data.iloc[:, :-1]
y = pd.DataFrame(le.fit_transform(classif_data['antibiotic']), columns=['antibiotic'])

In [75]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

In [76]:
y_train = np_utils.to_categorical(y_train, 6)
y_val = np_utils.to_categorical(y_val, 6)

In [77]:
train_tensor_dset = TensorDataset(
    torch.tensor(X_train.values, dtype=torch.float),
    torch.tensor(y_train, dtype=torch.float)
)

valid_tensor_dset = TensorDataset(
    torch.tensor(X_val.values, dtype=torch.float),
    torch.tensor(y_val, dtype=torch.float)
)

In [78]:
class SoftOrdering1DCNN(pl.LightningModule):

    def __init__(self, input_dim, output_dim, sign_size=32, cha_input=16, cha_hidden=512, 
                 K=3, dropout_input=0.2, dropout_hidden=0.2, dropout_output=0.2):
        super().__init__()

        hidden_size = sign_size*cha_input
        sign_size1 = sign_size
        sign_size2 = sign_size//2
        output_size = (sign_size//4) * cha_hidden

        self.hidden_size = hidden_size
        self.cha_input = cha_input
        self.cha_hidden = cha_hidden
        self.K = K
        self.sign_size1 = sign_size1
        self.sign_size2 = sign_size2
        self.output_size = output_size
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.dropout_output = dropout_output

        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout1 = nn.Dropout(dropout_input)
        dense1 = nn.Linear(input_dim, hidden_size, bias=False)
        self.dense1 = nn.utils.weight_norm(dense1)

        # 1st conv layer
        self.batch_norm_c1 = nn.BatchNorm1d(cha_input)
        conv1 = conv1 = nn.Conv1d(
            cha_input, 
            cha_input*K, 
            kernel_size=5, 
            stride = 1, 
            padding=2,  
            groups=cha_input, 
            bias=False)
        self.conv1 = nn.utils.weight_norm(conv1, dim=None)

        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size = sign_size2)

        # 2nd conv layer
        self.batch_norm_c2 = nn.BatchNorm1d(cha_input*K)
        self.dropout_c2 = nn.Dropout(dropout_hidden)
        conv2 = nn.Conv1d(
            cha_input*K, 
            cha_hidden, 
            kernel_size=3, 
            stride=1, 
            padding=1, 
            bias=False)
        self.conv2 = nn.utils.weight_norm(conv2, dim=None)

        # 3rd conv layer
        self.batch_norm_c3 = nn.BatchNorm1d(cha_hidden)
        self.dropout_c3 = nn.Dropout(dropout_hidden)
        conv3 = nn.Conv1d(
            cha_hidden, 
            cha_hidden, 
            kernel_size=3, 
            stride=1, 
            padding=1, 
            bias=False)
        self.conv3 = nn.utils.weight_norm(conv3, dim=None)
        

        # 4th conv layer
        self.batch_norm_c4 = nn.BatchNorm1d(cha_hidden)
        conv4 = nn.Conv1d(
            cha_hidden, 
            cha_hidden, 
            kernel_size=5, 
            stride=1, 
            padding=2, 
            groups=cha_hidden, 
            bias=False)
        self.conv4 = nn.utils.weight_norm(conv4, dim=None)

        self.avg_po_c4 = nn.AvgPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        self.batch_norm2 = nn.BatchNorm1d(output_size)
        self.dropout2 = nn.Dropout(dropout_output)
        dense2 = nn.Linear(output_size, output_dim, bias=False)
        self.dense2 = nn.utils.weight_norm(dense2)

        self.loss = nn.MSELoss()

    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = nn.functional.celu(self.dense1(x))

        x = x.reshape(x.shape[0], self.cha_input, self.sign_size1)

        x = self.batch_norm_c1(x)
        x = nn.functional.relu(self.conv1(x))

        x = self.ave_po_c1(x)

        x = self.batch_norm_c2(x)
        x = self.dropout_c2(x)
        x = nn.functional.relu(self.conv2(x))
        x_s = x

        x = self.batch_norm_c3(x)
        x = self.dropout_c3(x)
        x = nn.functional.relu(self.conv3(x))

        x = self.batch_norm_c4(x)
        x = self.conv4(x)
        x =  x + x_s
        x = nn.functional.relu(x)

        x = self.avg_po_c4(x)

        x = self.flt(x)

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.dense2(x)

        return x

    def training_step(self, batch, batch_idx):
        X, y = batch
        y_hat = self.forward(X)
        loss = self.loss(y_hat, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        X, y = batch
        y_hat = self.forward(X)
        loss = self.loss(y_hat, y)
        self.log('valid_loss', loss)
        
    def test_step(self, batch, batch_idx):
        X, y = batch
        y_logit = self.forward(X)
        y_probs = y_logit.argmax(1).cpu().numpy()
        loss = self.loss(y_logit, y)
        metric1 = accuracy_score(y.argmax(1).cpu().numpy(), y_probs)
        metric2 = precision_score(y.argmax(1).cpu().numpy(), y_probs, average='macro')
        metric3 = recall_score(y.argmax(1).cpu().numpy(), y_probs, average='macro')
        self.log('test_loss', loss)
        self.log('test_accuracy', metric1)
        self.log('test_precision', metric2)
        self.log('test_recall', metric3)
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = {
            'scheduler': ReduceLROnPlateau(
                optimizer, 
                mode="min", 
                factor=0.5, 
                patience=5, 
                min_lr=1e-6),
            'interval': 'epoch',
            'frequency': 1,
            'reduce_on_plateau': True,
            'monitor': 'valid_loss',
        }
        return [optimizer], [scheduler]

In [79]:
model = SoftOrdering1DCNN(
    input_dim=len(input_features), 
    output_dim=6, 
)

early_stop_callback = EarlyStopping(
   monitor='valid_loss',
   min_delta=.0,
   patience=20,
   verbose=True,
   mode='min'
)

trainer = pl.Trainer(callbacks=[early_stop_callback], min_epochs=10, max_epochs=200, gpus=1)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"


In [80]:
trainer.fit(
    model, 
    DataLoader(train_tensor_dset, batch_size=32, shuffle=True, num_workers=4),
    DataLoader(valid_tensor_dset, batch_size=256, shuffle=False, num_workers=4)
)

  cpuset_checked))


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [81]:
trainer.test(model, DataLoader(valid_tensor_dset, batch_size=2048, shuffle=False, num_workers=4))

Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.03573588281869888,
  'test_accuracy': 0.8768115942028986,
  'test_precision': 0.8833864414301354,
  'test_recall': 0.8773830743520766}]

In [62]:
torch.save(model.state_dict(), "cnn_milk_v1.pth")

In [101]:
test = pd.read_csv("/kaggle/input/milk-dataset-test/csv_ivium_new_data_2022-09-12_all_cycles.csv")
test = test.drop([test.columns[0], 'target'], axis=1)

In [103]:
test

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_1031,feature_1032,feature_1033,feature_1034,feature_1035,feature_1036,feature_1037,feature_1038,feature_1039,substance
0,-0.000212,-0.000120,-0.000091,-0.000075,-0.000069,-0.000062,-0.000057,-0.000052,-0.000048,-0.000045,...,-0.000046,-0.000046,-0.000047,-0.000047,-0.000047,-0.000047,-0.000047,-0.000047,-0.000047,ceftiofur
1,-0.000171,-0.000132,-0.000112,-0.000098,-0.000091,-0.000082,-0.000074,-0.000068,-0.000062,-0.000057,...,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,ceftiofur
2,-0.000050,-0.000048,-0.000046,-0.000045,-0.000044,-0.000043,-0.000042,-0.000041,-0.000040,-0.000039,...,-0.000051,-0.000051,-0.000050,-0.000050,-0.000050,-0.000050,-0.000050,-0.000050,-0.000050,ceftiofur
3,-0.000123,-0.000087,-0.000071,-0.000061,-0.000057,-0.000051,-0.000047,-0.000043,-0.000040,-0.000038,...,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,ceftiofur
4,-0.000150,-0.000113,-0.000096,-0.000084,-0.000081,-0.000074,-0.000069,-0.000065,-0.000061,-0.000058,...,-0.000053,-0.000053,-0.000053,-0.000053,-0.000053,-0.000053,-0.000053,-0.000054,-0.000054,ceftiofur
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,-0.000047,-0.000045,-0.000044,-0.000042,-0.000041,-0.000040,-0.000039,-0.000038,-0.000038,-0.000037,...,-0.000058,-0.000058,-0.000057,-0.000057,-0.000057,-0.000057,-0.000056,-0.000056,-0.000056,tetracycline
1373,-0.000042,-0.000040,-0.000039,-0.000037,-0.000036,-0.000035,-0.000035,-0.000034,-0.000033,-0.000032,...,-0.000063,-0.000063,-0.000062,-0.000061,-0.000061,-0.000060,-0.000060,-0.000059,-0.000059,tetracycline
1374,-0.000162,-0.000125,-0.000108,-0.000097,-0.000093,-0.000086,-0.000080,-0.000074,-0.000070,-0.000065,...,-0.000046,-0.000047,-0.000047,-0.000047,-0.000047,-0.000048,-0.000048,-0.000048,-0.000048,tetracycline
1375,-0.000054,-0.000052,-0.000051,-0.000049,-0.000048,-0.000046,-0.000045,-0.000044,-0.000043,-0.000042,...,-0.000206,-0.000208,-0.000210,-0.000211,-0.000213,-0.000215,-0.000217,-0.000218,-0.000220,tetracycline


In [104]:
test.loc[test.substance == 'streptomycine', 'substance'] = 'streptomycin'

In [105]:
X_test = test.iloc[:, :-1]
y_test = pd.DataFrame(le.transform(test['substance']), columns=['substance'])

In [106]:
test

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_1031,feature_1032,feature_1033,feature_1034,feature_1035,feature_1036,feature_1037,feature_1038,feature_1039,substance
0,-0.000212,-0.000120,-0.000091,-0.000075,-0.000069,-0.000062,-0.000057,-0.000052,-0.000048,-0.000045,...,-0.000046,-0.000046,-0.000047,-0.000047,-0.000047,-0.000047,-0.000047,-0.000047,-0.000047,ceftiofur
1,-0.000171,-0.000132,-0.000112,-0.000098,-0.000091,-0.000082,-0.000074,-0.000068,-0.000062,-0.000057,...,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,ceftiofur
2,-0.000050,-0.000048,-0.000046,-0.000045,-0.000044,-0.000043,-0.000042,-0.000041,-0.000040,-0.000039,...,-0.000051,-0.000051,-0.000050,-0.000050,-0.000050,-0.000050,-0.000050,-0.000050,-0.000050,ceftiofur
3,-0.000123,-0.000087,-0.000071,-0.000061,-0.000057,-0.000051,-0.000047,-0.000043,-0.000040,-0.000038,...,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,-0.000041,ceftiofur
4,-0.000150,-0.000113,-0.000096,-0.000084,-0.000081,-0.000074,-0.000069,-0.000065,-0.000061,-0.000058,...,-0.000053,-0.000053,-0.000053,-0.000053,-0.000053,-0.000053,-0.000053,-0.000054,-0.000054,ceftiofur
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,-0.000047,-0.000045,-0.000044,-0.000042,-0.000041,-0.000040,-0.000039,-0.000038,-0.000038,-0.000037,...,-0.000058,-0.000058,-0.000057,-0.000057,-0.000057,-0.000057,-0.000056,-0.000056,-0.000056,tetracycline
1373,-0.000042,-0.000040,-0.000039,-0.000037,-0.000036,-0.000035,-0.000035,-0.000034,-0.000033,-0.000032,...,-0.000063,-0.000063,-0.000062,-0.000061,-0.000061,-0.000060,-0.000060,-0.000059,-0.000059,tetracycline
1374,-0.000162,-0.000125,-0.000108,-0.000097,-0.000093,-0.000086,-0.000080,-0.000074,-0.000070,-0.000065,...,-0.000046,-0.000047,-0.000047,-0.000047,-0.000047,-0.000048,-0.000048,-0.000048,-0.000048,tetracycline
1375,-0.000054,-0.000052,-0.000051,-0.000049,-0.000048,-0.000046,-0.000045,-0.000044,-0.000043,-0.000042,...,-0.000206,-0.000208,-0.000210,-0.000211,-0.000213,-0.000215,-0.000217,-0.000218,-0.000220,tetracycline


In [107]:
y_test = np_utils.to_categorical(y_test, 6)

In [108]:
test_tensor_dset = TensorDataset(
    torch.tensor(X_test.values, dtype=torch.float),
    torch.tensor(y_test, dtype=torch.float)
)

In [110]:
trainer.test(model, DataLoader(test_tensor_dset, batch_size=2048
                               , shuffle=False, num_workers=4))

  cpuset_checked))


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.02033277601003647,
  'test_accuracy': 0.9448075526506899,
  'test_precision': 0.9465857719393673,
  'test_recall': 0.9437520128824477}]