In [1]:
import pandas as pd
import numpy as np
from pytorch_lightning.utilities.types import OptimizerLRScheduler
from tqdm.auto import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from multiprocessing import cpu_count
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics import Accuracy
from sklearn.metrics import classification_report, confusion_matrix

# Render matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Base seaborn theme; the palette is then replaced by the custom six-colour list below.
sns.set(style='whitegrid', palette='muted')
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default figure size (width, height) for every plot in the notebook.
rcParams['figure.figsize'] = 16, 10

# Reproducibility: every RNG is seeded from the single RANDOM_SEED constant so that
# changing the seed in one place changes it everywhere.
RANDOM_SEED = 42
pl.seed_everything(RANDOM_SEED)
# Kept explicit so NumPy / PyTorch seeding is visible here regardless of what
# seed_everything covers.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [2]:
# Load the feature table and the label table from the career-con-2019 data folder.
# Later cells rely on a `series_id` column in both frames and a `surface` column in y_train.
X_train = pd.read_csv("./data/career-con-2019/X_train.csv")
y_train = pd.read_csv("./data/career-con-2019/y_train.csv")

In [3]:
# Class balance: how many label rows exist for each surface type.
surface_counts = y_train["surface"].value_counts()
surface_counts.plot.bar()
plt.xticks(rotation=30)
plt.show()
plt.close()

# Data Pre-processing

- **Strings to integers** (encode each surface name as an integer class label)

In [4]:
# Fit the encoder on the surface names and convert every row's surface to an integer class id.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_train.surface)

# Print the class-name -> id mapping. Each class is paired with its own encoded value:
# zipping `encoded_labels` (one entry per row) against `classes_` would instead show the
# labels of the first few rows next to unrelated class names.
for label_name, enc_label in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"Original Label -> encoded label: {label_name} -> {enc_label}")

In [5]:
# Attach the integer class id to each label row (encoded_labels was produced from
# y_train.surface, so it is in the same row order), then preview the result.
y_train["label"] = encoded_labels
y_train.head()

In [6]:
# Preview the first rows of the feature table.
X_train.head()

In [7]:
# Model inputs: every column of X_train after the first three.
FEATURE_COLUMNS = list(X_train.columns[3:])
FEATURE_COLUMNS

In [8]:
# Sanity check: the number of series_id values that have exactly 128 rows must equal the
# number of label rows, i.e. no series is truncated or split.
(X_train.series_id.value_counts() == 128).sum() == len(y_train)  # True

In [9]:
# Build one (features, label) pair per series.
# The labels are looked up from a Series indexed by series_id that is built once, instead of
# re-filtering y_train with a boolean mask for every group. Keeping the first row per
# series_id matches the previous `.iloc[0]` selection.
label_by_series = y_train.drop_duplicates("series_id").set_index("series_id")["label"]

sequences = []
for series_id, group in X_train.groupby("series_id"):
    sequence_features = group[FEATURE_COLUMNS]
    label = label_by_series.loc[series_id]

    sequences.append((sequence_features, label))

In [10]:
# Inspect the first (features, label) pair.
sequences[0]

In [11]:
# Hold out 20% of the sequences. Fixing random_state makes the split identical every time
# this cell runs, independent of how much global NumPy RNG state was consumed before it.
train_sequences, test_sequences = train_test_split(sequences, test_size=0.2, random_state=RANDOM_SEED)
print(len(train_sequences), len(test_sequences))

# Creating dataset

In [12]:
class SurfaceDataset(Dataset):
    """Map-style dataset over ``(features, label)`` pairs.

    Each element of ``sequences`` is a tuple whose first item exposes ``to_numpy()``
    (the notebook passes pandas DataFrames) and whose second item is the class label.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        """Number of stored pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return pair ``idx`` as a dict with a ``sequence`` tensor and a long ``label`` tensor."""
        frame, target = self.sequences[idx]
        features = torch.Tensor(frame.to_numpy())
        target_tensor = torch.tensor(target).long()
        return {"sequence": features, "label": target_tensor}

In [13]:
class SurfaceDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the training and held-out sequence lists.

    Note that the validation and test loaders both read from ``test_sequences``.
    """

    def __init__(self, train_sequences, test_sequences, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences

    def setup(self, stage=None):
        """Wrap both sequence lists in ``SurfaceDataset`` objects."""
        self.train_dataset = SurfaceDataset(self.train_sequences)
        self.test_dataset = SurfaceDataset(self.test_sequences)

    def _loader(self, dataset, shuffle):
        # Every loader shares the batch size and uses one worker per CPU core.
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=cpu_count())

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the held-out dataset (same data as the test loader)."""
        return self._loader(self.test_dataset, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the held-out dataset."""
        return self._loader(self.test_dataset, shuffle=False)

In [14]:
# Training hyper-parameters: N_EPOCHS is the trainer's max_epochs, BATCH_SIZE is used by
# every DataLoader created by the data module.
N_EPOCHS = 250
BATCH_SIZE = 64

data_module = SurfaceDataModule(train_sequences, test_sequences, BATCH_SIZE)

# Model

In [15]:
class SequenceModel(nn.Module):
    """Stacked LSTM followed by a linear classification head.

    Args:
        n_features: Number of input features per time step.
        num_classes: Number of output classes (size of the logits vector).
        n_hidden: Hidden size of every LSTM layer.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between LSTM layers. Defaults to the
            previously hard-coded 0.75. Forced to 0 when ``n_layers == 1`` because
            there is no inter-layer connection to apply it to.
    """

    def __init__(self, n_features, num_classes, n_hidden=256, n_layers=3, dropout=0.75):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.classifier = nn.Linear(n_hidden, num_classes)

    def forward(self, x):
        """Return class logits for a batch ``x`` laid out as (batch, seq_len, n_features)."""
        self.lstm.flatten_parameters()  # For Multi-GPU purposes we flatten parameters
        _, (hidden, _) = self.lstm(x)

        # `hidden` stacks the final hidden state of every layer; keep only the top layer's.
        out = hidden[-1]
        return self.classifier(out)
        

In [16]:
from torchmetrics.classification import MulticlassAccuracy


class SurfacePredictor(pl.LightningModule):
    """LightningModule that trains ``SequenceModel`` with cross-entropy loss.

    A separate ``MulticlassAccuracy`` metric is kept for each of the train, validation
    and test stages and logged next to the stage's loss.

    Args:
        n_features: Number of input features per time step.
        num_classes: Number of target classes.
    """

    def __init__(self, n_features, num_classes):
        super().__init__()
        # Record the constructor arguments so they are stored with every checkpoint;
        # passing them explicitly to load_from_checkpoint keeps working.
        self.save_hyperparameters()
        self.model = SequenceModel(n_features=n_features, num_classes=num_classes)
        self.criterion = nn.CrossEntropyLoss()

        # NOTE(review): metrics use the library's default averaging mode; confirm it
        # matches the accuracy definition intended for reporting.
        self.train_accuracy = MulticlassAccuracy(num_classes=num_classes)
        self.val_accuracy = MulticlassAccuracy(num_classes=num_classes)
        self.test_accuracy = MulticlassAccuracy(num_classes=num_classes)

    def forward(self, x, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``0`` when no labels are given."""
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def _shared_step(self, batch, stage, metric):
        """Run one batch, update ``metric`` and log ``{stage}_loss`` / ``{stage}_accuracy``."""
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)

        metric.update(predictions, labels)

        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
        self.log(f"{stage}_accuracy", metric, prog_bar=True, logger=True)

        return {"loss": loss, "accuracy": metric}

    def training_step(self, batch, batch_idx):
        """Training batch: logs ``train_loss`` and ``train_accuracy``."""
        return self._shared_step(batch, "train", self.train_accuracy)

    def validation_step(self, batch, batch_idx):
        """Validation batch: logs ``val_loss`` and ``val_accuracy``."""
        return self._shared_step(batch, "val", self.val_accuracy)

    def test_step(self, batch, batch_idx):
        """Test batch: logs ``test_loss`` and ``test_accuracy``."""
        return self._shared_step(batch, "test", self.test_accuracy)

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 1e-4."""
        return optim.Adam(self.parameters(), lr=0.0001)

In [17]:
# Input width is the number of feature columns; output width is the number of encoded surface classes.
model = SurfacePredictor(n_features=len(FEATURE_COLUMNS), num_classes=len(label_encoder.classes_))

In [18]:
# Load the TensorBoard notebook extension and open it on the ./lightning_logs directory.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [19]:
# Keep only the single best checkpoint, ranked by the lowest logged "val_loss".
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="best-checkpoint",
    verbose=True,
)

# TensorBoard logger rooted at "lightning_logs", with runs grouped under the name "surface".
logger = TensorBoardLogger("lightning_logs", name="surface")
trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    logger=logger,
    callbacks=checkpoint_callback,
    enable_progress_bar=True,
)

In [20]:
# Set PyTorch's float32 matmul precision to 'medium' before training starts.
torch.set_float32_matmul_precision('medium')
# Train the model using the loaders supplied by the data module.
trainer.fit(model, data_module)

In [21]:
# Final result: run the test loop on the held-out loader.
# NOTE(review): neither a model nor a ckpt_path is passed to trainer.test — confirm which
# weights (last epoch vs. best checkpoint) the installed Lightning version evaluates here.
test_loader = data_module.test_dataloader()
trainer.test(dataloaders=test_loader)

In [22]:
# Predictions: rebuild the module from the best checkpoint path recorded by the trainer's
# checkpoint callback, passing the same constructor arguments used for training.
trained_model = SurfacePredictor.load_from_checkpoint(trainer.checkpoint_callback.best_model_path,
                                                      n_features=len(FEATURE_COLUMNS),
                                                      num_classes=len(label_encoder.classes_))

# Freeze the restored model before using it for inference.
trained_model.freeze()

In [23]:
# Scoring the held-out sequences one at a time with the restored model
test_dataset = SurfaceDataset(test_sequences)

predictions = []
labels = []

for item in tqdm(test_dataset):
    sequence = item["sequence"]
    label = item['label']

    # Add a batch dimension and move the input to whichever device the model is on,
    # instead of assuming a CUDA device is available.
    _, output = trained_model(sequence.unsqueeze(dim=0).to(trained_model.device))
    prediction = torch.argmax(output, dim=1)
    predictions.append(prediction.item())

    labels.append(label.item())

In [24]:
# Pass the full list of class ids so the report always has one row per encoder class and
# the number of target names matches, even if a rare surface is absent from the split.
print(classification_report(labels, predictions,
                            labels=list(range(len(label_encoder.classes_))),
                            target_names=label_encoder.classes_))

In [25]:
def show_confusion_matrix(confusion_matrix):
    """Draw an annotated heatmap of a confusion matrix.

    Args:
        confusion_matrix: Table of integer counts (cells are formatted with ``'d'``);
            the notebook passes a DataFrame whose index and columns are class names.
            Rows are labelled as the true surface, columns as the predicted surface.
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues')
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    # Rotate the x tick labels on the x axis. Assigning them to the y axis would
    # overwrite the true-class labels with the predicted-class ones.
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')

    plt.ylabel('True surface')
    plt.xlabel('Predicted surface')

In [26]:
# Fix the label set so the matrix always has one row/column per encoder class and lines up
# with the class names used for the DataFrame index and columns, even if a class never
# occurs in `labels` or `predictions`.
cm = confusion_matrix(labels, predictions, labels=list(range(len(label_encoder.classes_))))
df_cm = pd.DataFrame(cm, index=label_encoder.classes_, columns=label_encoder.classes_)
df_cm

In [27]:
# Plot the class-labelled confusion matrix as a heatmap.
show_confusion_matrix(df_cm)