In [43]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from dataset import DatasetBuilder
from utils import preprocess_sequences_to_fixed_len, standardize_and_turn_tensor
from itertools import chain
import numpy as np

In [44]:
class SeqDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, X, Y):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.X = X
        self.Y = Y

    def __len__(self):
        return self.Y.size(0)

    def __getitem__(self, idx):
        sample = {'sequence': self.X[idx, :, :], 'label': self.Y[idx]}
        return sample


def seq_data_to_dataset(seq_data, cap_len, num_features, standardize=True):
    X, idx_removed = preprocess_sequences_to_fixed_len(seq_data, cap_len, num_features)
    X = standardize_and_turn_tensor(X, standardize=False)
    Y = torch.from_numpy(np.concatenate([x_y[1] for ix, x_y in enumerate(seq_data) if ix not in idx_removed]))
    print(f"generated tensor datasets of size: X{X.size()}, Y{Y.size()}")
    return SeqDataset(X, Y)


class LSTMClassifier(nn.Module):
    def __init__(self, input_size, seq_size, h_size, n_classes=4, n_lstm_layers=1, n_linear_layers_hidden=0,
                 dropout=0.):
        super(LSTMClassifier, self).__init__()

        # self.bn = nn.BatchNorm1d(seq_size, affine=False)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=h_size, num_layers=n_lstm_layers,
                            batch_first=True, dropout=dropout,
                            bidirectional=False)
        self.linear = nn.Sequential(
            *chain(*[(nn.Linear(h_size, h_size), nn.ReLU()) for _ in range(n_linear_layers_hidden)]),
            nn.Linear(h_size, n_classes))
        self.n_classes = n_classes

    def forward(self, seq):
        """
        input = seq = torch FloatTensor of size (B, Seqlen, Input_size)
        output is of size (B, hidden_size * num layers) -> concatenation of the last hidden state over the LSTM layers
        returns = the logits of this output passed to a linear layer
        """
        # seq = self.bn(seq)
        out, (_, _) = self.lstm(seq)
        return self.linear(out.mean(1))

### Parameters:

In [45]:
cap_len = 40
hidden_size = 24
num_lstm_layers = 2
num_linear_layers = 1
dropout_prob = 0.5
lr = 0.001
num_epochs = 100
batch_size = 64

In [46]:
def eval_func(model, loader):
    model.eval()
    correct = 0
    n_samples = 0
    with torch.no_grad():
        for batch in loader:  # dataset_iterator(test_loader, batch_size=args.batch_size, shuffle=True):
            sequences, ys = batch['sequence'], batch['label']
            _, pred = model(sequences.float()).max(dim=1)
            correct += float(pred.eq(ys).sum().item())
            n_samples += pred.size(0)
    return correct / n_samples


def train(model, optim, train_loader, val_loader=None):
    # Training phase
    loss_function = nn.CrossEntropyLoss(reduction='mean')
    for epoch in range(num_epochs):

        model.train()
        epoch_loss = 0
        for ix, batch in enumerate(
                train_loader):  # enumerate(dataset_iterator(train_loader, batch_size=args.batch_size, shuffle=True)):
            sequences, ys = batch['sequence'], batch['label']
            logits = model(sequences.float())  # .float())
            loss = loss_function(logits, ys)

            # Optimization
            optim.zero_grad()
            loss.backward()
            optim.step()

            # Printing running loss
            epoch_loss += loss.item()

        if (epoch + 1) % 50 == 0:
            print(
                f"Epoch{epoch + 1}, Loss: {epoch_loss / len(train_loader):.3f}, Accuracy on Val: {eval_func(model, val_loader):.3f}")

In [47]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [48]:
exp_name = 'twitter16'

In [None]:
dataset_builder = DatasetBuilder(exp_name, only_binary=False, time_cutoff=None, features_to_consider="text_only")
full_dataset = dataset_builder.create_dataset(dataset_type="sequential", standardize_features=False)

Considering 4 classes problem
No time consideration
Features that will be considered: text_only
Len train/val/test 552 82 184


In [None]:
train_dataset = full_dataset['train']
val_dataset = full_dataset['val']
test_dataset = full_dataset['test']

In [None]:
for elt in train_dataset:
    try:
        elt[1].shape
    except:
        print(elt)

In [27]:
train_dataset = seq_data_to_dataset(train_dataset, cap_len=cap_len, num_features=768, standardize=False)

Fixed-length preprocessing: lost 0 sequences that were unit-sized, oversampled 0 sequences
Shape of input seq data is ndarry of shape (1005, 40, 768)
generated tensor datasets of size: Xtorch.Size([1005, 40, 768]), Ytorch.Size([1005])


In [28]:
val_dataset = seq_data_to_dataset(val_dataset, cap_len=cap_len, num_features=768, standardize=False)
test_dataset = seq_data_to_dataset(test_dataset, cap_len=cap_len, num_features=768, standardize=False)

Fixed-length preprocessing: lost 0 sequences that were unit-sized, oversampled 0 sequences
Shape of input seq data is ndarry of shape (149, 40, 768)
generated tensor datasets of size: Xtorch.Size([149, 40, 768]), Ytorch.Size([149])
Fixed-length preprocessing: lost 0 sequences that were unit-sized, oversampled 0 sequences
Shape of input seq data is ndarry of shape (336, 40, 768)
generated tensor datasets of size: Xtorch.Size([336, 40, 768]), Ytorch.Size([336])


In [29]:
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size,
                        shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size,
                        shuffle=True)

In [39]:
model = LSTMClassifier(input_size=768,
                       seq_size=cap_len,
                       h_size=hidden_size,
                       n_lstm_layers=num_lstm_layers,
                       n_linear_layers_hidden=num_linear_layers - 1,
                       dropout=dropout_prob)

In [40]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [41]:
train(model, optimizer, train_loader, val_loader)

Epoch50, Loss: 0.412, Accuracy on Val: 0.597
Epoch100, Loss: 0.245, Accuracy on Val: 0.584


In [42]:
print(f"LSTMtext, {exp_name}, {num_epochs}, {batch_size}, {lr}, {hidden_size}, {num_lstm_layers}, {num_linear_layers}, {dropout_prob}, {eval_func(model, train_loader):.4f}, {eval_func(model, val_loader):.4f}, {eval_func(model, test_loader):.4f}")

LSTMtext, twitter15, 100, 64, 0.001, 24, 2, 1, 0.5, 0.8985, 0.5839, 0.6220
