In [86]:
import torch
import matplotlib.pyplot as plt
import numpy as np

In [87]:
import argparse
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random
from dataset import DatasetBuilder
from utils import preprocess_sequences_to_fixed_len, standardize_and_turn_tensor
from itertools import chain
import numpy as np
from argparse import Namespace

In [88]:
import ipdb

In [89]:
class SeqDataset(Dataset):
    """In-memory dataset pairing sequence tensors with integer class labels.

    Each item is a dict with two keys: ``'sequence'`` (the slice ``X[idx, :, :]``)
    and ``'label'`` (``Y[idx]``).
    """

    def __init__(self, X, Y, device='cpu'):
        """
        Args:
            X (torch.Tensor): sequences; indexed as ``X[idx, :, :]``, so it must be
                three-dimensional with the sample index on the first axis.
            Y (torch.Tensor): one label per sample along the first axis; stored
                as ``torch.long``.
            device (str or torch.device): device both tensors are moved to once,
                at construction time.
        """
        self.X = X.to(device)
        labels_on_device = Y.to(device)
        self.Y = labels_on_device.long()

    def __len__(self):
        # The label tensor defines the number of samples.
        return self.Y.shape[0]

    def __getitem__(self, idx):
        return dict(sequence=self.X[idx, :, :], label=self.Y[idx])


def seq_data_to_dataset(seq_data, cap_len, num_features, device, standardize=True):
    """Convert raw sequence samples into a ``SeqDataset`` stored on ``device``.

    Args:
        seq_data: iterable of samples whose element ``[1]`` is the label array
            (the label arrays are joined with ``np.concatenate``).
        cap_len: forwarded to ``preprocess_sequences_to_fixed_len``.
        num_features: forwarded to ``preprocess_sequences_to_fixed_len``.
        device: device on which the resulting tensors are stored.
        standardize: forwarded to ``standardize_and_turn_tensor``.

    Returns:
        SeqDataset built from the preprocessed sequences and the labels of the
        samples whose position is not reported in ``idx_removed``.
    """
    fixed_len_seqs, idx_removed = preprocess_sequences_to_fixed_len(seq_data, cap_len, num_features)
    X = standardize_and_turn_tensor(fixed_len_seqs, standardize=standardize)

    # Keep labels aligned with the sequences that survived preprocessing.
    kept_labels = []
    for position, sample in enumerate(seq_data):
        if position in idx_removed:
            continue
        kept_labels.append(sample[1])
    Y = torch.from_numpy(np.concatenate(kept_labels))

    print(f"generated tensor datasets of size: X{X.size()}, Y{Y.size()}")
    return SeqDataset(X, Y, device)


class LSTMClassifier(nn.Module):
    """Unidirectional LSTM encoder followed by a linear head producing class logits.

    The LSTM outputs are averaged over the time axis and the average is fed to
    ``n_linear_layers_hidden`` (Linear + ReLU) pairs and a final Linear layer
    with ``n_classes`` outputs.
    """

    def __init__(self, input_size, seq_size, h_size, n_classes=2, n_lstm_layers=1, n_linear_layers_hidden=0,
                 dropout=0.):
        """
        Args:
            input_size: number of features per time step.
            seq_size: not used by the current layers; kept in the signature
                for existing callers.
            h_size: LSTM hidden size, also the width of the hidden linear layers.
            n_classes: number of output logits.
            n_lstm_layers: number of stacked LSTM layers.
            n_linear_layers_hidden: number of (Linear, ReLU) pairs before the
                output layer.
            dropout: dropout value passed to ``nn.LSTM``.
        """
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=h_size,
            num_layers=n_lstm_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=False,
        )

        # Hidden blocks first, output projection last.
        head_layers = []
        for _ in range(n_linear_layers_hidden):
            head_layers.append(nn.Linear(h_size, h_size))
            head_layers.append(nn.ReLU())
        head_layers.append(nn.Linear(h_size, n_classes))
        self.linear = nn.Sequential(*head_layers)

        self.n_classes = n_classes

    def forward(self, seq):
        """
        Args:
            seq: float tensor of size (B, Seqlen, Input_size) (the LSTM is
                built with ``batch_first=True``).

        Returns:
            Logits of size (B, n_classes), computed from the LSTM outputs
            averaged over the sequence axis.
        """
        per_step_outputs, _ = self.lstm(seq)
        pooled = per_step_outputs.mean(dim=1)
        return self.linear(pooled)


def dataset_iterator(dataset, batch_size=32, shuffle=True):
    """
    Yield consecutive slices of ``dataset`` holding at most ``batch_size`` samples each.

    :param dataset: sized, sliceable collection of samples (e.g. a list)
    :param batch_size: maximum number of samples per batch; must be >= 1
    :param shuffle: if True, batches are drawn from a shuffled shallow copy of
        ``dataset``; the caller's collection is left in its original order
    :return: generator over ``dataset[i:i + batch_size]`` slices; the last slice
        may be shorter than ``batch_size``
    :raises ValueError: if ``batch_size`` is smaller than 1 (raised when iteration
        starts, because this function is a generator)
    """
    if batch_size < 1:
        # A non-positive step would never advance through the data.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if shuffle:
        # Shuffle a copy so the caller's collection is not reordered as a side effect.
        dataset = list(dataset)
        random.shuffle(dataset)
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]

In [90]:
def _evaluate_accuracy(model, loader):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. Returns 0.0 when the loader yields no samples.
    """
    model.eval()
    correct = 0
    n_samples = 0
    with torch.no_grad():
        for batch in loader:
            sequences, ys = batch['sequence'], batch['label']
            _, pred = model(sequences.float()).max(dim=1)
            correct += float(pred.eq(ys).sum().item())
            n_samples += pred.size(0)
    model.train()
    return correct / n_samples if n_samples else 0.0


def train(args, model, optim, train_loader, test_loader=None):
    """Train ``model`` with cross-entropy loss and track validation accuracy.

    Args:
        args: namespace providing ``num_epochs`` and ``verbose``; an optional
            positive integer ``eval_every`` (default 5) sets how many training
            steps separate two validation passes.
        model: module mapping a float batch of sequences to class logits.
        optim: optimizer bound to ``model``'s parameters.
        train_loader: sized iterable of batches, each a dict with keys
            ``'sequence'`` and ``'label'``.
        test_loader: iterable of validation batches in the same format, or
            None to skip validation entirely.

    Returns:
        The highest mean observed over the five most recent validation
        accuracies. The window starts filled with zeros, so the first means
        are pulled towards zero. 0.0 is returned if validation never ran.
    """
    eval_every = getattr(args, 'eval_every', 5)
    global_step = 0
    accuracies = np.zeros(5, dtype=float)
    max_running_mean = 0.
    loss_function = nn.CrossEntropyLoss(reduction='mean')
    for epoch in range(args.num_epochs):
        model.train()
        epoch_loss = 0
        running_loss = 0
        for ix, batch in enumerate(train_loader):
            sequences, ys = batch['sequence'], batch['label']
            logits = model(sequences.float())
            loss = loss_function(logits, ys)
            # Optimization
            optim.zero_grad()
            loss.backward()
            optim.step()
            # Running loss: equal-weight blend of the previous value and the current loss
            epoch_loss += loss.item()
            if not ix:
                running_loss = loss.item()
            else:
                running_loss = running_loss * 0.5 + loss.item() * 0.5
            if args.verbose:
                print(f"Step {ix + 1}, running loss: {running_loss:.4f}")
            # Evaluation on the validation set every `eval_every` optimisation steps
            if test_loader is not None and global_step % eval_every == 0:
                acc = _evaluate_accuracy(model, test_loader)
                # Slide the fixed-size window: drop the oldest accuracy, append the newest.
                accuracies = np.concatenate([accuracies[1:], np.array([acc])])
                if accuracies.mean() > max_running_mean:
                    max_running_mean = accuracies.mean()
            # Without this increment the modulo test above is true at every step.
            global_step += 1
        if (epoch + 1) % 10 == 0:
            print(f"running mean accuracy is {accuracies.mean():.3f}")
            print("epoch", epoch, "loss:", epoch_loss / max(len(train_loader), 1))

    return max_running_mean

In [91]:
# Run configuration: every tunable of the experiment lives here and is exposed
# to the other cells through attribute access on `args`.
parser = dict(
    dataset="twitter15",
    lr=0.0005,
    num_epochs=250,
    num_lstm_layers=2,
    num_linear_layers=1,   # total linear layers in the classifier head
    hidden_size=16,
    dropout=0.5,
    batch_size=64,
    debug=0,               # truthy: train on the validation split
    test_on_train=0,       # truthy: evaluate on the training loader
    verbose=0,             # truthy: print the running loss at every step
    cap_len=40,
    use_cuda=1,            # truthy: use CUDA when it is available
    n_classes=4,
)
args = Namespace(**parser)

In [92]:
# Build the sequential splits; the binary variant is requested only for a 2-class run.
dataset_builder = DatasetBuilder(args.dataset, only_binary=(args.n_classes == 2), time_cutoff=1500)
full_dataset = dataset_builder.create_dataset(dataset_type="sequential", standardize_features=False)
val_dataset = full_dataset['val']

# In debug mode the validation split doubles as the training split.
train_dataset = val_dataset if args.debug else full_dataset['train']

Considering 4 classes problem
We consider tweets emitted no later than 1500mins after the root tweet
Len train/val/test 1005 149 336
Oversampling...
Before oversampling: 1490 trees, 1005 train trees
After oversampling: 1490 trees, 1005 train trees
Dataset loaded in 29.419s


In [93]:
# Run on the GPU only when it is both requested and actually available.
device = 'cuda' if (args.use_cuda and torch.cuda.is_available()) else 'cpu'

In [94]:
# NOTE: train_dataset / val_dataset are rebound here from raw samples to
# SeqDataset objects, so re-running this cell on its own would feed SeqDataset
# objects back into seq_data_to_dataset; re-run the dataset-building cell first.
train_dataset = seq_data_to_dataset(
    train_dataset,
    cap_len=args.cap_len,
    num_features=11,  # to change if different number of features
    standardize=True,
    device=device,
)
val_dataset = seq_data_to_dataset(
    val_dataset,
    cap_len=args.cap_len,
    num_features=11,  # to change if different number of features
    standardize=True,
    device=device,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)
if args.test_on_train:
    # Evaluate on the very batches used for training.
    val_loader = train_loader
else:
    val_loader = DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=True)

Fixed-length preprocessing: lost 0 sequences that were unit-sized, oversampled 12 sequences
Shape of input seq data is ndarry of shape (1005, 40, 11)
generated tensor datasets of size: Xtorch.Size([1005, 40, 11]), Ytorch.Size([1005])
Fixed-length preprocessing: lost 0 sequences that were unit-sized, oversampled 3 sequences
Shape of input seq data is ndarry of shape (149, 40, 11)
generated tensor datasets of size: Xtorch.Size([149, 40, 11]), Ytorch.Size([149])


In [95]:
# Class balance of the training split as {label: count}. The number of classes
# comes from the run configuration rather than a hard-coded 4; `minlength`
# keeps classes with no sample in the result with a count of 0.
label_counts = torch.bincount(train_dataset.Y.cpu(), minlength=args.n_classes)
dico = dict(enumerate(label_counts.tolist()))
dico

{0: 246, 1: 246, 2: 255, 3: 258}

In [96]:
# Instantiate the classifier from the run configuration and place it on the
# selected device. The head gets (num_linear_layers - 1) hidden layers plus the
# output layer.
model = LSTMClassifier(
    input_size=11,
    seq_size=args.cap_len,
    h_size=args.hidden_size,
    n_lstm_layers=args.num_lstm_layers,
    n_linear_layers_hidden=args.num_linear_layers - 1,
    n_classes=args.n_classes,
    dropout=args.dropout,
).to(device)

In [97]:
# Adam over all model parameters, with the learning rate taken from the run configuration.
optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)

In [98]:
# Run the full training loop; `perf` receives the value returned by train().
perf = train(
    args,
    model,
    optimizer,
    train_loader,
    test_loader=val_loader,
)

running mean accuracy is 0.412
epoch 9 loss: 1.354085698723793
running mean accuracy is 0.538
epoch 19 loss: 1.1320799365639687
running mean accuracy is 0.540
epoch 29 loss: 1.0716021843254566
running mean accuracy is 0.542
epoch 39 loss: 1.0331277251243591
running mean accuracy is 0.570
epoch 49 loss: 1.0188994593918324
running mean accuracy is 0.558
epoch 59 loss: 0.9835345484316349
running mean accuracy is 0.562
epoch 69 loss: 0.9562350474298
running mean accuracy is 0.565
epoch 79 loss: 0.9322734326124191
running mean accuracy is 0.568
epoch 89 loss: 0.9112190529704094
running mean accuracy is 0.585
epoch 99 loss: 0.8793868571519852
running mean accuracy is 0.573
epoch 109 loss: 0.8617966398596764
running mean accuracy is 0.608
epoch 119 loss: 0.834518376737833
running mean accuracy is 0.626
epoch 129 loss: 0.8110368214547634
running mean accuracy is 0.615
epoch 139 loss: 0.8038013651967049
running mean accuracy is 0.632
epoch 149 loss: 0.7934549264609814
running mean accuracy is 0

In [99]:
# Display the value returned by train(): the best running mean of validation
# accuracy observed during training.
perf

0.6644295302013423