In [23]:
import torch
from torch import nn
from torch import utils
import torchtext
from data import dataset
import tweet_utils as tu
import pandas as pd

In [24]:
# Build one DisasterData instance per split, all read from the same data/ directory.
train_dataset, valid_dataset, test_dataset, all_dataset = (
    dataset.DisasterData(split, path='data/')
    for split in ('train', 'valid', 'test', 'all')
)

In [25]:
# Sanity check: size of the training split plus one sample, which has the
# form ((id, text, location, keyword), target).
len(train_dataset),train_dataset[6800]

(6851,
 ((9741,
   'Back home they mad cause I chill with the white boys .',
   'Orlando ',
   'tragedy'),
  0))

In [26]:
# Sanity check: size of the validation split plus one labelled sample.
len(valid_dataset),valid_dataset[680]

(762,
 ((10769,
   "Check out 'Malaysia Confirms Plane Wreckage Is From Flight MH370' at  http://t.co/UB3woZ2UT1",
   'No ID, No VOTE!!!',
   'wreckage'),
  1))

In [27]:
# Sanity check: size of the test split plus one sample; test samples carry
# None as their target.
len(test_dataset), test_dataset[3000]

(3263,
 ((9914,
   'RT MMDA: ADVISORY: Stalled Bus at EDSA Service Road Cubao SB due to mechanical trouble as of 7:53 AM. 1 lane occupied. MMDA T/C on site. T\x89Û_',
   'City of Bacoor, CALABARZON PHL',
   'trouble'),
  None))

In [28]:
# Pretrained GloVe word vectors ('6B' set, 300 dimensions). collate() looks up
# one vector per token in this object, and 300 matches the input width of Model.fc1.
vec = torchtext.vocab.GloVe(name='6B', dim=300)

In [29]:
class Model(nn.Module):
    """Feed-forward classifier that maps a 300-d vector to a single raw score.

    Layer stack: Dropout(0.8) -> Linear(300, 300) -> ReLU -> Dropout(0.6)
    -> Linear(300, 100) -> ReLU -> Dropout(0.5) -> Linear(100, 1).
    No sigmoid is applied inside the model; callers apply it to the output.
    """

    def __init__(self):
        super().__init__()

        # Linear layers, narrowing 300 -> 300 -> 100 -> 1.
        self.fc1 = nn.Linear(in_features=300, out_features=300)
        self.fc2 = nn.Linear(in_features=300, out_features=100)
        self.fc3 = nn.Linear(in_features=100, out_features=1)

        # dropout0 acts on the raw input, dropout1/dropout2 on the hidden activations.
        self.dropout0 = nn.Dropout(p=0.8)
        self.dropout1 = nn.Dropout(p=0.6)
        self.dropout2 = nn.Dropout(p=0.5)

    def forward(self, x):
        """Run the layer stack on `x` (last dimension 300) and return the raw scores."""
        hidden = self.dropout0(x)

        # Both hidden stages share the same shape: linear -> ReLU -> dropout.
        hidden_stages = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
        )
        for linear, dropout in hidden_stages:
            hidden = dropout(nn.functional.relu(linear(hidden)))

        return self.fc3(hidden)

In [30]:
def collate(batch):
    """Turn a list of dataset samples into batched tensors.

    Each sample has the form ``((id, text, location, keyword), target)``. The
    text is cleaned with ``tu.process_tweet`` (web links removed), split on
    whitespace, and represented by the mean of its tokens' ``vec`` vectors.
    ``location`` and ``keyword`` are unpacked but not used.

    Args:
        batch: non-empty list of samples as described above.

    Returns:
        A 3-tuple ``(ids, xs, targets)``:
          * ids: tensor of shape (batch, 1) holding the sample ids.
          * xs: float tensor of shape (batch, vec.dim) with the mean embeddings.
          * targets: float tensor of shape (batch, 1), or ``None`` when the
            first sample's target is ``None`` (unlabelled data).
    """
    ids = []
    xs = []
    targets = []

    for (tweet_id, text, _location, _keyword), target in batch:
        tokens = tu.process_tweet(text, rm_weblinks=True).split()
        if tokens:
            x = torch.stack([vec[token] for token in tokens]).mean(dim=0)
        else:
            # No tokens survived cleaning (presumably e.g. a link-only tweet).
            # Averaging would divide by zero, so fall back to a zero vector.
            x = torch.zeros(vec.dim)
        ids.append([tweet_id])
        xs.append(x)
        targets.append([target])

    # Only the first sample decides whether the batch is labelled.
    has_labels = targets[0][0] is not None
    return (
        torch.tensor(ids),
        torch.stack(xs),
        torch.tensor(targets, dtype=torch.float) if has_labels else None,
    )

In [31]:
# Loaders used for fitting reshuffle their data; loaders used for evaluation
# and prediction keep dataset order. All use batches of 32 built by collate().
train_dataloader, all_dataloader = (
    utils.data.DataLoader(split, batch_size=32, shuffle=True, collate_fn=collate)
    for split in (train_dataset, all_dataset)
)
valid_dataloader, test_dataloader = (
    utils.data.DataLoader(split, batch_size=32, collate_fn=collate)
    for split in (valid_dataset, test_dataset)
)

In [32]:
# Pull a single batch to inspect the (ids, xs, targets) tensors built by collate().
next(iter(train_dataloader))

(tensor([[1054],
         [9562],
         [5545],
         [3582],
         [4556],
         [8100],
         [1588],
         [7292],
         [1479],
         [5443],
         [1126],
         [7467],
         [7595],
         [4047],
         [9038],
         [3523],
         [9435],
         [1951],
         [6430],
         [8507],
         [8350],
         [7610],
         [ 496],
         [5416],
         [5573],
         [5067],
         [9581],
         [5918],
         [7970],
         [6150],
         [5807],
         [9244]]),
 tensor([[-0.0698,  0.0874, -0.0642,  ...,  0.0155, -0.0085,  0.2092],
         [-0.0569,  0.0186, -0.1048,  ..., -0.1485,  0.0835,  0.4287],
         [-0.0104,  0.1229, -0.0930,  ...,  0.0849, -0.0654,  0.0400],
         ...,
         [-0.0936, -0.0352, -0.0521,  ..., -0.1619, -0.0264,  0.1369],
         [-0.1158, -0.0503, -0.0185,  ..., -0.2187, -0.0465,  0.0735],
         [-0.1914,  0.0110, -0.1207,  ..., -0.1725, -0.0866,  0.0699]]),
 tensor([[0.

In [33]:
def get_accuracy(model, dataloader):
    """Return the fraction of samples in `dataloader` that `model` labels correctly.

    The model's raw outputs are passed through a sigmoid and rounded to 0/1
    before being compared elementwise with the targets.

    Args:
        model: module mapping a batch `x` to one raw score per sample.
        dataloader: iterable of ``(ids, x, y)`` batches; `y` must be a tensor
            (labelled data). NOTE(review): assumes `y` has the same shape as
            the model output, otherwise broadcasting would miscount.

    Returns:
        Accuracy as a float in [0, 1], or NaN if the dataloader yields no samples.

    Side effects:
        Switches `model` to eval mode and leaves it there. Callers that keep
        training must call ``model.train()`` again.
    """
    correct = 0
    total = 0

    model.eval()

    with torch.no_grad():
        for _ids, x, y in dataloader:
            # Call the module itself (not .forward) so registered hooks run.
            y_hat = torch.sigmoid(model(x)).round()
            correct += int((y_hat == y).sum())
            total += x.shape[0]

    if total == 0:
        # Nothing to score: accuracy is undefined rather than a ZeroDivisionError.
        return float('nan')
    return correct / total


In [34]:
# Fresh, randomly initialised network for the exploratory run.
# NOTE(review): no torch.manual_seed call is visible in this part of the
# notebook, so the initial weights presumably differ between runs — confirm.
model = Model()

In [35]:
# Plain SGD over all model parameters with learning rate 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

In [36]:
def train(epochs, model, optimizer, valid_dataloader, train_dataloader, loss_fn=nn.BCELoss(), display_epoch=0):
    """Train `model` for `epochs` epochs, printing loss and validation accuracy.

    Args:
        epochs: number of passes over `train_dataloader`.
        model: module producing one raw score per sample; it is updated in place.
        optimizer: optimizer already bound to `model`'s parameters.
        valid_dataloader: labelled ``(ids, x, y)`` batches scored with
            get_accuracy() before training and after every epoch.
        train_dataloader: labelled ``(ids, x, y)`` batches used for the updates.
        loss_fn: loss called as ``loss_fn(probabilities, y)``. It receives
            sigmoid outputs, not raw scores, so it must be a probability-based
            loss such as the default ``nn.BCELoss()``.
        display_epoch: offset added to the printed epoch number, e.g. when
            continuing an earlier run.

    Returns:
        None. Progress is reported with print().
    """
    print("Initial validation accuracy:", get_accuracy(model, valid_dataloader))

    for epoch in range(epochs):
        n_batches = 0
        epoch_loss = 0.0

        # get_accuracy() puts the model in eval mode, so dropout has to be
        # re-enabled at the start of every epoch.
        model.train()
        for _ids, x, y in train_dataloader:
            # Call the module itself (not .forward) so registered hooks run.
            y_hat = torch.sigmoid(model(x))
            loss = loss_fn(y_hat, y)

            n_batches += 1
            epoch_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # An empty training loader gives an undefined mean loss, not a crash.
        mean_loss = epoch_loss / n_batches if n_batches else float('nan')

        print('EPOCH', epoch + 1 + display_epoch, '| loss:',
              round(mean_loss, 4),
              '| valid accuracy:',
              round(get_accuracy(model, valid_dataloader), 4))

In [37]:
# Exploratory run: fit on the training split for up to 100 epochs, reporting
# accuracy on the validation split after each epoch.
train(100, model, optimizer, valid_dataloader, train_dataloader, display_epoch=0)

Initial validation accuracy: 0.5406824146981627
EPOCH 1 | loss: 0.6758 | valid accuracy: 0.6444
EPOCH 2 | loss: 0.6234 | valid accuracy: 0.7454
EPOCH 3 | loss: 0.5911 | valid accuracy: 0.7835
EPOCH 4 | loss: 0.5671 | valid accuracy: 0.7782
EPOCH 5 | loss: 0.567 | valid accuracy: 0.7835
EPOCH 6 | loss: 0.5581 | valid accuracy: 0.7835
EPOCH 7 | loss: 0.5616 | valid accuracy: 0.7782
EPOCH 8 | loss: 0.5576 | valid accuracy: 0.7717
EPOCH 9 | loss: 0.548 | valid accuracy: 0.7979
EPOCH 10 | loss: 0.5432 | valid accuracy: 0.7887
EPOCH 11 | loss: 0.5494 | valid accuracy: 0.7835
EPOCH 12 | loss: 0.5501 | valid accuracy: 0.7887
EPOCH 13 | loss: 0.5499 | valid accuracy: 0.7822
EPOCH 14 | loss: 0.5456 | valid accuracy: 0.7848
EPOCH 15 | loss: 0.5491 | valid accuracy: 0.773
EPOCH 16 | loss: 0.5469 | valid accuracy: 0.7822
EPOCH 17 | loss: 0.5459 | valid accuracy: 0.7848
EPOCH 18 | loss: 0.5401 | valid accuracy: 0.7835
EPOCH 19 | loss: 0.5407 | valid accuracy: 0.7848
EPOCH 20 | loss: 0.5447 | valid a

KeyboardInterrupt: 

### Train the final model

In [None]:
# Start the final model from a fresh random initialisation; this rebinds
# `model`, discarding the exploratory network trained above.
model = Model()

In [None]:
# SGD for the final model with learning rate 0.02 (the exploratory run used 0.1).
optimizer = torch.optim.SGD(model.parameters(), lr=2e-2)

In [None]:
# Final fit on the 'all' split for 100 epochs.
# NOTE(review): if the 'all' split contains the validation tweets, the
# validation accuracy printed here is not a held-out estimate — confirm.
train(100, model, optimizer, valid_dataloader, all_dataloader, display_epoch=0)

In [None]:
# Score the test split and collect one (id, target) row per tweet.
# eval() and no_grad() are set explicitly here instead of relying on train()
# having left the model in eval mode; with dropout still active the
# predictions would be noisy.
model.eval()

test_ids = []
test_labels = []
with torch.no_grad():
    for ids, x, _ in test_dataloader:
        y_hat = torch.sigmoid(model(x)).round()
        # Keep ids as integers rather than concatenating them with the float
        # predictions; the rounded predictions are exactly 0.0 or 1.0, so the
        # cast to integer is lossless.
        test_ids.extend(ids.flatten().tolist())
        test_labels.extend(y_hat.flatten().long().tolist())

# Build the frame once. Growing it with pd.concat per batch copies all earlier
# rows every time and leaves a repeated per-batch index.
predictions_df = pd.DataFrame({'id': test_ids, 'target': test_labels})


In [None]:
# Store both submission columns as integers.
for column in ('target', 'id'):
    predictions_df[column] = predictions_df[column].astype(int)

In [None]:
# Inspect the submission frame before writing it to disk.
predictions_df

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so only the frame's columns are written. The predictions/ directory must already exist.
predictions_df.to_csv('predictions/simple_feedforward_first.csv', index=False)