In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils as utils
import torchtext

from data import dataset
import tweet_utils as tu
import pandas as pd

In [2]:
! python3 data/_shuffle.py data/
train_dataset = dataset.DisasterData('train', path='data/', tweet_prep_fn=tu.process_tweet)
valid_dataset = dataset.DisasterData('valid', path='data/', tweet_prep_fn=tu.process_tweet)
test_dataset = dataset.DisasterData('test', path='data/', tweet_prep_fn=tu.process_tweet)
all_dataset = dataset.DisasterData('all', path='data/', tweet_prep_fn=tu.process_tweet)

data/train.csv successfully shuffled!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._data['text'] = self._data['text'].apply(tweet_prep_fn)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._data['text'] = self._data['text'].apply(tweet_prep_fn)


In [3]:
# load embeddings
EMBEDDING_N = 300
vec = torchtext.vocab.GloVe(name='6B', dim=EMBEDDING_N)
UNK_T = torch.randn(EMBEDDING_N)
EOS_T = torch.randn(EMBEDDING_N)
PAD_T = [0.] * EMBEDDING_N
vec.unk_init = lambda x: UNK_T

In [4]:
torch.equal(vec['thereisnotimeforfussingandfighting'], UNK_T)

True

In [5]:
next(iter(train_dataset))

((1873,
  'the stars are burning i here your voice in my mind',
  '627',
  'burning'),
 0)

In [6]:
def text_to_vector(text, pad_n=0, add_eos=True, prep_fn=tu.process_tweet):
    if prep_fn:
        tokens = prep_fn(text, rm_weblinks=True).split()
    else:
        tokens = text.split()

    res = [vec[token] for token in tokens]
    if add_eos:
        res += [EOS_T]
    if pad_n > 0:
        res += [PAD_T] * pad_n

    return [[float(v) for v in b] for b in res]

In [7]:
def collate(batch):
    ids = []
    X = []
    targets = []
    max_len = 0

    for ((id, text, location, keyword), target) in batch:
        X.append(text_to_vector(text, prep_fn=None))
        max_len = max(len(X[-1]), max_len)

        ids.append(id)
        targets.append([target])

    for v in X:
        for t in range(len(v), max_len):
            v.append(PAD_T)


    return torch.tensor(ids), torch.tensor(X), torch.tensor(targets, dtype=torch.float) if targets[0][0] is not None else None

In [8]:
train_dataloader = utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate)
all_dataloader = utils.data.DataLoader(all_dataset, batch_size=32, shuffle=True, collate_fn=collate)
valid_dataloader = utils.data.DataLoader(valid_dataset, batch_size=32, collate_fn=collate)
test_dataloader = utils.data.DataLoader(test_dataset, batch_size=32, collate_fn=collate)

In [9]:
class Model(nn.Module):

    def __init__(self, input_d, hidden_d, layers_n=1):
        super(Model, self).__init__()

        self.Lstm = nn.LSTM(input_size=input_d, hidden_size=hidden_d, num_layers=layers_n, batch_first=True)
        self.Fc = nn.Linear(hidden_d, 1)
        self.Dropout1 = nn.Dropout(0.5)
        self.Dropout2 = nn.Dropout(0.4)

    def forward(self, x):

        if len(x.shape) == 2:
            x = x[:,None,:]

        x = self.Dropout1(x)
        x, _ = self.Lstm(x)
        x = x[:,-1,:]     # taking the last output for each sequence

        x = self.Dropout2(x)
        x = self.Fc(x)
        return x

In [10]:
model1 = Model(EMBEDDING_N, 100, layers_n=2)

In [11]:
n_param = 0
for param in model1.parameters():
    n_param += len(param)

print('Parameters number:', n_param)

Parameters number: 3202


In [12]:
optimizer = torch.optim.Adam(model1.parameters(), lr=1e-4)

In [13]:
def get_accuracy(model, dataloader):
    accurate = 0
    total = 0

    model.eval()

    with torch.no_grad():
        for id, x, y in dataloader:
            y_hat = torch.sigmoid(model.forward(x)).round()
            accurate += float((y_hat == y).sum())
            total += float(x.shape[0])

    return accurate / total

In [14]:
def train(epochs, model, optimizer, valid_dataloader, train_dataloader, loss_fn=nn.BCELoss(), display_epoch=0):
    print("Initial validation accuracy:", get_accuracy(model, valid_dataloader))

    for i in range(epochs):

        j = 0
        epoch_loss = 0
        batches = len(train_dataloader)

        accurate = 0
        total = 0

        model.train()
        for b_i, (id, x, y) in enumerate(train_dataloader):
            y_hat = torch.sigmoid(model.forward(x))
            loss = loss_fn(y_hat, y)

            with torch.no_grad():
                accurate += float((y_hat.round() == y).sum())
                total += float(x.shape[0])

            j += 1
            epoch_loss += float(loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if b_i % (batches // 12) == 0:
                print('-', end='')

        print(' EPOCH %03d' % int(i + 1 + display_epoch),
              '| TLoss: %.4f' % round(epoch_loss / j, 4),
              '| TAccuracy: %.4f' % round(accurate / total, 4),
              '| VAccuracy: %.4f' % round(get_accuracy(model, valid_dataloader), 4))

In [15]:
train(30, model1, optimizer, valid_dataloader, train_dataloader, display_epoch=100)

Initial validation accuracy: 0.6089238845144357
------------- EPOCH 101 | TLoss: 0.6846 | TAccuracy: 0.5662 | VAccuracy: 0.6089
------------- EPOCH 102 | TLoss: 0.6228 | TAccuracy: 0.6479 | VAccuracy: 0.7231
------------- EPOCH 103 | TLoss: 0.5139 | TAccuracy: 0.7634 | VAccuracy: 0.7769
------------- EPOCH 104 | TLoss: 0.4847 | TAccuracy: 0.7824 | VAccuracy: 0.7953
------------- EPOCH 105 | TLoss: 0.4747 | TAccuracy: 0.7888 | VAccuracy: 0.8031
------------- EPOCH 106 | TLoss: 0.4587 | TAccuracy: 0.7943 | VAccuracy: 0.8097
------------- EPOCH 107 | TLoss: 0.4523 | TAccuracy: 0.7952 | VAccuracy: 0.8176
------------- EPOCH 108 | TLoss: 0.4528 | TAccuracy: 0.7958 | VAccuracy: 0.8110
------------- EPOCH 109 | TLoss: 0.4474 | TAccuracy: 0.7945 | VAccuracy: 0.8150
------------- EPOCH 110 | TLoss: 0.4402 | TAccuracy: 0.8085 | VAccuracy: 0.8189
------------- EPOCH 111 | TLoss: 0.4358 | TAccuracy: 0.8076 | VAccuracy: 0.8189
------------- EPOCH 112 | TLoss: 0.4387 | TAccuracy: 0.8032 | VAccuracy:

In [16]:
get_accuracy(model1, valid_dataloader)

0.8057742782152231

# Predictions

In [17]:
predictions_df = pd.DataFrame()

for ids, x, _ in test_dataloader:
    y_hat = torch.sigmoid(model1.forward(x)).round()
    r = torch.concat([ids[:, None], y_hat], dim=1)
    predictions_df = pd.concat([predictions_df, pd.DataFrame(r.detach().numpy(), columns=['id', 'target'])])

In [18]:
predictions_df.target = predictions_df.target.astype(int)
predictions_df.id = predictions_df.id.astype(int)

In [19]:
predictions_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
26,10861,1
27,10865,1
28,10868,1
29,10874,1


In [20]:
predictions_df.to_csv('predictions/LSTM.csv', index=False)