In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils as utils
import torchtext

from data import dataset
import tweet_utils as tu
import pandas as pd

In [2]:
! python3 data/_shuffle.py data/
train_dataset = dataset.DisasterData('train', path='data/', tweet_prep_fn=tu.process_tweet)
valid_dataset = dataset.DisasterData('valid', path='data/', tweet_prep_fn=tu.process_tweet)
test_dataset = dataset.DisasterData('test', path='data/', tweet_prep_fn=tu.process_tweet)
all_dataset = dataset.DisasterData('all', path='data/', tweet_prep_fn=tu.process_tweet)

data/train.csv successfully shuffled!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._data['text'] = self._data['text'].apply(tweet_prep_fn)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._data['text'] = self._data['text'].apply(tweet_prep_fn)


In [3]:
# load embeddings
EMBEDDING_N = 200
vec = torchtext.vocab.GloVe(name='twitter.27B', dim=EMBEDDING_N)
UNK_T = torch.randn(EMBEDDING_N)
EOS_T = torch.randn(EMBEDDING_N)
PAD_T = [0.] * EMBEDDING_N
vec.unk_init = lambda x: UNK_T

In [4]:
torch.equal(vec['thereisnotimeforfussingandfighting'], UNK_T)

True

In [5]:
next(iter(train_dataset))

((7381,
  'meek mill begging nicki minaj to let him obliterate ovofest nowplaying t.co t.co',
  'Toronto',
  'obliterate'),
 0)

In [6]:
def text_to_vector(text, pad_n=0, add_eos=True, prep_fn=tu.process_tweet):
    if prep_fn:
        tokens = prep_fn(text, rm_weblinks=True).split()
    else:
        tokens = text.split()

    res = [vec[token] for token in tokens]
    if add_eos:
        res += [EOS_T]
    if pad_n > 0:
        res += [PAD_T] * pad_n

    return [[float(v) for v in b] for b in res]

In [7]:
def collate(batch):
    ids = []
    X = []
    targets = []
    max_len = 0

    for ((id, text, location, keyword), target) in batch:
        X.append(text_to_vector(text, prep_fn=None))
        max_len = max(len(X[-1]), max_len)

        ids.append(id)
        targets.append([target])

    for v in X:
        for t in range(len(v), max_len):
            v.append(PAD_T)


    return torch.tensor(ids), torch.tensor(X), torch.tensor(targets, dtype=torch.float) if targets[0][0] is not None else None

In [8]:
train_dataloader = utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate)
all_dataloader = utils.data.DataLoader(all_dataset, batch_size=32, shuffle=True, collate_fn=collate)
valid_dataloader = utils.data.DataLoader(valid_dataset, batch_size=32, collate_fn=collate)
test_dataloader = utils.data.DataLoader(test_dataset, batch_size=32, collate_fn=collate)

In [9]:
class Model(nn.Module):

    def __init__(self, input_d, hidden_d, layers_n=1, bidirectional=False):
        super(Model, self).__init__()

        self.Lstm1 = nn.LSTM(input_size=input_d, hidden_size=hidden_d, num_layers=layers_n//2, batch_first=True, bidirectional=bidirectional)
        self.Lstm2 = nn.LSTM(input_size=hidden_d * (2 if bidirectional else 1), hidden_size=hidden_d, num_layers=layers_n - layers_n//2, batch_first=True, bidirectional=bidirectional)
        self.Fc = nn.Linear(hidden_d * (2 if bidirectional else 1), 1)
        self.Dropout1 = nn.Dropout(0.7)
        self.Dropout2 = nn.Dropout(0.5)
        self.Dropout3 = nn.Dropout(0.5)

    def forward(self, x):

        if len(x.shape) == 2:
            x = x[:,None,:]

        x = self.Dropout1(x)
        x, _ = self.Lstm1(x)

        x = self.Dropout2(x)
        x, _ = self.Lstm2(x)
        x = x[:,-1,:]     # taking the last output for each sequence

        x = self.Dropout3(x)
        x = self.Fc(x)
        return x

In [10]:
model1 = Model(EMBEDDING_N, 50, layers_n=4, bidirectional=True)

In [11]:
model1

Model(
  (Lstm1): LSTM(200, 50, num_layers=2, batch_first=True, bidirectional=True)
  (Lstm2): LSTM(100, 50, num_layers=2, batch_first=True, bidirectional=True)
  (Fc): Linear(in_features=100, out_features=1, bias=True)
  (Dropout1): Dropout(p=0.7, inplace=False)
  (Dropout2): Dropout(p=0.5, inplace=False)
  (Dropout3): Dropout(p=0.5, inplace=False)
)

In [12]:
n_param = 0
for param in model1.parameters():
    n_param += len(param)

print('Parameters number:', n_param)

Parameters number: 6402


In [13]:
optimizer = torch.optim.Adam(model1.parameters(), lr=1e-4)

In [14]:
def get_accuracy(model, dataloader):
    accurate = 0
    total = 0

    model.eval()

    with torch.no_grad():
        for id, x, y in dataloader:
            y_hat = torch.sigmoid(model.forward(x)).round()
            accurate += float((y_hat == y).sum())
            total += float(x.shape[0])

    return accurate / total

In [15]:
def train(epochs, model, optimizer, valid_dataloader, train_dataloader, loss_fn=nn.BCELoss(), display_epoch=0, backup_dir=None, threshold=.82):
    print("Initial validation accuracy:", get_accuracy(model, valid_dataloader))

    max_v_acc = 0.

    for i in range(epochs):

        j = 0
        epoch_loss = 0
        batches = len(train_dataloader)

        accurate = 0
        total = 0

        model.train()
        for b_i, (id, x, y) in enumerate(train_dataloader):
            y_hat = torch.sigmoid(model.forward(x))
            loss = loss_fn(y_hat, y)

            with torch.no_grad():
                accurate += float((y_hat.round() == y).sum())
                total += float(x.shape[0])

            j += 1
            epoch_loss += float(loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if b_i % (batches // 12) == 0:
                print('-', end='')

        cur_v_acc = get_accuracy(model, valid_dataloader)
        if backup_dir and cur_v_acc > max_v_acc and cur_v_acc > threshold:
            torch.save(model.state_dict(), backup_dir + "lstm%.4f.pt" % cur_v_acc)
            print(' *', end='')
        else:
            print(' -', end='')

        max_v_acc = max(max_v_acc, cur_v_acc)
        print(' EPOCH %03d' % int(i + 1 + display_epoch),
              '| TLoss: %.4f' % round(epoch_loss / j, 4),
              '| TAccuracy: %.4f' % round(accurate / total, 4),
              '| VAccuracy: %.4f' % cur_v_acc)

In [16]:
train(50, model1, optimizer, valid_dataloader, train_dataloader, display_epoch=0, backup_dir='lstm_backups/')

Initial validation accuracy: 0.4225721784776903
------------- - EPOCH 001 | TLoss: 0.6893 | TAccuracy: 0.5370 | VAccuracy: 0.5774
------------- - EPOCH 002 | TLoss: 0.6820 | TAccuracy: 0.5696 | VAccuracy: 0.5774
------------- - EPOCH 003 | TLoss: 0.6332 | TAccuracy: 0.6244 | VAccuracy: 0.7310
------------- - EPOCH 004 | TLoss: 0.5730 | TAccuracy: 0.7234 | VAccuracy: 0.7782
------------- - EPOCH 005 | TLoss: 0.5383 | TAccuracy: 0.7522 | VAccuracy: 0.8005
------------- - EPOCH 006 | TLoss: 0.5188 | TAccuracy: 0.7593 | VAccuracy: 0.7940
------------- - EPOCH 007 | TLoss: 0.5143 | TAccuracy: 0.7643 | VAccuracy: 0.8058
------------- - EPOCH 008 | TLoss: 0.5039 | TAccuracy: 0.7724 | VAccuracy: 0.8084
------------- - EPOCH 009 | TLoss: 0.5017 | TAccuracy: 0.7682 | VAccuracy: 0.8084
------------- - EPOCH 010 | TLoss: 0.4971 | TAccuracy: 0.7698 | VAccuracy: 0.8123
------------- - EPOCH 011 | TLoss: 0.4913 | TAccuracy: 0.7733 | VAccuracy: 0.8097
------------- - EPOCH 012 | TLoss: 0.4905 | TAccur

In [41]:
train(50, model1, optimizer, valid_dataloader, train_dataloader, display_epoch=100, backup_dir='lstm_backups/')

Initial validation accuracy: 0.8228346456692913
------------- * EPOCH 101 | TLoss: 0.4062 | TAccuracy: 0.8210 | VAccuracy: 0.8268
------------- - EPOCH 102 | TLoss: 0.4026 | TAccuracy: 0.8203 | VAccuracy: 0.8202
------------- - EPOCH 103 | TLoss: 0.4120 | TAccuracy: 0.8165 | VAccuracy: 0.8189
------------- - EPOCH 104 | TLoss: 0.4075 | TAccuracy: 0.8199 | VAccuracy: 0.8228
------------- - EPOCH 105 | TLoss: 0.4115 | TAccuracy: 0.8174 | VAccuracy: 0.8084
------------- - EPOCH 106 | TLoss: 0.4091 | TAccuracy: 0.8175 | VAccuracy: 0.8228
------------- - EPOCH 107 | TLoss: 0.4056 | TAccuracy: 0.8175 | VAccuracy: 0.8163
------------- - EPOCH 108 | TLoss: 0.4087 | TAccuracy: 0.8177 | VAccuracy: 0.8110
------------- - EPOCH 109 | TLoss: 0.4046 | TAccuracy: 0.8260 | VAccuracy: 0.8071
------------- - EPOCH 110 | TLoss: 0.4061 | TAccuracy: 0.8170 | VAccuracy: 0.7992
------------- - EPOCH 111 | TLoss: 0.4029 | TAccuracy: 0.8209 | VAccuracy: 0.8202
------------- - EPOCH 112 | TLoss: 0.3913 | TAccur

KeyboardInterrupt: 

In [17]:
get_accuracy(model1, valid_dataloader)

0.8241469816272966

# Predictions

In [43]:
torch.save(model1.state_dict(),"lstm_backups/backup.pt")

In [25]:
model1.load_state_dict(torch.load("lstm_backups/lstm0.8333.pt"))

<All keys matched successfully>

In [26]:
get_accuracy(model1, valid_dataloader)

0.8307086614173228

In [18]:
predictions_df = pd.DataFrame()

for ids, x, _ in test_dataloader:
    y_hat = torch.sigmoid(model1.forward(x)).round()
    r = torch.concat([ids[:, None], y_hat], dim=1)
    predictions_df = pd.concat([predictions_df, pd.DataFrame(r.detach().numpy(), columns=['id', 'target'])])

In [19]:
predictions_df.target = predictions_df.target.astype(int)
predictions_df.id = predictions_df.id.astype(int)

In [20]:
predictions_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
26,10861,1
27,10865,1
28,10868,1
29,10874,1


In [21]:
predictions_df.to_csv('predictions/LSTM.csv', index=False)

In [22]:
model1.load_state_dict(torch.load("lstm_backups/backup.pt"))

<All keys matched successfully>