<a href="https://colab.research.google.com/github/nuvard/itorch/blob/master/ildar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install --upgrade git+https://github.com/pytorch/text

Collecting git+https://github.com/pytorch/text
  Cloning https://github.com/pytorch/text to /tmp/pip-req-build-o67lbp4x
Building wheels for collected packages: torchtext
  Building wheel for torchtext (setup.py) ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-domtxu5s/wheels/73/14/71/ed033fd999ae4933e17df3e91be2014e61c2f312a88a164ff5
Successfully built torchtext
Installing collected packages: torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed torchtext-0.4.0


In [0]:
import pandas as pd
import numpy as np
import torch

In [0]:
from torchtext.data import Field
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
 
LABEL = Field(sequential=False, use_vocab=False)

In [0]:
VAL_RATIO = 0.2
def prepare_csv(seed=17):
    df_train = pd.read_csv("df.csv")
    idx = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    df_train.iloc[idx[val_size:], :].to_csv(
        "df_train.csv", index=False)
    df_train.iloc[idx[:val_size], :].to_csv(
        "df_val.csv", index=False)

In [0]:
prepare_csv()

In [0]:
from torchtext.data import TabularDataset
 
tv_datafields = [
                 ("text", TEXT), # we won't be needing the id, so we pass in None as the field
                 ("label", LABEL),
                 ]
trn, vld = TabularDataset.splits(
               path="itorch", # the root directory where the data lies
               train='df_train.csv', validation="df_val.csv",
               format='csv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)


In [172]:
trn.head

<generator object Dataset.__getattr__ at 0x7f9101d33200>

In [173]:
trn[0]

<torchtext.data.example.Example at 0x7f90efcbbef0>

In [174]:
trn[0].__dict__.keys()

dict_keys(['text', 'label'])

In [175]:
trn[0].text[:3]

['ali', 'bongo', 'sworn']

In [0]:
TEXT.build_vocab(trn)

In [177]:
from torchtext.data import Iterator, BucketIterator
 
train_iter, val_iter = BucketIterator.splits(
 (trn, vld), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(64, 64),
 device=-1, # if you want to use the GPU, specify the GPU number here
 sort_key=lambda x: len(x.text), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [0]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)
 

In [0]:
train_dl = BatchWrapper(train_iter, "text", ["label"])
valid_dl = BatchWrapper(val_iter, "text", ["label"])


In [180]:
next(train_dl.__iter__())

(tensor([[   4,    8,   21,  ...,   73,   37,   38],
         [4708, 2358,  133,  ..., 5034,   47,   47],
         [   5, 8555,  293,  ..., 7973,  325, 6068],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]), tensor([[0.],
         [0.],
         [1.],
         [0.],
         [0.],
         [1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [1.],
         [1.],
         [0.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [1.],
         [0.],
         [0.],
         [0.],
         [1.],
         [1.],
         [0.],
         [1.],
         [1.],
         [0.],
         [1.],
         [0.],
         [0.],
         [0.],
         [1.],
         [0.],
         [0.],
         [1.],
         [1.],
       

In [0]:
class SimpleBiLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 1)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [182]:

em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz); model

  "num_layers={}".format(dropout, num_layers))


SimpleBiLSTMBaseline(
  (embedding): Embedding(26985, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=500, out_features=1, bias=True)
)

In [0]:
import tqdm

In [0]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()

In [0]:
epochs = 2

In [190]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()

        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.item() * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item() * x.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))




  0%|          | 0/400 [00:00<?, ?it/s][A[A[A


  0%|          | 1/400 [00:00<02:20,  2.83it/s][A[A[A


  0%|          | 2/400 [00:00<02:17,  2.89it/s][A[A[A


  1%|          | 3/400 [00:00<02:06,  3.14it/s][A[A[A


  1%|          | 4/400 [00:01<02:04,  3.18it/s][A[A[A


  1%|▏         | 5/400 [00:01<02:01,  3.24it/s][A[A[A


  2%|▏         | 6/400 [00:01<02:05,  3.13it/s][A[A[A


  2%|▏         | 7/400 [00:02<02:03,  3.19it/s][A[A[A


  2%|▏         | 8/400 [00:02<02:05,  3.12it/s][A[A[A


  2%|▏         | 9/400 [00:02<02:05,  3.13it/s][A[A[A


  2%|▎         | 10/400 [00:03<01:58,  3.29it/s][A[A[A


  3%|▎         | 11/400 [00:03<01:56,  3.34it/s][A[A[A


  3%|▎         | 12/400 [00:03<01:50,  3.50it/s][A[A[A


  3%|▎         | 13/400 [00:03<01:46,  3.62it/s][A[A[A


  4%|▎         | 14/400 [00:04<01:43,  3.72it/s][A[A[A


  4%|▍         | 15/400 [00:04<01:42,  3.76it/s][A[A[A


  4%|▍         | 16/400 [00:04<01:40,  3.83it/s][A[A

Epoch: 1, Training Loss: 0.0700, Validation Loss: 0.0379





  0%|          | 1/400 [00:00<02:20,  2.83it/s][A[A[A


  0%|          | 2/400 [00:00<02:08,  3.11it/s][A[A[A


  1%|          | 3/400 [00:00<01:57,  3.39it/s][A[A[A


  1%|          | 4/400 [00:01<01:49,  3.63it/s][A[A[A


  1%|▏         | 5/400 [00:01<01:45,  3.74it/s][A[A[A


  2%|▏         | 6/400 [00:01<01:42,  3.86it/s][A[A[A


  2%|▏         | 7/400 [00:01<01:46,  3.69it/s][A[A[A


  2%|▏         | 8/400 [00:02<01:50,  3.55it/s][A[A[A


  2%|▏         | 9/400 [00:02<02:00,  3.24it/s][A[A[A


  2%|▎         | 10/400 [00:02<01:56,  3.36it/s][A[A[A


  3%|▎         | 11/400 [00:03<01:51,  3.49it/s][A[A[A


  3%|▎         | 12/400 [00:03<01:49,  3.55it/s][A[A[A


  3%|▎         | 13/400 [00:03<01:45,  3.65it/s][A[A[A


  4%|▎         | 14/400 [00:03<01:41,  3.81it/s][A[A[A


  4%|▍         | 15/400 [00:04<01:39,  3.86it/s][A[A[A


  4%|▍         | 16/400 [00:04<01:44,  3.67it/s][A[A[A


  4%|▍         | 17/400 [00:04<01:49,  3.49it/

Epoch: 2, Training Loss: 0.0194, Validation Loss: 0.0227
CPU times: user 3min 35s, sys: 4.06 s, total: 3min 39s
Wall time: 3min 43s
