In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim

import os

import numpy as np
import re

import torchtext

from torchtext import data
from torchtext.data import Iterator, BucketIterator

In [2]:
TEXT = data.Field(batch_first=True)
TARGETS = data.Field(sequential=False, tensor_type=torch.DoubleTensor, batch_first=True, use_vocab=False, postprocessing=data.Pipeline(lambda x: float(x)))
fields = [('targets', TARGETS), ('text', TEXT)]

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.class_size = 10
        self.conv_layer_1 =  nn.Conv2d(1, 14, kernel_size=11, stride=2, padding=1, dilation=1)
        #self.pool = nn.MaxPool2d(2, 2)
        # self.conv_layer_2 = nn.Conv2d(16, 10, (3, 3), 3, 3)
        self.conv_layer_2 = nn.Conv2d(14, 10, kernel_size=11, stride=2, padding=1, dilation=1)
        self.conv_layer_3 = nn.Conv2d(10, 10, kernel_size=1)
        
    def forward(self, input_image):
        result_1 = F.relu(self.conv_layer_1(input_image))
        #print(f'{result_1.shape}: {result_1}')
        result_2 = F.relu(self.conv_layer_2(result_1))
        result_3 = F.relu(self.conv_layer_3(result_2))
        #return F.softmax(result_3, dim=1).squeeze(2).squeeze(2)
        return F.log_softmax(result_3, dim=1).squeeze(2).squeeze(2)
    
    


In [3]:
device = 0

model = Model().to(device)


In [7]:
data_pos = 'txt_sentoken/pos'
data_neg = 'txt_sentoken/neg'


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def read_data(data_path, label):
    data = []
    labels = []
    for file in os.listdir(data_path):
        text = clean_str(open(os.path.join(data_path, file), 'r').read())
        label
        
    labels = label * len(data)
    return data, labels

data_pos, labels_pos = read_data(data_pos, [1, 0])
data_neg, labels_neg = read_data(data_neg, [0, 1])

In [7]:
data_pos

['films adapted from comic books have had plenty of success , whether they\'re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there\'s never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \'80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \nthe book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \nin other words , don\'t dismiss this film because of its source . \nif you can get past the whole comic book thing , you might find another stumbling block in from hell\'s directors , albert and allen hughes . \ngetting the hughes brothers to direct this see

In [9]:
import numpy as np
import re




In [26]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data[0].to(device), target.to(device)
        #print(f'CCC {type(data)}')
        optimizer.zero_grad()
        output = model(data)
        # print(f'CICCIO {output.shape}, {target.shape}')
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


In [5]:
path = torchtext.datasets.IMDB.download('.data')

inputs = data.Field(lower=True, include_lengths=True, batch_first=True)
labels = data.Field(sequential=False)

train_data = torchtext.datasets.IMDB(os.path.join(path, 'train'), inputs, labels)

In [61]:
train_set, test_set = train_data.iters()

In [6]:
device = 0
model = Model().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.003, momentum=0.9, nesterov=True)
train(model, device, train_data.examples, optimizer, 1)

TypeError: cannot unpack non-iterable Example object

In [7]:

TEXT.build_vocab(train_data)
train_iter = data.Iterator(train_data.examples, batch_size=1, sort_key=lambda x: len(x.text), shuffle=True)
texts = list(map(lambda l: l.text, train_data.examples))
TEXT.build_vocab(texts)

In [88]:
list(texts)

[['a',
  'woman',
  'who',
  'hates',
  'cats',
  '(alice',
  'krige)',
  'and',
  'her',
  'son',
  '(brian',
  'krause)',
  'have',
  'moved',
  'into',
  'a',
  'small',
  'town,',
  'and',
  'must',
  'deal',
  'with',
  'a',
  'mean',
  'teacher',
  '(glenn',
  'shadix),',
  'their',
  'incestuous',
  'relationship,',
  'a',
  'lovely',
  'girl',
  '(mädchen',
  'amick)',
  'and',
  'one',
  'hell',
  'of',
  'a',
  'big',
  'secret.<br',
  '/><br',
  '/>okay,',
  'so',
  'technically,',
  'this',
  'is',
  'a',
  '"bad',
  'film".',
  'but,',
  'who',
  'cares?',
  "it's",
  'so',
  'very',
  'fun!',
  '<br',
  '/><br',
  '/>impossible',
  'things',
  '(involving',
  'corn)',
  'happen,',
  'people',
  'freak',
  'out',
  'about',
  'kitty',
  'cats,',
  "there's",
  'bad',
  'one-liners,',
  "there's",
  'too',
  'much',
  'cheese',
  'to',
  'handle!<br',
  '/><br',
  '/>so,',
  'yes.',
  'you',
  'will',
  'enjoy',
  'this.',
  'a',
  'lot.',
  'it',
  "won't",
  'move',
  'yo

In [8]:
train_set, test_set = torchtext.datasets.IMDB.splits(TEXT, TARGETS)

In [53]:

train_iter = Iterator(train_set, batch_size=8, device=device, sort=False, sort_within_batch=False, repeat=False)

In [17]:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

trainset, testset = torchtext.datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary
TEXT.build_vocab(trainset ) #get rid of vectors=GloVe(name='6B', dim=300)
LABEL.build_vocab(trainset)

batch_size = 8
train_iter, test_iter = data.BucketIterator.splits(
    (trainset, testset), batch_sizes=(batch_size,batch_size),
    shuffle=False,device=device)

print("train iter shuffle",train_iter.shuffle)
print("test iter shuffle",test_iter.shuffle)

train iter shuffle False
test iter shuffle False


In [9]:
import torch
import torch.nn as nn


class LSTMClassifier(nn.Module):

    def __init__(self, vocab_size, label_size, emb_dim, hidden_size,
                 bidirectional=True, num_layers=1, input_dropout=0.1,
                 dropout=0.0, pad_id=0, eos_id=2, use_cuda=True):
        super(LSTMClassifier, self).__init__()
        self.pad_id = pad_id
        self.eos_id = eos_id
        self.use_cuda = use_cuda
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        self.input_dropout = nn.Dropout(input_dropout)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hidden_size,
                           bidirectional=bidirectional,
                           batch_first=True,
                           num_layers=num_layers,
                           dropout=dropout)
        num_directions = 2 if bidirectional else 1
        self.to_label_size = nn.Linear(num_directions*hidden_size, label_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input, is_training=True):
        """
        :param input:       torch.LongTensor; [batch_size, max_len]
        :param is_training: boolean
        :return:
            probs:   torch.FloatTensor; [batch_size, label_size]
        """
        batch_size = input.size(0)
        max_len = input.size(1)
        lengths = []
        for i in range(batch_size):
            length = 0
            while input.data[i][length].cpu().numpy() != self.eos_id:
                length += 1
                if length == max_len - 1:
                    break
            lengths.append(length)

        input_emb = self.embedding(input)
        if is_training:
            input_emb = self.input_dropout(input_emb)
        output, (_, _) = self.rnn(input_emb)  # [batch_size, max_len, hidden_size*num_directions]
        state_array = []
        for i in range(batch_size):
            state_array.append(output[i, lengths[i]].unsqueeze(0))
        states = torch.cat(state_array, dim=0)  # [batch_size, hidden_size*num_directions]
        probs = self.softmax(self.to_label_size(states))
        return probs

class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)

    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
          feature = layer(feature)
          preds = self.predictor(feature)
        return preds

    
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

    
class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)
            
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

em_sz = 100
nh = 500
nl = 3
model2 = SimpleLSTMBaseline(nh, emb_dim=em_sz)
model3 = LSTMClassifier(vocab_size=8, emb_dim=250, label_size=2, hidden_size=1).to(0)
model4 = RNN(150, 128, 2).to(0)

In [18]:
train_batch_it = BatchGenerator(train_iter, 'text', 'label')

In [12]:
train_batch_it

<__main__.BatchGenerator at 0x7f55b6301898>

In [28]:
train(model2, 0, train_batch_it, optimizer, 1)

RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1535493744281/work/aten/src/THC/generic/THCTensorCopy.cpp:20

In [21]:
l = next(iter(train_batch_it))

In [25]:
l[0]

(tensor([[   3,  282,   36,  ...,    1,    1,    1],
         [   9,   82,  200,  ...,    1,    1,    1],
         [  46,    2, 3917,  ...,    1,    1,    1],
         ...,
         [  36, 2416, 5563,  ..., 1707,   68, 1626],
         [  39,  866,   48,  ...,    1,    1,    1],
         [  17,    3,  301,  ...,    1,    1,    1]], device='cuda:0'),
 tensor([111, 219, 113, 160, 114, 250, 199, 149], device='cuda:0'))