In [3]:
import builtins
import torch
import torchtext
import collections
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Processing text

## Method 1: using counter and tokenizer

In [4]:
# Module-level vocabulary; stays None until load_dataset() rebinds it.
vocab = None
# Tokenizer from torchtext's 'basic_english' preset; read by load_dataset() and used as
# the default `tokenizer` argument of encode().
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

def load_dataset(ngrams=1,min_freq=1,train_path="train.csv",test_path="test.csv"):
    """Read the train/test CSV files and build a vocabulary from the training text.

    Args:
        ngrams: passed to ``torchtext.data.utils.ngrams_iterator`` when counting tokens.
        min_freq: passed to ``torchtext.vocab.Vocab`` as its ``min_freq`` argument.
        train_path: CSV file that must contain ``text`` and ``target`` columns.
        test_path: CSV file that must contain a ``text`` column.

    Returns:
        ``(x_train, x_test, classes, vocab)`` where ``x_train`` is a list of
        ``(target + 1, text)`` tuples, ``x_test`` is the ``text`` column of the test
        file, ``classes`` is the sorted list of distinct ``target + 1`` values and
        ``vocab`` is the vocabulary built from the training text.

    Side effects:
        Prints two progress messages and rebinds the module-level ``vocab``.
    """
    global vocab, tokenizer
    print("Loading dataset...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    y_train = train_df['target']
    train_text = train_df['text']
    x_test = test_df['text']
    # Sorted so the class list does not depend on set iteration order.
    classes = sorted(label+1 for label in set(y_train))
    print('Building vocab...')
    # Count every token (and n-gram, when ngrams > 1) of the training text.
    counter = collections.Counter()
    for line in train_text:
        counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams))
    # NOTE(review): the Counter-based Vocab constructor (and the `.stoi` attribute used by
    # encode) look like the legacy torchtext API — confirm the pinned torchtext version.
    vocab = torchtext.vocab.Vocab(counter, min_freq=min_freq)
    # Pair the columns positionally instead of looking rows up by index label, so the
    # result does not depend on the frame having a default 0..n-1 index.
    x_train = [(label+1, text) for label, text in zip(y_train, train_text)]
    return x_train,x_test,classes,vocab

# Load the data once. load_dataset() also rebinds the module-level `vocab`, which is
# assigned again here from its return value.
train_dataset, test_dataset, classes, vocab = load_dataset()
# Show the first (label, text) training example.
print(next(iter(train_dataset)))

def encode(x,voc=None,unk=0,tokenizer=tokenizer):
    """Turn a text into the list of vocabulary indices of its tokens.

    Args:
        x: text handed to ``tokenizer``.
        voc: object exposing a ``stoi`` mapping; when None the module-level
            ``vocab`` is used (looked up at call time).
        unk: index returned for tokens missing from ``stoi``.
        tokenizer: callable splitting ``x`` into tokens; the default is the
            module-level tokenizer as it was bound when this function was defined.

    Returns:
        A list with one index per token, in token order.
    """
    active_vocab = voc if voc is not None else vocab
    indices = []
    for token in tokenizer(x):
        indices.append(active_vocab.stoi.get(token, unk))
    return indices

# Encode the text of the first training example with the default vocabulary.
print(encode(train_dataset[0][1]))
# encode maps each token of the text to its index in the vocabulary


Loading dataset...
Building vocab...
(2, 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all')
[121, 5501, 31, 5, 867, 11, 27, 875, 138, 2933, 5734, 103, 50]


In [5]:
# Number of entries in the vocabulary returned by load_dataset() above.
len(vocab)

23547

In [6]:
# Bigram
# NOTE(review): this import sits mid-notebook; load_dataset() reaches the same function
# as torchtext.data.utils.ngrams_iterator.
from torchtext.data.utils import ngrams_iterator

# Count n-grams (ngrams=2) over every training text; the label is not used here.
# NOTE(review): per the torchtext docs ngrams_iterator yields the original tokens as well
# as their n-grams, so this vocabulary presumably holds unigrams and bigrams — confirm.
bi_counter = collections.Counter()
for (label, line) in train_dataset:
    bi_counter.update(ngrams_iterator(tokenizer(line),ngrams=2))
bi_vocab = torchtext.vocab.Vocab(bi_counter, min_freq=1)

print(f"Bigram vocab size = {len(bi_vocab)}")

Bigram vocab size = 96729


## Method 2: BagOfWords

In [7]:
vocab_size = len(vocab)
def to_bow(text,bow_vocab_size=vocab_size):
    """Build a bag-of-words count vector for ``text``.

    Args:
        text: text passed to ``encode`` (default vocabulary and tokenizer).
        bow_vocab_size: length of the returned vector; the default is the value
            ``vocab_size`` had when this function was defined.

    Returns:
        A float32 tensor of length ``bow_vocab_size`` whose entry ``i`` is the number
        of times index ``i`` occurs in ``encode(text)``. Indices that are not below
        ``bow_vocab_size`` are ignored.
    """
    bow = torch.zeros(bow_vocab_size, dtype=torch.float32)
    in_range = (idx for idx in encode(text) if idx < bow_vocab_size)
    for idx in in_range:
        bow[idx] += 1
    return bow

# Show the first training text, its bag-of-words vector, and check that the vector
# length equals vocab_size.
print(f"sample text:\n{train_dataset[0][1]}")
print(f"\nBoW vector:\n{to_bow(train_dataset[0][1])}")
print([len(to_bow(train_dataset[0][1])), vocab_size])

sample text:
Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all

BoW vector:
tensor([0., 0., 0.,  ..., 0., 0., 0.])
[23547, 23547]


In [8]:
# Bag-of-words weighted by TF-IDF (term frequency - inverse document frequency), which
# down-weights very frequent words such as stop words.
# Document frequencies are estimated from at most the first 1000 training examples.
# N is capped at the dataset size so it always equals the number of documents counted
# by the loop below (the slice is shorter than 1000 on a small dataset).
N = min(1000, len(train_dataset))
# df[i] = number of the first N documents whose encoding contains index i.
df = torch.zeros(vocab_size)
for _,line in train_dataset[:N]:
    # set(...) so a token repeated inside one document is counted once.
    for i in set(encode(line)):
        df[i] += 1
def tf_idf(s):
    """Return the bag-of-words vector of ``s`` scaled by inverse document frequency.

    Each count from ``to_bow(s)`` is multiplied by ``log((N+1)/(df+1))``, where ``N``
    and ``df`` are the module-level document count and document-frequency tensor.
    """
    counts = to_bow(s)
    idf = torch.log((N+1)/(df+1))
    return counts*idf

# TF-IDF vector of the first training text.
print(tf_idf(train_dataset[0][1]))

tensor([0., 0., 0.,  ..., 0., 0., 0.])


## Use Embedding = 'map words into vectors'(not phrases)

First, convert each sentence into a list of words and pad the lists so that they all have the same length.

In [9]:

def padify(b):
    """Collate a minibatch of ``(label, text)`` tuples into two tensors.

    Args:
        b: list of tuples of length batch_size; the first element of each tuple is
            the label, the second is the text sequence.

    Returns:
        ``(labels, features)``: ``labels`` is a LongTensor holding ``label - 1`` for
        every sample; ``features`` is an int64 tensor of shape
        ``(len(b), longest encoded length)`` holding the ``encode``d texts,
        right-padded with 0. An empty batch gives an empty label tensor and a
        ``(0, 0)`` feature tensor.
    """
    # build vectorized sequences
    encoded = [encode(sample[1]) for sample in b]
    # max length of a sequence in this minibatch (0 for an empty batch)
    longest = max(map(len, encoded), default=0)
    labels = torch.LongTensor([sample[0]-1 for sample in b])
    # Pre-filled with the padding value 0 (which is also encode's default `unk` index).
    features = torch.zeros((len(encoded), longest), dtype=torch.long)
    for row, seq in enumerate(encoded):
        # Explicit dtype: torch.tensor([]) would otherwise be float32, so a text with
        # no tokens would spoil the integer dtype of the whole batch.
        features[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return labels, features
    
# Texts of the first two training examples.
first_sentence = train_dataset[0][1]
second_sentence = train_dataset[1][1]

# Their encoded (token index) forms.
f_tokens = encode(first_sentence)
s_tokens = encode(second_sentence)

# NOTE: the "Length" values printed here are character counts of the raw strings,
# not token counts (those are len(f_tokens) and len(s_tokens)).
print(f'First Sentence in dataset:\n{first_sentence}')
print("Length:", len(train_dataset[0][1]))
print(f'\nSecond Sentence in dataset:\n{second_sentence}')
print("Length: ", len(train_dataset[1][1]))

First Sentence in dataset:
Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Length: 69

Second Sentence in dataset:
Forest fire near La Ronge Sask. Canada
Length:  38


In [10]:
# Collate the whole training set as one batch, so every row is padded to the length
# of the longest encoded text in the dataset.
labels, features = padify(train_dataset)  
print(f'features: {features}')

# Token counts of the first two texts, for comparison with the padded width.
print(f'\nlength of first sentence: {len(f_tokens)}')
print(f'length of second sentence: {len(s_tokens)}')
print(f'size of features: {features.size()}')

features: tensor([[ 121, 5501,   31,  ...,    0,    0,    0],
        [ 199,   57,  235,  ...,    0,    0,    0],
        [  50, 1834, 1594,  ...,    0,    0,    0],
        ...,
        [2642,    2, 2125,  ...,    0,    0,    0],
        [  85, 1086,   52,  ...,    0,    0,    0],
        [   5,  215,   67,  ...,    0,    0,    0]])

length of first sentence: 13
length of second sentence: 8
size of features: torch.Size([7613, 74])


Pretrained embedding models:

In [12]:
# import gensim.downloader as api
# w2v = api.load('word2vec-google-news-300')
# for w,p in w2v.most_similar('dog'):
#     print(f"{w} -> {p}")
# w2v.word_vec('play')[:20]
# w2v.most_similar(positive=['king','woman'],negative=['man'])[0]

In [13]:
# class EmbedClassifier(torch.nn.Module):
#     def __init__(self, vocab_size, embed_dim, num_class):
#         super().__init__()
#         self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim)
#         self.fc = torch.nn.Linear(embed_dim, num_class)

#     def forward(self, text, off):
#         x = self.embedding(text, off)
#         return self.fc(x)
# #Using Pre-Trained Embeddings in PyTorch
# embed_size = len(w2v.get_vector('hello'))
# print(f'Embedding size: {embed_size}')

# net = EmbedClassifier(vocab_size,embed_size,len(classes))

# print('Populating matrix, this will take some time...',end='')
# found, not_found = 0,0
# for i,w in enumerate(vocab.itos):
#     try:
#         net.embedding.weight[i].data = torch.tensor(w2v.get_vector(w))
#         found+=1
#     except:
#         net.embedding.weight[i].data = torch.normal(0.0,1.0,(embed_size,))
#         not_found+=1

# print(f"Done, found {found} words, {not_found} words missing")
# net = net.to(device)

# RNN

In [14]:
# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the data; this repeats the earlier load_dataset() call and rebinds the same names.
train_dataset, test_dataset, classes, vocab = load_dataset()

Loading dataset...
Building vocab...


In [25]:
def encode(x,voc=None,unk=0,tokenizer=tokenizer):
    """Map every token of ``x`` to its index in a vocabulary.

    NOTE(review): an identical ``encode`` is defined in an earlier cell; this
    definition shadows it.

    Args:
        x: text handed to ``tokenizer``.
        voc: object exposing a ``stoi`` mapping; None selects the module-level ``vocab``.
        unk: index used for tokens that ``stoi`` does not contain.
        tokenizer: callable producing the tokens of ``x``.

    Returns:
        List of indices, one per token.
    """
    chosen = voc if voc is not None else vocab
    return list(map(lambda token: chosen.stoi.get(token, unk), tokenizer(x)))

# Encode the text of the first training example with the default vocabulary.
print(encode(train_dataset[0][1]))
# encode maps each token of the text to its index in the vocabulary
    
def padify(b,voc=None,tokenizer=tokenizer):
    """Collate a minibatch of ``(label, text)`` tuples into two tensors.

    Args:
        b: list of tuples of length batch_size; the first element of each tuple is
            the label, the second is the text sequence.
        voc: forwarded to ``encode`` as its ``voc`` argument.
        tokenizer: forwarded to ``encode`` as its ``tokenizer`` argument.

    Returns:
        ``(labels, features)``: ``labels`` is a LongTensor holding ``label - 1`` for
        every sample; ``features`` is an int64 tensor of shape
        ``(len(b), longest encoded length)`` holding the encoded texts, right-padded
        with 0. An empty batch gives an empty label tensor and a ``(0, 0)`` feature
        tensor.
    """
    # build vectorized sequences
    encoded = [encode(sample[1],voc=voc,tokenizer=tokenizer) for sample in b]
    # max length of a sequence in this minibatch (0 for an empty batch)
    longest = max(map(len,encoded), default=0)
    labels = torch.LongTensor([sample[0]-1 for sample in b])
    # Pre-filled with the padding value 0 (which is also encode's default `unk` index).
    features = torch.zeros((len(encoded), longest), dtype=torch.long)
    for row, seq in enumerate(encoded):
        # Explicit dtype: torch.tensor([]) would otherwise be float32, so a text with
        # no tokens would spoil the integer dtype of the whole batch.
        features[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return labels, features

# Collate the first four examples: rows are padded with 0 to the longest of the four.
print(padify(train_dataset[:4]))

[121, 5501, 31, 5, 867, 11, 27, 875, 138, 2933, 5734, 103, 50]
(tensor([1, 1, 1, 1]), tensor([[  121,  5501,    31,     5,   867,    11,    27,   875,   138,  2933,
          5734,   103,    50,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  199,    57,   235,   856, 21485, 21621,     2,  1376,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [   50,  1834,  1594,    10,     6,  2332,     9,   675,     6,    31,
           134, 20367,    26,  1818,     2,    53,   401,   261,    69,  2332,
             9,   675,  1421,    31,  1081],
        [  837,  4809,  2888,    70,  6451,  2424,   261,  1421,     9,   100,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]]))


In [26]:
# Shuffled batches of 16 examples; padify pads each batch to its own longest sequence.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=padify, shuffle=True)

In [27]:
def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
    """Train ``net`` on ``dataloader`` and return ``(loss, accuracy)``.

    Args:
        net: model called as ``net(features)``; the caller is responsible for placing
            it on the module-level ``device``.
        dataloader: iterable yielding ``(labels, features)`` tensor pairs.
        lr: learning rate for the Adam optimizer created when ``optimizer`` is None.
        optimizer: optimizer to use; None creates ``torch.optim.Adam`` over ``net``.
        loss_fn: callable ``loss_fn(out, labels)``; moved to ``device`` when it is a
            ``torch.nn.Module``.
        epoch_size: number of passes over ``dataloader``; None means a single pass.
        report_freq: print the running accuracy every this many batches; 0 or None
            disables the report.

    Returns:
        ``(total_loss / count, correct / count)`` as Python floats, where
        ``total_loss`` is the sum of the per-batch loss values and ``count`` is the
        number of samples seen. ``(0.0, 0.0)`` when no sample was seen.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(),lr=lr)
    if isinstance(loss_fn, torch.nn.Module):
        loss_fn = loss_fn.to(device)
    # range(None) raises TypeError, so the default must be mapped to one pass.
    n_epochs = 1 if epoch_size is None else epoch_size
    net.train()
    total_loss,correct,count,step = 0.0,0,0,0
    for _ in range(n_epochs):
        for labels,features in dataloader:
            optimizer.zero_grad()
            features, labels = features.to(device), labels.to(device)
            out = net(features)
            loss = loss_fn(out,labels)
            loss.backward()
            optimizer.step()
            # Accumulate plain numbers: adding the loss tensor itself would keep
            # every batch's autograd history referenced for the whole run.
            total_loss += loss.item()
            _,predicted = torch.max(out,1)
            correct += (predicted==labels).sum().item()
            count += len(labels)
            step += 1
            if report_freq and step%report_freq==0:
                print(f"{count}: acc={correct/count}")
    if count == 0:
        return 0.0, 0.0
    return total_loss/count, correct/count

In [28]:
class RNNClassifier(torch.nn.Module):
    """Text classifier: embedding -> RNN -> mean of the RNN outputs -> linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: size of each embedding vector (the RNN input size).
        hidden_dim: RNN hidden size (the input size of the final linear layer).
        num_class: number of output scores per sample.
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn = torch.nn.RNN(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(in_features=hidden_dim, out_features=num_class)

    def forward(self, x):
        """Return class scores for ``x``, a batch of index sequences.

        The RNN outputs are averaged along dim 1 (the sequence axis, since the RNN is
        built with ``batch_first=True``); the final hidden state is not used.
        """
        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded)
        pooled = outputs.mean(dim=1)
        return self.fc(pooled)

In [29]:
vocab_size = len(vocab)
# Positional arguments: embed_dim=64, hidden_dim=32; one output per entry of `classes`.
net = RNNClassifier(vocab_size,64,32,len(classes)).to(device)
# epoch_size is used by train_epoch as the number of passes over train_loader.
train_epoch(net,train_loader, lr=0.001, epoch_size=2)

3200: acc=0.59875
6400: acc=0.64453125
9597: acc=0.6804209648848598
12797: acc=0.7034461201844182


(0.035128498365511705, 0.7143044791803494)

# LSTM

In [30]:
class LSTMClassifier(torch.nn.Module):
    """Text classifier: embedding -> LSTM -> last hidden state -> linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: LSTM hidden size (the input size of the final linear layer).
        num_class: number of output scores per sample.
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Replace the default embedding initialisation with standard-normal noise
        # shifted by -0.5.
        shifted_noise = torch.randn_like(self.embedding.weight.data) - 0.5
        self.embedding.weight.data = shifted_noise
        self.rnn = torch.nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(in_features=hidden_dim, out_features=num_class)

    def forward(self, x):
        """Return class scores for ``x``, a batch of index sequences.

        Only the last entry of the LSTM's final hidden-state tensor is fed to the
        linear layer; the per-step outputs and the cell state are discarded.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.rnn(embedded)
        return self.fc(hidden[-1])

In [32]:
# Same sizes as the RNN run: embed_dim=64, hidden_dim=32; one output per entry of `classes`.
net = LSTMClassifier(vocab_size,64,32,len(classes)).to(device)
# epoch_size is used by train_epoch as the number of passes over train_loader.
train_epoch(net,train_loader, lr=0.001, epoch_size=2)

3200: acc=0.5671875
6400: acc=0.590625
9597: acc=0.6270709596748985
12797: acc=0.6549972649839806


(0.03754812581583311, 0.6741100748719296)