In [1]:
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

## Load the Dataset

In [2]:
df = pd.read_csv('1. Text Classification/corpus/TweetSentiment.csv')
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.sample(2)

Unnamed: 0,index,text,cleaned_text,sentiment,label
17976,18483,That room will be an oven too,that room will oven too,neutral,0.0
17749,18246,doin fine relaxin at work... Hardly working,doin fine relaxin work hardly working,positive,2.0


## Build Vocabulary

In [3]:
class Vocabulary: 
    def __init__(self, freq_threshold=10, max_size=10000):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be included in vocabulary
        max_size : max vocab size
        '''
        self.freq_threshold = freq_threshold
        self.max_size = max_size

        self.itos = {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        self.stoi = {k:j for j, k in self.itos.items()} 
          
    
    def __len__(self):
        return len(self.itos)
    

    @staticmethod
    def tokenizer(text):
        return [tok.lower().strip() for tok in text.split(' ')]
    
    
    def build_vocabulary(self, sentence_list):
        '''
        build the vocabulary: create a dictionary mapping of index to string (itos) and string to index (stoi)
        (itos) -> {5:'the', 6:'a', 7:'an'} | (stoi) -> {'the':5, 'a':6, 'an':7}
        '''
        frequencies = {} 
        idx = 4  # because 4 tokens already added -> (itos) -> {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        
        # calculate the freq of words
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                if word not in frequencies.keys():
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                    
        # limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v > self.freq_threshold} 
        
        # limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx = 4 for pad, start, end , unk
            
        # create vocab
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

 
    def numericalize(self, text):
        '''
        convert the list of words to a list of corresponding indexes
        eg. cat and a dog -> [4, 5, 6, 3]
        '''   
        tokenized_text = self.tokenizer(text)  # tokenize text 
        numericalized_text = []

        for token in tokenized_text:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: # out-of-vocab (OOV) words are represented by UNK token index
                numericalized_text.append(self.stoi['<UNK>'])
                
        return numericalized_text

In [4]:
# create a vocab class with freq_threshold=1 and max_size=100
VOCAB = Vocabulary(1, 100)

sentence_list = [
    'that is a cat', 'that is not a dog', 'a cat and a dog'
]

# build vocab
VOCAB.build_vocabulary(sentence_list)

print('index to string: ', VOCAB.itos)
print('string to index:', VOCAB.stoi)

print(f'\nnumericalize -> cat and a dog: {VOCAB.numericalize(sentence_list[0])}')
print(f"numericalize -> Hello world: {VOCAB.numericalize('Hello world')}")

index to string:  {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>', 4: 'a', 5: 'that', 6: 'is', 7: 'cat', 8: 'dog'}
string to index: {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3, 'a': 4, 'that': 5, 'is': 6, 'cat': 7, 'dog': 8}

numericalize -> cat and a dog: [5, 6, 4, 7]
numericalize -> Hello world: [3, 3]


## Train Dataset

In [5]:
class TrainDataset(Dataset):

    def __init__(self, df, text_column, label_column, freq_threshold=5, vocab_size=10000):
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # build vocabulary
        self.vocab = Vocabulary(freq_threshold, vocab_size)
        self.vocab.build_vocabulary(self.texts.tolist())


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [int(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label) 

In [6]:
train_dataset = TrainDataset(
    df = df, text_column = 'cleaned_text', label_column = 'label', 
    freq_threshold = 5, vocab_size = 10000
)

print(f'{df.loc[1]}\n')

text, target = train_dataset[1]
print(text)
print(target)

index                                                        1
text             Sooo SAD I will miss you here in San Diego!!!
cleaned_text             sooo sad will miss you here san diego
sentiment                                             negative
label                                                      1.0
Name: 1, dtype: object

tensor([   1,  376,   86,   42,   66,    6,   64, 1328, 2006,    2])
tensor([1])


## Test Dataset

In [7]:
class TestDataset(Dataset):

    def __init__(self, train_dataset, df, text_column, label_column):
        self.train_dataset = train_dataset
        self.df = df
        
        # get texts and labels
        self.texts = self.df[text_column]
        self.labels = self.df[label_column]
        
        # utilizing vocabulary created using training set
        self.vocab = self.train_dataset.vocab


    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index):
        text = self.texts[int(index)]
        label = self.labels[index]
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_text = [self.vocab.stoi["<SOS>"]]
        numerialized_text += self.vocab.numericalize(text)
        numerialized_text.append(self.vocab.stoi["<EOS>"])
    
        label = [int(label)]
        
        return torch.tensor(numerialized_text), torch.tensor(label) 

In [8]:
test_dataset = TestDataset(
    train_dataset = train_dataset, df = df, text_column = 'cleaned_text', label_column = 'label'
)

print(f'{df.loc[100]}\n')

text, target = test_dataset[100]
print(text)
print(target)

index                                                         106
text             cool i wear black most of the time when i go out
cleaned_text               cool wear black most the time when out
sentiment                                                 neutral
label                                                         0.0
Name: 100, dtype: object

tensor([  1, 153, 725, 656, 298,   4,  39,  60,  21,   2])
tensor([0])


## Collate Function

In [9]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        source = [item[0] for item in batch] 
        source = pad_sequence(source, batch_first=False, padding_value = self.pad_idx) 
        
        target = torch.tensor([item[1].item() for item in batch])
        return source, target

## Train DataLoader

In [10]:
train_loader = DataLoader(
    dataset = train_dataset, batch_size = 32, num_workers = 1, shuffle = True, pin_memory = True, drop_last = True,
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [11]:
for idx, (texts, labels) in enumerate(train_loader):
    print(texts.shape, labels.shape)
    if idx >= 4:
        break

torch.Size([21, 32]) torch.Size([32])
torch.Size([20, 32]) torch.Size([32])
torch.Size([22, 32]) torch.Size([32])
torch.Size([22, 32]) torch.Size([32])
torch.Size([22, 32]) torch.Size([32])


## Test DataLoader

In [12]:
test_loader = DataLoader(
    dataset = test_dataset, batch_size = 64, num_workers = 1, shuffle = True, pin_memory = True, 
    collate_fn = MyCollate(pad_idx = train_dataset.vocab.stoi["<PAD>"])
)

In [13]:
for idx, (texts, labels) in enumerate(train_loader):
    print(texts.shape, labels.shape)
    if idx >= 4:
        break

torch.Size([22, 32]) torch.Size([32])
torch.Size([21, 32]) torch.Size([32])
torch.Size([23, 32]) torch.Size([32])
torch.Size([21, 32]) torch.Size([32])
torch.Size([21, 32]) torch.Size([32])


## References

In [14]:
# https://towardsdatascience.com/how-to-use-datasets-and-dataloader-in-pytorch-for-custom-text-data-270eed7f7c00
# https://towardsdatascience.com/custom-datasets-in-pytorch-part-2-text-machine-translation-71c41a3e994e