In [54]:
import pandas as pd 

In [73]:
df = pd.read_csv('/content/drive/MyDrive/PROJECTS/PyTorch-NLP-Tutorial/corpus/TweetSentiment.csv')
# df = df[['cleaned_text', 'label']]
df.dropna(inplace=True)
df.sample(2)

Unnamed: 0,text,cleaned_text,sentiment,label
20068,Still in the News Room..however; with audio to...,still the news roomhowever with audio turn aro...,neutral,0.0
16132,SATS what the its just around the corner.,sats what the its just around the corner,neutral,0.0


In [None]:
!pip install pandarallel

In [56]:
#sys libs
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manupulation libs
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandarallel import pandarallel
# Initialization
pandarallel.initialize()


#string manupulation libs
import re
import string
from string import digits
import spacy

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [65]:
frequencies = {'hello':4, 'world':5, 'I':4, 'am':2, 'creating':1, 'nlp':1, 'tutorial':4}
dict(sorted(frequencies.items(), key = lambda x: -x[1])[:-2])

{'world': 5, 'hello': 4, 'I': 4, 'tutorial': 4, 'am': 2}

In [66]:
class Vocabulary: 
    def __init__(self, freq_threshold, max_size):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be included in vocabulary
        max_size : max vocab size. Eg. if set to 10,000, we pick the top 10,000 most frequent words and discard others
        '''
        self.freq_threshold = freq_threshold
        self.max_size = max_size

        self.itos = {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        self.stoi = {k:j for j,k in self.itos.items()} 
          
    
    def __len__(self):
        return len(self.itos)
    

    @staticmethod
    def tokenizer(text):
        return [tok.lower().strip() for tok in text.split(' ')]
    

    '''
    build the vocabulary: create a dictionary mapping of index to string (itos) and string to index (stoi)
    (itos) -> {5:'the', 6:'a', 7:'an'} | (stoi) -> {'the':5, 'a':6, 'an':7}
    '''
    def build_vocabulary(self, sentence_list):
        frequencies = {} 
        idx = 4  # 4 tokens already added -> (itos) -> {0: '<PAD>', 1:'<SOS>', 2:'<EOS>', 3: '<UNK>'}
        
        # calculate freq of words
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                if word not in frequencies.keys():
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                    
        # limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v > self.freq_threshold} 
        
        # limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx = 4 for pad, start, end , unk
            
        # create vocab
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1


    '''
    convert the list of words to a list of corresponding indexes
    '''    
    def numericalize(self, text):
        tokenized_text = self.tokenizer(text)  # tokenize text
        numericalized_text = []

        for token in tokenized_text:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: # out-of-vocab (OOV) words are represented by UNK token index
                numericalized_text.append(self.stoi['<UNK>'])
                
        return numericalized_text

In [67]:
#create a vocab class with freq_threshold=1 and max_size=100
voc = Vocabulary(1, 100)

sentence_list = [
    'that is a cat', 'that is not a dog'
]

#build vocab
voc.build_vocabulary(sentence_list)

print('index to string: ', voc.itos)
print('string to index:', voc.stoi)

print('numericalize -> cat and a dog: ', voc.numericalize('cat and a dog'))

index to string:  {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>', 4: 'that', 5: 'is', 6: 'a'}
string to index: {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3, 'that': 4, 'is': 5, 'a': 6}
numericalize -> cat and a dog:  [3, 3, 6, 3]


In [76]:
class Train_Dataset(Dataset):
    '''
    Initiating Variables
        df: the training dataframe
        source_column: the name of source text column in the dataframe
        target_columns: the name of target text column in the dataframe
        transform: If we want to add any augmentation
        freq_threshold: the minimum times a word must occur in corpus to be treated in vocab
        source_vocab_max_size: max source vocab size
        target_vocab_max_size: max target vocab size
    '''
    
    def __init__(self, df, source_column, target_column, transform=None, 
                 freq_threshold=5, source_vocab_max_size=10000, target_vocab_max_size=10000):
        self.df = df
        self.transform = transform
        
        # get source and target texts
        self.source_texts = self.df[source_column]
        self.target_texts = self.df[target_column]
        
        ## VOCAB class has been created above
        # initialize source vocab object and build vocabulary
        self.source_vocab = Vocabulary(freq_threshold, source_vocab_max_size)
        self.source_vocab.build_vocabulary(self.source_texts.tolist())

        # initialize target vocab object and build vocabulary
        self.target_vocab = Vocabulary(freq_threshold, target_vocab_max_size)
        self.target_vocab.build_vocabulary(self.target_texts.tolist())
        
    def __len__(self):
        return len(self.df)
    
    '''
    __getitem__ runs on 1 example at a time. Here, we get an example at index and return its numericalize source and
    target values using the vocabulary objects we created in __init__
    '''
    def __getitem__(self, index):
        source_text = self.source_texts[index]
        target_text = self.target_texts[index]
        
        if self.transform is not None:
            source_text = self.transform(source_text)
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1, 12, 2, 9, 24, 2]
        numerialized_source = [self.source_vocab.stoi["<SOS>"]]
        numerialized_source += self.source_vocab.numericalize(source_text)
        numerialized_source.append(self.source_vocab.stoi["<EOS>"])

        # for seq2seq
        # numerialized_target = [self.target_vocab.stoi["<SOS>"]] 
        # numerialized_target += self.target_vocab.numericalize(target_text)
        # numerialized_target.append(self.target_vocab.stoi["<EOS>"]) 
        
        # for classification
        numerialized_target = [] 
        numerialized_target += self.target_vocab.numericalize(target_text)
        
        #convert the list to tensor and return
        return torch.tensor(numerialized_source), torch.tensor(numerialized_target) 

In [82]:
train_dataset = Train_Dataset(
    df = df, 
    source_column = 'cleaned_text', target_column = 'sentiment',
    freq_threshold = 5, source_vocab_max_size = 1000, target_vocab_max_size = len(set(df['sentiment'].to_list()))
)

print(f'{df.loc[1]}\n')
print(train_dataset.__getitem__(1))

text, label = train_dataset.__getitem__(1)
text.shape, label.shape

text             Sooo SAD I will miss you here in San Diego!!!
cleaned_text             sooo sad will miss you here san diego
sentiment                                             negative
label                                                      1.0
Name: 1, dtype: object

(tensor([  1, 376,  86,  42,  66,   6,  64,   3,   3,   2]), tensor([3]))


(torch.Size([10]), torch.Size([1]))

In [83]:
class Validation_Dataset:
    def __init__(self, train_dataset, df, source_column, target_column, transform = None):
        self.df = df
        self.transform = transform
        
        # train dataset will be used as lookup for vocab
        self.train_dataset = train_dataset
        
        # get source and target texts
        self.source_texts = self.df[source_column]
        self.target_texts = self.df[target_column]
    

    def __len__(self):
        return len(self.df)
    

    def __getitem__(self,index):
        source_text = self.source_texts[index]
        target_text = self.target_texts[index] 

        if self.transform is not None:
            source_text = self.transform(source_text)
            
        # numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1, 12, 2, 9, 24, 2]
        numerialized_source = [self.train_dataset.source_vocab.stoi["<SOS>"]]
        numerialized_source += self.train_dataset.source_vocab.numericalize(source_text)
        numerialized_source.append(self.train_dataset.source_vocab.stoi["<EOS>"])

        # for seq2seq
        # numerialized_target = [self.train_dataset.target_vocab.stoi["<SOS>"]]
        # numerialized_target += self.train_dataset.target_vocab.numericalize(target_text)
        # numerialized_target.append(self.train_dataset.target_vocab.stoi["<EOS>"])

        # for classification
        numerialized_target = []
        numerialized_target += self.train_dataset.target_vocab.numericalize(target_text)

        return torch.tensor(numerialized_source), torch.tensor(numerialized_target) 