<a href="https://colab.research.google.com/github/nghoanglong/NLP-Sentiment-Analysis/blob/master/Preprocess_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Requirements
reader: PATH_FOLDER, split_type=['train', 'test', 'dev']
    
   + return: matrix
    
   + với matrix: row - samples dạng sentence
    
embedding: sentences to id

   + sample sentence return array of indices [2, 4, 6, 6]

## Knowledges
+ Word Embedding 
    + frequency-based
        + one-hot vector
        + tf-idf
        + co-occurence matrix
    + prediction-based
        + CBOW
        + Skip-gram
+ Dense vector embedding -> neural net
+ NLTK xử lý sentence

In [1]:
import numpy as np
import os
import nltk
nltk.download('punkt')
from nltk.tree import Tree
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from google.colab import files 
uploaded = files.upload()

Saving train.txt to train (3).txt


In [3]:
class PreprocessData:
    def __init__(self,
                 FOLDER_PATH):
        """Constructor với tham số nhận vào làm FOLDER_PATH

            FOLDER_PATH định dạng ví dụ: dir_current/data_set/
        """
        self.folder_path = FOLDER_PATH
        self.dataset = None
        self.lib_tokens = None

    def load_dataset(self,
                     type_dataset,
                     file_extension='.txt'):
        """Load dataset tương ứng theo type_dataset

            type_dataset = [train, dev, test] | type = string
            file_extension = .txt, .csv,... | type = string

            return ndarray shape(n, 2) với
                row = np.array[list_tokens extract from sentence, label]
        """
        # try:
        #     DATASET_REQ_PATH = self.folder_path + '/' + type_dataset + file_extension
        #     check_exist = os.path.isfile(DATASET_REQ_PATH)
        #     if check_exist:
        DATASET_REQ_PATH = 'train.txt'
        with open(DATASET_REQ_PATH, 'r') as reader:
            self.dataset = np.array([
                                    np.array([self.PTB_tokenize(line.rstrip("\n")),
                                              self.PTB_get_label(line)], dtype=object)
                                    for line in reader])
        return self.dataset
        #     else:
        #         raise FileExistsError('File nay ko ton tai')
        # except FileExistsError as err:
        #     print(err)
        #     return None

    def PTB_get_label(self,
                      treebank):
        """get label của root sentece trong PTB
            treebank - type string
            return label
        """
        tree = Tree.fromstring(treebank)
        return tree.label()

    def PTB_tokenize(self,
                     treebank):
        """Split list các token từ cây PTB
            
            treebank - type string
            return array = [token, token, token,...]
        """
        tree = Tree.fromstring(str(treebank))
        return np.array(tree.leaves())

    def transfrom_sentence(self,
                           li_tokens):
        """Transfrom list các tokens thành 1 sentence hoàn chỉnh

            li_tokens = [token, token, token,...]
            return sentence
        """
        sentence = ' '.join(li_tokens)
        return sentence

    def get_list_vocabularies(self):
        """Tạo một kho các tokens từ list các sentences

            return dictionary{token: ids, token: ids,...}
        """
        li_tokens = set()
        for sample in self.dataset:
            li_tokens.update(sample[0])
        lib_tokens = dict([(token, idx)
                           for idx, token in enumerate(li_tokens)])
        return lib_tokens

    def encode_sentence(self,
                        sent_tokenized,
                        li_vocabs):
        """Encode một sentence về dạng mỗi token tương ứng với một id trong list vocabs

            sent_tokenized - sentence đã được tokenize thành list các tokens
            li_vocabs = {token: id, token: id,...}
            return sentence = [id, id, id, id,...]
        """

        res_encode = np.array([li_vocabs[token] if token in li_vocabs
                               else print('does not have token in li vocabs')
                               for token in sent_tokenized])
        return res_encode

    def decode_sentence(self,
                        li_vocabs):
        pass


In [5]:
data = PreprocessData('./data/trees')
train_data = data.load_dataset('train', '.txt')

li_vocabs = data.get_list_vocabularies() # lấy ra list các vocabs
encode_li_sent = np.array([torch.tensor(data.encode_sentence(sample, li_vocabs))
                            for sample in train_data[:, 0]],
                          dtype=object) # encode list các sentence
print(li_vocabs['The'])
print(encode_li_sent)

5333
[tensor([ 5333,   191, 17411,  7058,  5802,  9120, 17264,  6669, 11607, 10197,
         3352, 14005, 11316, 13162,  2557,  6232,  1884, 10197, 16554,  5802,
        16279, 11860, 14300,  3972, 14253,  1952, 18245,  6124,  8662,  3578,
          832,  2352,  2520, 17991, 14924, 16139])
 tensor([ 5333,   685, 15067, 18012, 12050, 14005,  5333,  7067, 12050, 17264,
         4589, 13162,  5873, 17411, 11451, 17428,  6232, 11860, 17708, 12050,
         7453, 18053,  4262, 14306,  1843,  3044,  3081,  7849, 10197, 17243,
        10482, 12050, 10561,  7941, 10197,  8891, 16139])
 tensor([12468,   411,  1453,  9380, 11860, 16153, 12050, 13299,  5035, 11860,
          950,  6695, 14154,  8662, 11860,   950, 10181, 17388, 14900,  5802,
        17264, 16298,  5035, 18208, 17264, 14936,  7755,  4560,  7665, 17264,
        12674,  8662,  8323,  8662,  8541, 12050, 17264, 11486, 16139])
 ...
 tensor([ 1411,   442,  7330, 17264, 16931, 16028, 17230,  8662, 18208,  4653,
        10197,  3596, 156

In [6]:
# pad all sentence prepare for embedding
from torch.nn.utils.rnn import pad_sequence
padded = pad_sequence(encode_li_sent, batch_first=True)
print(padded)

tensor([[ 5333,   191, 17411,  ...,     0,     0,     0],
        [ 5333,   685, 15067,  ...,     0,     0,     0],
        [12468,   411,  1453,  ...,     0,     0,     0],
        ...,
        [ 1411,   442,  7330,  ...,     0,     0,     0],
        [ 6412,  9114, 13609,  ...,     0,     0,     0],
        [ 5282,  6948, 15157,  ...,     0,     0,     0]])
