**Udemy 7-5. Text Preprocessing**

In [7]:
import torch
import torch.nn as nn
import torchtext.legacy.data as ttd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [8]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [13]:
# Build a tiny toy corpus: three short sentences, each with a binary label.
# The keys become the column names of the table (label first, then data).
data = dict(
    label=[0, 1, 1],
    data=[
        "I like eggs and ham.",
        "Eggs I like!",
        "Ham and eggs or just ham?",
    ],
)

In [14]:
# Wrap the toy corpus in a DataFrame (one row per sentence) and preview it.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,label,data
0,0,I like eggs and ham.
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [15]:
# Save the frame to disk; index=False leaves out the row index, so the CSV
# holds exactly the two columns `label` and `data`.
df.to_csv(path_or_buf='thedata.csv', index=False)

In [16]:
# Peek at the saved file: a header row followed by one `label,data` line per sentence.
!head thedata.csv

label,data
0,I like eggs and ham.
1,Eggs I like!
1,Ham and eggs or just ham?


In [19]:
# Field describing how the raw sentence column is turned into a sequence of token ids.
TEXT = ttd.Field(
    sequential=True,   # each value is a sequence of words
    batch_first=True,  # data is of size N x ... (batch dimension first)
    lower=True,        # lower-case the text
    tokenize='spacy',  # use the spaCy tokenizer; if omitted, str.split() is used
    # Name the spaCy model explicitly. The default tokenizer_language='en' is a
    # shortcut that spaCy 3 removed, so relying on it only works through a
    # warning-emitting fallback inside torchtext.
    tokenizer_language='en_core_web_sm',
    pad_first=True,    # pre-padding instead of post-padding
)

# Field describing the target column.
LABEL = ttd.Field(
    sequential=False,  # a single value per example, not a sequence
    # The targets are already digits, so no preprocessing is needed to convert
    # them to numbers. With use_vocab=True torchtext would instead build a
    # vocabulary for this field.
    use_vocab=False,
    is_target=True,    # mark this field as the target rather than an input
)

# Note 1: If you don't specify "use_vocab=False", torchtext will complain later,
# when you try to iterate over the dataset, that the attribute 'vocab' doesn't exist.

# Note 2: If you don't specify "is_target=True", torchtext will assume the field is
# part of the input, so when you iterate over the dataset it will be like:
# for (inputs, targets), _ in iterator:
# where the 2nd argument "_" should have been the target.

In [22]:
# Parse the CSV into a torchtext dataset.
# `fields` is a list of (name, Field) pairs and must follow the column order of
# the file: `label` first, then `data`.
dataset = ttd.TabularDataset(
    path='thedata.csv',
    format='csv',      # could also be tsv or json
    fields=[
        ('label', LABEL),
        ('data', TEXT),
    ],
    skip_header=True,  # the first row holds the column names, not data
)

In [23]:
# Grab the first parsed row and show what kind of object torchtext stores it as.
ex = dataset.examples[0]
print(ex.__class__)

<class 'torchtext.legacy.data.example.Example'>


In [24]:
# Show the tokenized text and the label of the first example, one per line.
# Because the spaCy tokenizer is used, the trailing '.' is split off from the
# word 'ham' and becomes a token of its own.
print(ex.data, ex.label, sep="\n")

['i', 'like', 'eggs', 'and', 'ham', '.']
0


In [26]:
# Partition the examples into a training set and a test set.
# NOTE(review): no random_state is passed, so the partition presumably changes
# from run to run — confirm, and seed it if the outputs below must be reproducible.
train_dataset, test_dataset = dataset.split(split_ratio=0.66) # default split_ratio=0.7

In [29]:
# Internally assign an integer to each token.
# The vocabulary is built from the training split only, so a token that occurs
# only in the test split gets no entry of its own.
TEXT.build_vocab(train_dataset)
vocab = TEXT.vocab  # keep a handle on the vocabulary for later inspection

['<unk>', '<pad>', 'eggs', 'ham', '!', '?', 'and', 'i', 'just', 'like', 'or']
defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7fc6bfd72b10>>, {'<unk>': 0, '<pad>': 1, 'eggs': 2, 'ham': 3, '!': 4, '?': 5, 'and': 6, 'i': 7, 'just': 8, 'like': 9, 'or': 10})


In [30]:
# Naming note: itos and stoi follow C-style abbreviations.
#   stoi maps a token string to its integer, e.g. vocab.stoi['dog'] -> 211
#   itos maps an integer back to its token,  e.g. vocab.itos[211]   -> 'dog'
# itos is a list of tokens; stoi is a dictionary from token to index.
print(vocab.itos, vocab.stoi, sep="\n")


['<unk>', '<pad>', 'eggs', 'ham', '!', '?', 'and', 'i', 'just', 'like', 'or']
defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7fc6bfd72b10>>, {'<unk>': 0, '<pad>': 1, 'eggs': 2, 'ham': 3, '!': 4, '?': 5, 'and': 6, 'i': 7, 'just': 8, 'like': 9, 'or': 10})


In [33]:
# Build one batch iterator per split.
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset),
    batch_sizes=(2, 2),  # (batch_size_train, batch_size_test)
    device=device,  # device the batch tensors are placed on (this moves data, not a model)
    sort_key=lambda example: len(example.data),  # length of the text, used to sort examples
)

In [34]:
# Walk through the training batches, printing each tensor together with its shape.
for inputs, targets in train_iter:
    for tag, tensor in (("inputs:", inputs), ("targets:", targets)):
        print(tag, tensor, "shape:", tensor.shape)

inputs: tensor([[ 3,  6,  2, 10,  8,  3,  5],
        [ 1,  1,  1,  2,  7,  9,  4]]) shape: torch.Size([2, 7])
targets: tensor([1, 1]) shape: torch.Size([2])


In [35]:
# Walk through the test batches, printing each tensor together with its shape.
for inputs, targets in test_iter:
    for tag, tensor in (("inputs:", inputs), ("targets:", targets)):
        print(tag, tensor, "shape:", tensor.shape)

inputs: tensor([[7, 9, 2, 6, 3, 0]]) shape: torch.Size([1, 6])
targets: tensor([0]) shape: torch.Size([1])


In [None]:
# Note: '.' does not appear in the vocabulary built from the training split, so it is
# mapped to index 0 ('<unk>', the unknown token).
# Shorter sequences are pre-padded (padded on the left) with index 1 ('<pad>').

In [None]:
# Exercise: Figure out which sequence of integers goes with which sentence