In [1]:
# Imports
import torch
import torch.nn as nn
import torchtext
from torchtext.vocab import GloVe
# import torchtext.data as ttd
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
from datetime import datetime

from torchtext.data import get_tokenizer
# from torchtext.data.utils import get_tokenizer

import random
import collections

from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor
from torchtext.experimental.datasets.text_classification import TextClassificationDataset



In [2]:
import csv

with open('spam2.csv', mode='r') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)
#     data = tuple(csv_reader)
    data = [tuple(line) for line in csv_reader]
    
print(data[:5])

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', '0'), ('Ok lar... Joking wif u oni...', '0'), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", '1'), ('U dun say so early hor... U c already then say...', '0'), ("Nah I don't think he goes to usf, he lives around here though", '0')]


In [3]:
print(f'Number of Examples: {len(data)}')

Number of Examples: 5572


In [4]:
def get_train_test_split(data, split_ratio = 0.66):
    random.shuffle(data)
    
    n_train = int(len(data) * split_ratio)
    train_data = data[:n_train]
    test_data = data[n_train:]
    
    return train_data, test_data

In [5]:
train_data, test_data = get_train_test_split(data)

In [6]:
print(f'Number of Train Examples: {len(train_data)}')
print(f'Number of Test Examples: {len(test_data)}')

Number of Train Examples: 3677
Number of Test Examples: 1895


In [7]:
class Tokenizer:
    def __init__(self, tokenize_fn = 'basic_english', lower = True, max_length = None):
        
        self.tokenize_fn = torchtext.data.utils.get_tokenizer(tokenize_fn)
        self.lower = lower
        self.max_length = max_length
        
    def tokenize(self, s):
        
        tokens = self.tokenize_fn(s)
        
        if self.lower:
            tokens = [token.lower() for token in tokens]
            
        if self.max_length is not None:
            tokens = tokens[:self.max_length]
            
        return tokens

In [8]:
max_length=250

tokenizer = Tokenizer(max_length=max_length)

In [9]:
s = 'ham but not'
print(tokenizer.tokenize(s))

['ham', 'but', 'not']


In [10]:
def build_vocab_from_data(data, tokenizer, **vocab_kwarg):
    
    token_freqs = collections.Counter()
    
    for text, label  in data:
        tokens = tokenizer.tokenize(text)
        token_freqs.update(tokens)
        
    vocab = torchtext.vocab.Vocab(token_freqs, **vocab_kwarg)
    
    return vocab

In [11]:
max_size = 25000

vocab = build_vocab_from_data(train_data, tokenizer, max_size = max_size)

In [12]:
print(f'Unique words in vocab: {len(vocab)}')

Unique words in vocab: 7458


In [13]:
vocab.freqs.most_common(5)

[('.', 7348), ('i', 1947), ('to', 1518), ('you', 1468), (',', 1284)]

In [14]:
def data_to_dataset(data, tokenizer, vocab):
    
    data = [(text, label) for (text, label) in data]
    
    text_transform = sequential_transforms(tokenizer.tokenize,
                                                  vocab_func(vocab),
                                                  totensor(dtype=torch.long)
                                          )
    label_transform = sequential_transforms(lambda x: 1 if x =='1' else (0 if x =='0' else x),
                                                  totensor(dtype=torch.long)
                                          )
    
#     label_transform = sequential_transforms(totensor(dtype=torch.long)
#                                           )
    
    transforms = (text_transform, label_transform)
    
    dataset = TextClassificationDataset(data, vocab, transforms)
    
    return dataset

In [15]:
train_dataset = data_to_dataset(train_data, tokenizer, vocab)
test_dataset = data_to_dataset(test_data, tokenizer, vocab)

In [16]:
print(f'Number of Train Examples: {len(train_dataset)}')
print(f'Number of Test Examples: {len(test_dataset)}')

Number of Train Examples: 3677
Number of Test Examples: 1895


In [17]:
indexes, label  = train_dataset[0]
print(indexes)
print(label)

tensor([   3,   75,   87,    4,   38,   18,   52,    3, 2189,    2,    3,  615,
          10,  329,   74,  750,   13, 6634,   13, 1551,    7, 7422,    9])
tensor(0)


In [18]:
class Collator:
    def __init__(self, pad_idx):
        
        self.pad_idx = pad_idx
        
    def collate(self, batch):
        text, labels = zip(*batch)
        labels = torch.LongTensor(labels)
        text = nn.utils.rnn.pad_sequence(text, padding_value=self.pad_idx)
        return labels, text

In [19]:
pad_token = '<PAD>'
pad_idx = vocab[pad_token]
print(pad_idx)
collator = Collator(pad_idx)

0


In [20]:
batch_size = 1

train_loader = torch.utils.data.DataLoader(train_dataset,
                                            batch_size,
                                            shuffle = False,
                                            collate_fn = collator.collate
                                          )

test_loader = torch.utils.data.DataLoader(test_dataset,
                                            batch_size,
                                            shuffle = False,
                                            collate_fn = collator.collate
                                          )

In [21]:
texts, labels = next(iter(train_loader))
print(texts, labels)

tensor([0]) tensor([[   3],
        [  75],
        [  87],
        [   4],
        [  38],
        [  18],
        [  52],
        [   3],
        [2189],
        [   2],
        [   3],
        [ 615],
        [  10],
        [ 329],
        [  74],
        [ 750],
        [  13],
        [6634],
        [  13],
        [1551],
        [   7],
        [7422],
        [   9]])
