In [1]:
%load_ext autotime

time: 0 ns


### 載入套件

In [2]:
import torch
import pandas as pd
import numpy as np
from torchtext import data, datasets
import re

time: 6.27 s


In [3]:
# 探索資料
# 可以發現資料為文本與類別，而類別即為正評與負評
input_data = pd.read_csv('./polarity.tsv', delimiter='\t', header=None, names=['text', 'label'])
input_data

Unnamed: 0,text,label
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,jaws is a rare film that grabs your attentio...,1
4,moviemaking is a lot like being the general ma...,1
...,...,...
1995,"if anything , "" stigmata "" should be taken as ...",0
1996,"john boorman's "" zardoz "" is a goofy cinematic...",0
1997,the kids in the hall are an acquired taste .it...,0
1998,there was a time when john carpenter was a gre...,0


time: 125 ms


### 建立Pipeline生成資料

In [4]:
# 移除非英文字元
def remove_non_char(x):
    
    x = ' '.join(x)
    x = re.sub("[^a-zA-Z]", " ", x)
    x = x.split()
    
    return x

time: 0 ns


In [5]:
# 建立 Field
text_field = data.Field(sequential=True, dtype=torch.float64, lower=True, tokenize='spacy', tokenizer_language='en_core_web_sm',
                        preprocessing=remove_non_char)

label_field = data.Field(sequential=False)

time: 15.5 s


In [10]:
type(input_data.loc[0,'label'])

numpy.int64

time: 16 ms


In [11]:
# 建立 example
examples = []
for i in input_data.index:
    text = input_data.loc[i,'text']
    label = input_data.loc[i,'label']
    examples.append(data.Example.fromlist(data=[text,label], fields=[('text',text_field), ('label',label_field)]))


time: 4.88 s


In [27]:
# 取的examples並打亂順序
a = np.random.choice(range(len(examples)), size=len(examples), replace=False)
examples = [examples[i] for i in a]

# 以8:2的比例切分examples
train_ex = examples[0:int(len(examples)*0.8)]
test_ex = examples[int(len(examples)*0.8):]

# 建立training與testing dataset
train_data = data.Dataset(examples=train_ex, fields={'text':text_field, 'label':label_field})
test_data = data.Dataset(examples=test_ex, fields={'text':text_field, 'label':label_field})

train_data[0].label, train_data[0].text

(1,
 ['plot',
  'a',
  'bunch',
  'of',
  'bad',
  'guys',
  'dressed',
  'up',
  'as',
  'elvis',
  'impersonators',
  'rob',
  'a',
  'vegas',
  'casino',
  'during',
  'a',
  'presley',
  'convention',
  'the',
  'boys',
  'eventually',
  'get',
  'together',
  'to',
  'split',
  'the',
  'money',
  'but',
  'as',
  'plans',
  'change',
  'double',
  'crosses',
  'occur',
  'dealing',
  'and',
  'wheeling',
  'goes',
  'down',
  'and',
  'the',
  'crew',
  'set',
  'up',
  'for',
  'the',
  'road',
  'who',
  's',
  'on',
  'the',
  'up',
  'and',
  'up',
  'who',
  's',
  'the',
  'real',
  'bad',
  'guy',
  'and',
  'who',
  's',
  'gon',
  'na',
  'get',
  'to',
  'bang',
  'courteney',
  'cox',
  'are',
  'just',
  'a',
  'few',
  'of',
  'the',
  'questions',
  'which',
  'will',
  'be',
  'answered',
  'by',
  'the',
  'rest',
  'of',
  'this',
  'movie',
  'critique',
  'the',
  'funnest',
  'movie',
  'that',
  'i',
  've',
  'seen',
  'so',
  'far',
  'this',
  'year',
  'i

time: 15 ms


In [28]:
# 建立字典
text_field.build_vocab(train_data)
label_field.build_vocab(train_data)

print(f"Vocabularies of index 0-5: {text_field.vocab.itos[:10]} \n")
print(f"words to index {text_field.vocab.stoi}")

Vocabularies of index 0-5: ['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 's'] 

time: 375 ms


In [32]:
# create iterator for training and testing data

train_iter = data.Iterator(dataset=train_data, batch_size=3, repeat=False, sort_key=lambda ex:len(ex.text))
test_iter = data.Iterator(dataset=test_data, batch_size=3, repeat=False, sort_key=lambda ex:len(ex.text))


time: 0 ns


In [33]:
for train_batch in train_iter:
    print(train_batch.text, train_batch.text.shape)
    print(train_batch.label, train_batch.label.shape)
    break

tensor([[ 59., 775.,  32.],
        [ 32.,   6., 222.],
        [258., 296.,  24.],
        ...,
        [  1.,   1., 992.],
        [  1.,   1.,  21.],
        [  1.,   1., 236.]], dtype=torch.float64) torch.Size([1298, 3])
tensor([2, 1, 1]) torch.Size([3])
time: 93 ms
