In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
if not os.path.isdir('./.data'):
	os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)
##train_dataset, test_dataset = torchtext.datasets.AG_NEWS(ngrams=3) 이런방식으로 로딩 가능
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

120000lines [00:07, 16887.29lines/s]
120000lines [00:12, 9675.63lines/s] 
7600lines [00:00, 9813.16lines/s]


In [2]:
print(torch.__version__)
print(torchtext.__version__)

1.1.0
0.4.0


In [3]:
'''
dataset example
'''
import pandas as pd

df = pd.read_csv('./.data/ag_news_csv/train.csv', nrows=20)
print(df)

    3  Wall St. Bears Claw Back Into the Black (Reuters)  \
0   3  Carlyle Looks Toward Commercial Aerospace (Reu...   
1   3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
2   3  Iraq Halts Oil Exports from Main Southern Pipe...   
3   3  Oil prices soar to all-time record, posing new...   
4   3        Stocks End Up, But Near Year Lows (Reuters)   
5   3               Money Funds Fell in Latest Week (AP)   
6   3  Fed minutes show dissent over inflation (USATO...   
7   3                            Safety Net (Forbes.com)   
8   3            Wall St. Bears Claw Back Into the Black   
9   3              Oil and Economy Cloud Stocks' Outlook   
10  3             No Need for OPEC to Pump More-Iran Gov   
11  3          Non-OPEC Nations Should Up Output-Purnomo   
12  3              Google IPO Auction Off to Rocky Start   
13  3           Dollar Falls Broadly on Record Trade Gap   
14  3                              Rescuing an Old Saver   
15  3                       Kids Rule fo

In [4]:
import torch.nn as nn
import torch.nn.functional as F

'''
EmbeddingBag params
vocab_size = num_embeddings (int) – size of the dictionary of embeddings
embed_dim = embedding_dim (int) – the size of each embedding vector
sparse (bool, optional) – if True, gradient w.r.t. weight matrix will be a sparse tensor.
''' 


class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
    
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [5]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
'''
AG_NEWS classes
1:world
2:sports
3:business
4:sci/tec
'''
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [6]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text) # purpose of torch.cat() is concat... why torch.cat is used???
    
    return text, offsets, label

In [7]:
from torch.utils.data import DataLoader

def train_func(sub_train_):
    
    #Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                     collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()
    
    #growing lr
    scheduler.step()
    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()
            
    return loss / len(data_), acc / len(data_)

In [8]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 31 seconds
	Loss: 0.0261(train)	|	Acc: 84.9%(train)
	Loss: 0.0001(valid)	|	Acc: 89.2%(valid)
Epoch: 2  | time in 0 minutes, 33 seconds
	Loss: 0.0119(train)	|	Acc: 93.6%(train)
	Loss: 0.0001(valid)	|	Acc: 90.3%(valid)
Epoch: 3  | time in 0 minutes, 33 seconds
	Loss: 0.0069(train)	|	Acc: 96.4%(train)
	Loss: 0.0000(valid)	|	Acc: 90.6%(valid)
Epoch: 4  | time in 0 minutes, 33 seconds
	Loss: 0.0039(train)	|	Acc: 98.1%(train)
	Loss: 0.0000(valid)	|	Acc: 91.5%(valid)
Epoch: 5  | time in 0 minutes, 34 seconds
	Loss: 0.0022(train)	|	Acc: 99.0%(train)
	Loss: 0.0001(valid)	|	Acc: 90.4%(valid)


In [9]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0003(test)	|	Acc: 88.1%(test)


In [18]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        print(text)
        return output.argmax(1).item() + 1

vocab = train_dataset.get_vocab()
model = model.to("cpu")

def printResult(plain_text):
    print("predicted result is : " + ag_news_label[predict(plain_text, model, vocab, 2)])


In [31]:
plain_text1 = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."
plain_text2 = "LeCun once told Wired that deep learning is really a conspiracy between\
    Geoff Hinton and myself and Yoshua Bengio, from the University of\
    Montreal. While Hinton works on AI at Google, and Bengio splits time\
    between University of Montreal and data mining company ApStat, LeCun\
    has been able to snag other top-shelf names."
plain_text3 = "Never mind that the Nationals were not prematurely prepping a\
    celebration, or that M.L.B. does similar run-throughs at each team’s stadium\
    during the league championship rounds and World Series. The practice is the \
    opposite of arrogance; it’s all about."
plain_text4 = "Never mind that the Nationals were not prematurely prepping a celebration, or that M.L.B. does similar run-throughs at each team’s stadium during the league championship rounds and World Series. The practice is the opposite of arrogance; it’s all about worrying what can go wrong if the first stage setup happens on live television.\
    Still, the move has been second-guessed almost as much as any pitching choice or umpire’s call. Perhaps that is not surprising in a sport that dwells on superstitions more than any other, turning unfounded rituals into staples of the game.\
    Broadcasters sometimes refuse to acknowledge a no-hitter in progress, and teammates often don’t talk to a pitcher working on one. Many pitchers eat the same food before each game — or consume unusual concoctions, such as banana-mayonnaise sandwiches — to conjure a victory.\
    The teams in this World Series have their share of adherents to such traditions. Nationals General Manager Mike Rizzo donned the same red sweatshirt for each of the eight games in his team’s postseason winning streak, and Astros third baseman Alex Bregman wore the same plaid shirt to the ballpark during his team’s three-game World Series winning streak."

printResult(plain_text3)

tensor([   1330,    4949,      19,       3,    9145,     103,      83,  134201,
         269893,       6,    8779,       4,     156,      19,     797,       2,
           2774,       2,    3328,       2,    1411,    3966,       0,      22,
           1171,       0,     950,     301,       3,     236,     554,    6324,
              9,      61,     317,       2,       3,    2125,      24,       3,
          20724,       7,  107758,       0,     178,      91,       2,   75032,
         113948,     250,   76765,  405587,    7617,       0,       0, 1053572,
          54490,  336646,    1323,  158895,       0,    1571,    6736,    4269,
          12655,    5845,   61793,       0,       0,       0,   40333,       0,
              0,  280411,     995,    4184,    6057,       0,  274506,   29957,
           1930,    3303,     105,   23939,  269740,     829,   70756,  266015,
              0,       0,       0,   16407,   28964])
predicted result is : Sports


In [12]:
print(text)

NameError: name 'text' is not defined

In [17]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/e5/3d/4e983cd98d87b50b2ab0387d73fa946f745aa8164e8888a714d5129f9765/konlpy-0.5.1-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 457kB/s eta 0:00:01
[?25hCollecting JPype1>=0.5.7
[?25l  Downloading https://files.pythonhosted.org/packages/28/63/784834e8a24ec2e1ad7f703c3dc6c6fb372a77cc68a2fdff916e18a4449e/JPype1-0.7.0.tar.gz (470kB)
[K     |████████████████████████████████| 471kB 408kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: JPype1
  Building wheel for JPype1 (setup.py) ... [?25ldone
[?25h  Created wheel for JPype1: filename=JPype1-0.7.0-cp37-cp37m-macosx_10_9_x86_64.whl size=244175 sha256=7a631af331e745b9cadc987a69a1ce04765d43dc4a2387d914105f06d52e8a3e
  Stored in directory: /Users/kihunum/Library/Caches/pip/wheels/68/68/4f/c5f2d175cb26a2765561069a80c4285488d17be01eecb21597
Successfully built JPype1
Installing collected packages: JPype1, konlpy
Successful