In [1]:
import torch 
import torchtext
from torchtext import data
from torchtext.data import Field
from torchtext.data import Dataset, Example
from torch.utils.data.dataset import random_split
# from torchtext.vocab import 
import os
import gensim
import pandas as pd


### Load Data with NGrams

In [2]:
NGRAMS = 2

tokenizer = lambda x: x.split()

TEXT = Field(sequential=True, tokenize=tokenizer, lower=False)
NEWS_TYPE = Field(sequential=False, use_vocab=False)

fields = {'text': ('t', TEXT), 'type': ('nt', NEWS_TYPE)} 

df_dataset = pd.read_json('dataset_full.json')

df = df_dataset[['text', 'type']]

ltoi = {l: i for i, l in enumerate(df['type'].unique())}
df['type'] = df['type'].apply(lambda y: ltoi[y])

class DataFrameDataset(Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
        super(DataFrameDataset, self).__init__(
            [
                Example.fromlist(list(r), fields) for i, r in df.iterrows()
            ],
            fields
            
        )
dataset = DataFrameDataset(df, fields=(('text', TEXT), ('label', NEWS_TYPE)))

train_len = int(len(dataset) * 0.9)
valid_len = int(len(dataset) * 0.05)
sub_train_, sub_valid_, sub_test_ = random_split(dataset, [train_len, valid_len, len(dataset) - train_len - valid_len])

BATCH_SIZE = 16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Train Word2Vec Embedding

In [3]:
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

tweets = [row.split() for row in df['text']]

phrases = Phrases(tweets, min_count=2, progress_per=10000)
bigram = Phraser(phrases)

tweets_bigram = bigram[tweets]

w2v_model = Word2Vec(min_count=2,
                    window=5,
                    size=300,
                    sample=6e-5,
                    alpha=0.03,
                    min_alpha=0.0007,
                    negative=20)

w2v_model.build_vocab(tweets_bigram, progress_per=10000)

w2v_model.train(tweets_bigram, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

w2v_model.init_sims(replace=True)


In [6]:
import torch.nn as nn

TEXT.build_vocab(dataset, min_freq=2)

word2vec_vectors = []
for token, idx in TEXT.vocab.stoi.items():
    if token in w2v_model.wv.vocab.keys():
        word2vec_vectors.append(torch.FloatTensor(w2v_model[token]))
    else:
        word2vec_vectors.append(torch.zeros(300))

TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, 300)

pre_trained_emb = torch.FloatTensor(TEXT.vocab.vectors)
embedding = nn.Embedding.from_pretrained(pre_trained_emb)


## The Model

In [7]:
import torch.nn as nn
import torch.nn.functional as F 

class TextSentiment(nn.Module):
    def __init__(self, embedding, embed_dim, num_class):
        super().__init__()
        self.embedding = embedding
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.fc.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()
    
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

## Initiate Model

In [9]:
EMBED_DIM = 300
NUM_CLASS = 4

model = TextSentiment(embedding, EMBED_DIM, NUM_CLASS).to(device)

## Generate Batch

In [10]:
def generate_batch(batch):
    labels = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)

    return text, offsets, labels

## Train/Test function

In [11]:
from torch.utils.data import DataLoader

def train_func(sub_train_):
    train_loss = 0
    train_acc = 0

    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, 
                        shuffle=True, collate_fn=generate_batch)
    
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    scheduler.step()

    return train_loss/len(sub_train_), train_acc/len(sub_train_)

def test(data_):
    loss = 0
    acc = 0

    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch) 

    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss/len(data_), acc/len(data_)   

### Split Dataset and Train the Model

In [16]:
import time
from torchtext.data import BucketIterator

N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# NEWS_TYPE.build_vocab(dataset, min_freq=2)

train_iter, valid_iter, test_iter = BucketIterator.splits(
    datasets=(sub_train_, sub_valid_, sub_test_), 
    batch_sizes=(BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    sort_key=None
    sort=False
)

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(train_iter)
    valid_loss, valid_acc = test(valid_iter)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), ' | time in %d minutes, %d seconds' %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

AttributeError: 'Subset' object has no attribute 'sort_key'

### Evaluate on Test Dataset

In [13]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_iter)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0002(test)	|	Acc: 89.5%(test)


### Test on random news

In [16]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer('basic_english')
    with torch.no_grad():
        text = torch.tensor([vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))

        return output.argmax(1).item() + 1

vocab = train_dataset.get_vocab()
model = model.to("cpu")

text_str = 'Massive explosions of energy happening thousands of light-years from Earth may have left traces in our planet\'s biology and geology, according to new research by University of Colorado Boulder geoscientist Robert Brakenridge. The study, published this month in the International Journal of Astrobiology, probes the impacts of supernovas, some of the most violent events in the known universe. In the span of just a few months, a single one of these eruptions can release as much energy as the sun will during its entire lifetime. They\'re also bright -- really bright. "We see supernovas in other galaxies all the time," said Brakenridge, a senior research associate at the Institute of Arctic and Alpine Research (INSTAAR) at CU Boulder. "Through a telescope, a galaxy is a little misty spot. Then, all of a sudden, a star appears and may be as bright as the rest of the galaxy." A very nearby supernova could be capable of wiping human civilization off the face of the Earth. But even from farther away, these explosions may still take a toll, Brakenridge said, bathing our planet in dangerous radiation and damaging its protective ozone layer. To study those possible impacts, Brakenridge searched through the planet\'s tree ring records for the fingerprints of these distant, cosmic explosions. His findings suggest that relatively close supernovas could theoretically have triggered at least four disruptions to Earth\'s climate over the last 40,000 years. The results are far from conclusive, but they offer tantalizing hints that, when it comes to the stability of life on Earth, what happens in space doesn\'t always stay in space.'

print("This is a %s news" %ag_news_label[predict(text_str, model, vocab, 2)])

This is a Sci/Tec news
