In [1]:
from os import path
import json
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR
import matplotlib.pyplot as plt

base = "/content/drive/MyDrive/NLP100/ch09"

fp_train = "80/train.csv"
fp_words = "80/word_ids.json"
df_train = pd.read_csv(path.join(base, fp_train), index_col=0)
word_ids = json.load(open(path.join(base, fp_words), "r"))
df_train.head()

Unnamed: 0_level_0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
311098,RPT-Fitch Updates EMEA Consumer ABS Rating Cri...,http://in.reuters.com/article/2014/06/24/fitch...,Reuters,0,d3tIMfB2mg-9MZM4G_jGTEiRVl3jM,in.reuters.com,1403633888597
49198,"Gurlitt Wants to Return Nazi-Looted Art, Suedd...",http://www.businessweek.com/news/2014-03-26/gu...,Businessweek,2,dWeC4g3bvX-bI6MKAcxt43Jp7MzSM,www.businessweek.com,1395886231417
410578,"UPDATE 1-Fairfax Financial, CEO probed over po...",http://in.reuters.com/article/2014/08/01/fairf...,Reuters,0,d6s61cXf9rASExMO8inpkRujc9VTM,in.reuters.com,1406929609070
270668,Angelina Jolie - Angelina Jolie Will Not Tight...,http://www.contactmusic.com/story/angelina-jol...,Contactmusic.com,2,d7SbUKanRaD34YMwE64lo09d_qCnM,www.contactmusic.com,1401828902205
301656,Patent Officials Cancel the Washington Redskin...,http://www.businessweek.com/articles/2014-06-1...,Businessweek,0,dXVU3KhKpHSxleMNmc46pQj6e5zwM,www.businessweek.com,1403113194649


In [2]:
num_words_of_title = max([len(title.split()) for title in df_train["TITLE"]])


def title_to_ids(t):
  res = [0 for _ in range(num_words_of_title)]
  for i, w in enumerate(t.split()):
    if w in word_ids.keys():
      res[i] = word_ids[w]
  return res


title_to_ids("Europe reaches crunch point on banking union")[:5]

[252, 2031, 0, 1500, 4]

In [3]:
X_train = torch.tensor([title_to_ids(title) for title in df_train["TITLE"]])
y_train = torch.tensor(df_train["CATEGORY"].values.astype("int"))
print(X_train[:5])

tensor([[ 599, 3939,    0,  ...,    0,    0,    0],
        [6646,  515,    1,  ...,    0,    0,    0],
        [   7,    0, 6648,  ...,    0,    0,    0],
        [ 326,  419,    6,  ...,    0,    0,    0],
        [1521, 1382, 2762,  ...,    0,    0,    0]])


In [4]:
batch_size = 1
num_workers = 2
dataset_train = [(X_i, y_i) for X_i, y_i in zip(X_train, y_train)]
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)

In [5]:
num_embeddings = max(word_ids.values()) + 1
embedding_dim = 300
hidden_size = 50


class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.linear = nn.Linear(hidden_size, 4)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    hidden = torch.zeros(1, x.size(0), hidden_size).to(device)
    y = self.embedding(x)
    y, hidden = self.rnn(y, hidden)
    y = self.linear(y[:, -1, :])
    y = self.softmax(y)
    return y

if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print(f"device: {device}")
model = Model().to(device)
print(model)

device: cpu
Model(
  (embedding): Embedding(10272, 300, padding_idx=0)
  (rnn): RNN(300, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)


In [6]:
for batch, (inputs, targets) in enumerate(dataloader_train):
  if batch < 5:
    print(batch, model(inputs))

0 tensor([[0.2516, 0.2632, 0.2404, 0.2448]], grad_fn=<SoftmaxBackward>)
1 tensor([[0.2516, 0.2632, 0.2404, 0.2448]], grad_fn=<SoftmaxBackward>)
2 tensor([[0.2516, 0.2632, 0.2404, 0.2448]], grad_fn=<SoftmaxBackward>)
3 tensor([[0.2516, 0.2632, 0.2404, 0.2448]], grad_fn=<SoftmaxBackward>)
4 tensor([[0.2516, 0.2632, 0.2404, 0.2448]], grad_fn=<SoftmaxBackward>)
