Самостоятельно обучить классификатор текстов на примере 20 newsgroups. 
На примере 20 newsgroups попробовать параметры сверток для классификации текстов.

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import numpy as np
from collections import Counter

In [None]:
from posixpath import split
# Load the predefined train and test subsets of the 20 newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')


In [None]:
# Show the category names of the training subset.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Show how many training documents were loaded (one filename per document).
newsgroups_train.filenames.shape

(11314,)

In [None]:
from numpy import vectorize
from sklearn.feature_extraction.text import TfidfVectorizer
# Explicit list of the categories to load for the training subset.
categories = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
 
# Re-fetch the training subset restricted to the categories listed above.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)

# Fit a TF-IDF vectorizer on the training texts and inspect the matrix shape.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(11314, 130107)

In [None]:
from functools import lru_cache

class NewsgroupsDataset(torch.utils.data.Dataset):
  """Map-style dataset yielding fixed-length word-index tensors for texts.

  Each item is a tuple ``(indices, label, length)`` where ``indices`` is an
  int32 tensor of ``used_length`` word indices (zero-padded on the right),
  ``label`` is the matching entry of ``labels`` and ``length`` is the number
  of real (non-padding) positions.

  Args:
    txts: sequence of raw text strings.
    labels: sequence of labels, aligned with ``txts``.
    w2index: mapping from word to integer index; it must contain the key
      ``"UNK"``, whose index is used for words missing from the mapping.
    used_length: number of tokens kept per text; longer texts are truncated
      and shorter ones are padded with index 0.
  """

  def __init__(self, txts, labels, w2index, used_length):
    self._txts = txts
    self._labels = labels
    self._w2index = w2index
    self._used_length = used_length

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self._txts)

  @lru_cache(50000)
  def encode_sentence(self, txt):
    """Encode ``txt`` as a zero-padded index array.

    Returns:
      A tuple ``(encoded, length)``: ``encoded`` is an integer array of size
      ``used_length`` and ``length`` is the count of positions filled from
      ``txt``.

    Raises:
      KeyError: if ``w2index`` has no ``"UNK"`` entry.
    """
    unk_index = self._w2index["UNK"]
    # The original read ``self._length``, which is never assigned; the
    # constructor stores the target length as ``_used_length``.
    encoded = np.zeros(self._used_length, dtype=int)
    enc1 = np.array(
        [self._w2index.get(word, unk_index) for word in txt.split()],
        dtype=int,
    )
    length = min(self._used_length, len(enc1))
    encoded[:length] = enc1[:length]
    # The returned array is shared through the cache; callers must not
    # mutate it in place.
    return encoded, length

  def __getitem__(self, index):
    """Return ``(indices, label, length)`` for the text at ``index``."""
    # The original called the non-existent ``encoded_sentence``.
    encoded, length = self.encode_sentence(self._txts[index])
    # ``astype`` copies, so the cached array is never exposed to the tensor.
    return torch.from_numpy(encoded.astype(np.int32)), self._labels[index], length

   

In [None]:
class OurNet(nn.Module):
  """Fully connected classifier with two ReLU-activated hidden layers.

  Args:
    input_size: number of features in each input row.
    hidden_size: width of both hidden layers.
    num_classes: number of output scores produced per input row.
  """

  def __init__(self, input_size, hidden_size, num_classes):
    super().__init__()
    self.layer_1 = nn.Linear(in_features=input_size, out_features=hidden_size)
    self.relu = nn.ReLU()
    self.layer_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.output_layer = nn.Linear(in_features=hidden_size, out_features=num_classes)

  def forward(self, x):
    """Return the raw (un-normalised) class scores for ``x``."""
    hidden = self.relu(self.layer_1(x))
    hidden = self.relu(self.layer_2(hidden))
    return self.output_layer(hidden)

In [None]:
vocab = Counter()

# Tally lower-cased, single-space-delimited tokens over the training split
# first and the test split second, so insertion order follows that sequence.
for dataset in (newsgroups_train, newsgroups_test):
  for text in dataset.data:
    vocab.update(token.lower() for token in text.split(' '))

print("Total words:", len(vocab))

Total words: 591946


In [None]:
# Vocabulary size; text_to_vector uses it as the length of each count vector.
total_words = len(vocab)

def get_word_2_index(vocab):
  """Map each lower-cased word of ``vocab`` to its iteration position.

  If several entries lower-case to the same word, the position of the last
  such entry is kept.
  """
  return {token.lower(): position for position, token in enumerate(vocab)}

# Build the word -> index lookup from the vocabulary counter.
word2index = get_word_2_index(vocab)

# Spot-check the mapping with a common word.
print("Index of the word 'the':", word2index['the'])

Index of the word 'the': 40


In [None]:
def text_to_vector(text):
  """Return the bag-of-words count vector for ``text``.

  The text is split on single spaces and each token is lower-cased before
  being looked up in the module-level ``word2index`` mapping.

  Args:
    text: raw text string.

  Returns:
    A float array of length ``total_words`` in which position
    ``word2index[w]`` holds the number of occurrences of ``w``. Tokens that
    are absent from ``word2index`` are ignored.
  """
  layer = np.zeros(total_words, dtype=float)
  for word in text.split(' '):
    index = word2index.get(word.lower())
    # Skip out-of-vocabulary tokens; a plain subscript would raise KeyError
    # for any text that was not part of the vocabulary-building pass.
    if index is not None:
      layer[index] += 1

  return layer

In [None]:
def category_to_vector(category, num_classes=20):
  """Return the one-hot encoding of an integer class label.

  The original version always produced a 3-element vector and folded every
  label other than 0 and 1 into position 2, which cannot represent the 20
  newsgroup categories.

  Args:
    category: integer class index in ``[0, num_classes)``.
    num_classes: length of the returned vector; defaults to 20.

  Returns:
    A float array of length ``num_classes`` with 1.0 at ``category`` and
    0.0 elsewhere.

  Raises:
    ValueError: if ``category`` is outside ``[0, num_classes)``.
  """
  index = int(category)
  if not 0 <= index < num_classes:
    raise ValueError(
        "category %d is outside the range [0, %d)" % (index, num_classes))

  y = np.zeros(num_classes, dtype=float)
  y[index] = 1.
  return y

In [None]:
def get_batch(df, i, batch_size):
  """Build the feature and target arrays for batch number ``i``.

  Args:
    df: object exposing ``data`` (texts) and ``target`` (labels) sequences.
    i: zero-based batch index.
    batch_size: number of consecutive items per batch.

  Returns:
    A tuple ``(features, targets)`` of arrays: one ``text_to_vector`` row per
    text and one ``category_to_vector`` row per label in the batch.
  """
  start = i * batch_size
  stop = start + batch_size

  features = [text_to_vector(document) for document in df.data[start:stop]]
  targets = [category_to_vector(label) for label in df.target[start:stop]]

  return np.array(features), np.array(targets)


# Sanity check: print the shape of the feature matrix for batch 1 of size 100.
print("Each batch has 100 texts and each matrix has 591946 elements (words):", get_batch(newsgroups_train, 1, 100)[0].shape)

Each batch has 100 texts and each matrix has 591946 elements (words): (100, 591946)


In [None]:
# Width of each input row. It must equal the length of the vectors produced by
# text_to_vector (total_words); the previous value 11314 was the number of
# training documents and made the first linear layer reject every batch.
input_size = total_words
hidden_size = 128
num_classes = 20
learning_rate = 0.01

# Model, loss and optimizer used by the training loop.
net = OurNet(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

In [None]:
# Train
num_epochs = 5
batch_size = 100
# Number of full batches per epoch; a trailing partial batch is dropped.
# It does not change between epochs, so compute it once.
total_batch = len(newsgroups_train.data) // batch_size
for epoch in range(num_epochs):
  for i in range(total_batch):
    batch_x, batch_y = get_batch(newsgroups_train, i, batch_size)
    # Plain tensors carry autograd state themselves; no Variable wrapper is
    # needed.
    articles = torch.FloatTensor(batch_x)
    # Float targets are treated by CrossEntropyLoss as per-class
    # probabilities, so each row needs one column per network output.
    labels = torch.FloatTensor(batch_y)

    # Forward + Backward + Optimize
    optimizer.zero_grad() # zero the gradient buffer
    outputs = net(articles)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # The `%` operator was missing (the string was being called like a
    # function), and a 0-dim loss tensor must be read with .item() rather
    # than indexed with [0].
    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch+1, num_epochs, i+1, total_batch, loss.item()))