In [31]:
import random

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import *
from torchvision import transforms

import nonechucks as nc

from dataset.semcat import SEMCATDataset
from dataset.semcat import OneVsAll, Text8Embedding
from imbalanced_sampler import ImbalancedDatasetSampler
from dnn import DNN

### Configuration

In [2]:
TRAIN_TEST_SPLIT = 0.8
BATCH_SIZE = 4
NUM_EPOCHS = 2
LEARNING_RATE = 0.0001

### Initialize SEMCAT dataset with Embedding transform

In [3]:
embedding = Text8Embedding('model.pkl')
composition = transforms.Compose([
    embedding
])
semcat = SEMCATDataset(transform=composition)
semcat = nc.SafeDataset(semcat)

### Define function to train the classifiers

In [4]:
dataset_size = len(semcat.dataset)
dataset_size

9197

In [5]:
def get_classifier_accuracy(net, test_loader):
    net.eval()
    
    correct = 0
    total = 0
    
    with torch.no_grad():
        for data in test_loader:
            x, y = data['data'], data['category']
            outputs = net(x)
            predicted = round(outputs.data[0])
            total += y.size(0)
            correct += (predicted == y).sum().item()
        return 100 * float(correct) / total

In [6]:
def train_clf_for_category(category):
    # Define net, loss function, and optimizer
    net = DNN()
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
    
    # This will hold a list of classifiers - one for each SEMCAT category
    classifiers = []
    # List of indices
    dataset_size = len(semcat)
    indices = list(range(dataset_size))
    
    # Shuffle indices
    random.shuffle(indices)
    
    # Initialize training dataloader
    train_indices = indices[:int(TRAIN_TEST_SPLIT * dataset_size)]
    train_sampler = nc.SafeSampler(semcat, SubsetRandomSampler(train_indices))
    train_loader = nc.SafeDataLoader(
        semcat,
        sampler=train_sampler,
        batch_size=BATCH_SIZE
    )
    
    # Train the neural network
    for epoch in range(1):
        running_loss = 0.0
        for i_batch, batched_sample in enumerate(train_loader):
            inputs, labels = batched_sample['data'], batched_sample['category']
            labels = torch.autograd.Variable(labels).float()
            # Zero the parameter gradients
            optimizer.zero_grad()
            # Forward + Backward + Optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
#           print(loss)
            optimizer.step()
            
            # Update running loss
            running_loss += loss.item()
            if i_batch % 100 == 99:
                print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i_batch + 1, running_loss / 100))
                running_loss = 0.0

    print('Finished Training')
    
    # Initialize test dataloader
    test_indices = indices[int(TRAIN_TEST_SPLIT * dataset_size):]
    test_sampler = nc.SafeSampler(semcat, SubsetRandomSampler(test_indices))
    test_loader = nc.SafeDataLoader(
        semcat,
        sampler=test_sampler
    )
    accuracy = get_classifier_accuracy(net, test_loader)
    print('Test Accuracy: {} %'.format(accuracy))
    
    classifiers.append(net)

### For each SEMCAT category, train a classifier

In [7]:
for category in SEMCATDataset.CATEGORY_FILES.keys():
    print('Training for category: {}'.format(category))

    # Convert multiclass labels into one-vs-all format
    composition.transforms.append(OneVsAll(category))
    train_clf_for_category(category)
    composition.transforms.pop()
    # TODO remove this to train for all categories
    break

Training for category: office
[1,   100] loss: 0.410
[1,   200] loss: 0.308
[1,   300] loss: 0.287
[1,   400] loss: 0.259
[1,   500] loss: 0.265
[1,   600] loss: 0.255
[1,   700] loss: 0.254
[1,   800] loss: 0.247
[1,   900] loss: 0.217
[1,  1000] loss: 0.230
[1,  1100] loss: 0.194
[1,  1200] loss: 0.235
[1,  1300] loss: 0.209
[1,  1400] loss: 0.222
[1,  1500] loss: 0.200
[1,  1600] loss: 0.219
[1,  1700] loss: 0.158
Finished Training
Test Accuracy: 97.1760797342 %


  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


### For each category, get average embedding value for each dimension

In [32]:
office_words_embeddings = np.array([word['data'] for word in semcat if word['category'] == 'office'])

In [28]:
embedding_size = len(office_words_embeddings[0])
embedding_size

200

In [38]:
mean_office_embedding = np.mean(office_words_embeddings, axis=0)