# Import Libraries

In [21]:
# Model inspired by https://qiita.com/takeshikondo/items/419bebc4f9e6c78d5ea9
# PyTorch code by fmireshg@eng.ucsd.edu
import numpy as np
import torch.nn as nn
import torch
import torch.optim as optim
from Model_20 import Model_20

import torch.nn as nn
from collections import OrderedDict

from tqdm import tqdm

from sklearn.datasets import fetch_20newsgroups

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model

# Get the data

In [2]:
categories = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, 
                                      categories=categories,)

print (newsgroups_train.target_names)
print (len(newsgroups_train.data))
print("    ***************  ")
#print (newsgroups_train.data[1])
print((newsgroups_train.data[1]))
print((categories[newsgroups_train.target[1]]))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
11314
    ***************  
From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day,

# Process data

In [3]:
texts = []

labels=newsgroups_train.target
texts = newsgroups_train.data

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)

print (sequences[0][:10])

[14, 4308, 1350, 15, 11126, 38, 250, 29, 42, 298]


In [4]:
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

#print(word_index.items())


Found 134142 unique tokens.


In [5]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(type(data))
print (data.shape)
print (data[0][-50:])

<class 'numpy.ndarray'>
(11314, 1000)
[  26 1835   14    1  816    3    1  726   17    9   44    8   88   27
  171   39    4  828  273 1078 2908  198    3 2804  153   17  298    9
  239  628   25  808  357   13   21   16   17  384  298  181  112  188
  206 1498 1341    2   13   35   58 7860]


In [6]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (11314, 1000)
Shape of label tensor: (11314,)


# Validation to Training ratio

In [7]:
VALIDATION_SPLIT = 0.2

indices = np.arange(data.shape[0])
print (indices)


[    0     1     2 ... 11311 11312 11313]


In [8]:
np.random.shuffle(indices) 
print(indices)


[ 6471  5451  6779 ... 10635  7867  1272]


In [9]:
data = data[indices] 
labels = labels[indices] 
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples] 
y_train = labels[:-nb_validation_samples] 
x_val = data[-nb_validation_samples:] 
y_val = labels[-nb_validation_samples:] 

print (x_train.shape)
print (y_train.shape)


(9052, 1000)
(9052,)


# Embedding

Download the pretrained embeddings from: https://www.kaggle.com/terenceliu4444/glove6b100dtxt

To learn more about embeddings, have a look at https://towardsdatascience.com/neural-network-embeddings-explained-4d028e6f0526


In [13]:

embeddings_index = {}

path = '/home/niloofar/'

f = open(path+'glove.6B.100d.txt')
for line in f:
    values = line.split(' ')
    word = values[0]
    #values[-1] = values[-1].replace('\n', '')
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    #print (values[1:])
f.close()
 
print ()
print ('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [14]:


EMBEDDING_DIM = 100

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    #embedding_vector = embeddings_index[word]
    if embedding_vector is not None:
    # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print (embedding_matrix.shape)
print (type(embedding_matrix))
print (embedding_matrix[0][:10])

(134143, 100)
<class 'numpy.ndarray'>
[0.93803958 0.71944306 0.34604118 0.01297138 0.03485201 0.71182131
 0.57334824 0.88424407 0.71130049 0.6870206 ]


In [15]:
class Model_20(nn.Module):

    def __init__(self, vocab_size, dim, embeddings):
        super(Model_20, self).__init__()
        self.vocab_size = vocab_size 
        self.dim = dim
        self.embedding = nn.Embedding(self.vocab_size, self.dim)
        self.convnet = nn.Sequential(OrderedDict([
            #('embed1', nn.Embedding(self.vocab_size, self.dim)),
            ('c1', nn.ConvTranspose1d(100, 128, 5)),
            ('relu1', nn.ReLU()),
            ('maxpool1', nn.MaxPool1d(5)),
            ('c2', nn.Conv1d(128, 128, 5)),
            ('relu2', nn.ReLU()),
            ('maxpool2', nn.MaxPool1d(5)),
            ('c3', nn.Conv1d(128, 128, 5)),
            ('relu3', nn.ReLU()),
            ('maxpool3', nn.MaxPool1d(35)),
        ]))
    
        self.embedding.weight = nn.Parameter(torch.FloatTensor(embeddings))
        #copy_((embeddings))
        self.embedding.weight.requires_grad = False
    
        self.fc = nn.Sequential(OrderedDict([
            ('f4', nn.Linear(128, 128)),
            ('relu4', nn.ReLU()),
            ('f5', nn.Linear(128, 20)),
            ('sig5', nn.LogSoftmax(dim=-1))
        ]))

    def forward(self, img):
        
        output = self.embedding(img)
        output.transpose_(1,2)
        output = self.convnet(output)
        output = output.view(img.size(0), -1)
        output = self.fc(output)
        
        return output


In [23]:
net = Model_20(embedding_matrix.shape[0], EMBEDDING_DIM, embedding_matrix)

criterion = nn.CrossEntropyLoss()

In [24]:
optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.0001)

In [25]:
def train(epoch):

    for i in range (0, x_train.shape[0], 128):

        optimizer.zero_grad()

        output = net(torch.LongTensor(x_train[i:i+128, :]))
        #print(torch.LongTensor(y_train[i:i+128]).shape)
        loss = criterion(output, torch.LongTensor(y_train[i:i+128]))


        print(loss)
        print ("____________________")
        #print(net.embedding.weight)
        
        loss.backward()
        optimizer.step()
       

In [26]:
def test():
    net.eval()
    total_correct = 0
    avg_loss = 0.0
    for i in range (0, x_val.shape[0], 128):

        output = net(torch.LongTensor(x_val[i:i+128, :]))
        #avg_loss += criterion(output, y_val[i:i+128]).sum()
        pred = output.detach().max(1)[1]
        total_correct += pred.eq(torch.LongTensor(y_val[i:i+128]).view_as(pred)).sum()

    #avg_loss /= len(data_test)
    print('Test Avg. Loss: %f, Accuracy: %f' % (avg_loss, float(total_correct) / x_val.shape[0]))



In [28]:
for i in range(1):
    train(i)
    

tensor(2.9966, grad_fn=<NllLossBackward>)
____________________
tensor(2.9954, grad_fn=<NllLossBackward>)
____________________
tensor(2.9960, grad_fn=<NllLossBackward>)
____________________
tensor(2.9935, grad_fn=<NllLossBackward>)
____________________
tensor(2.9948, grad_fn=<NllLossBackward>)
____________________
tensor(2.9969, grad_fn=<NllLossBackward>)
____________________
tensor(3.0014, grad_fn=<NllLossBackward>)
____________________
tensor(2.9939, grad_fn=<NllLossBackward>)
____________________
tensor(2.9951, grad_fn=<NllLossBackward>)
____________________
tensor(2.9940, grad_fn=<NllLossBackward>)
____________________
tensor(2.9940, grad_fn=<NllLossBackward>)
____________________
tensor(2.9877, grad_fn=<NllLossBackward>)
____________________
tensor(2.9888, grad_fn=<NllLossBackward>)
____________________
tensor(2.9968, grad_fn=<NllLossBackward>)
____________________
tensor(2.9972, grad_fn=<NllLossBackward>)
____________________
tensor(2.9901, grad_fn=<NllLossBackward>)
_____________

# An already traine

In [22]:

net = Model_20(embedding_matrix.shape[0], EMBEDDING_DIM, embedding_matrix)

net.load_state_dict(torch.load("20newsgroups-fixed"))

In [None]:
test() 