In [258]:
import numpy as np
import unicodedata
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch.autograd import Variable


In [2]:
import spacy
nlp = spacy.load('en')

In [200]:
num_classes = 5

glove_file = 'glove/glove.6B.50d.txt'
glove_dim = 50
#glove_file = 'glove/glove.840B.300d.txt'
#glove_dim = 300
use_lemmatization = False

cuda = False

In [4]:
def normalize_text(text):
    return unicodedata.normalize('NFD', text)

def load_glove_vocab(wv_file, wv_dim):
    vocab = set()
    with open(wv_file, encoding="utf8") as f:
        for line in f:
            elems = line.split()
            token = normalize_text(''.join(elems[0:-wv_dim]))
            vocab.add(token)
    return vocab

def build_embedding(wv_file, wv_dim, target_vocab):
    vocab_size = len(target_vocab)
    emb = np.random.uniform(-1, 1, (vocab_size, wv_dim))
    emb[0] = 0 # <PAD> should be all 0 (using broadcast)

    w2id = {w: i for i, w in enumerate(target_vocab)}
    with open(wv_file, encoding="utf8") as f:
        for line in f:
            elems = line.split()
            token = normalize_text(''.join(elems[0:-wv_dim]))
            if token == '<PAD>':
                print(token)
            if token in w2id:
                emb[w2id[token]] = [float(v) for v in elems[-wv_dim:]]
    return emb

def token2id(docs, vocab, unk_id=None):
    w2id = {w: i for i, w in enumerate(vocab)}
    ids = [[w2id[w] if w in w2id else unk_id for w in doc] for doc in docs]
    return ids

In [5]:
glove_vocab = load_glove_vocab(glove_file, glove_dim) 
#print('glove loaded.')

In [311]:
docs_train = []
#docs_raw.append('Based on fully-aware attention, we propose an end-to-end architecture qaz123')
#docs_raw.append('Teaching machines to read, process and comprehend text and then answer questions is one of key problems in artificial intelligence')
docs_train.append('I am proud of your achievements')
docs_train

['I am proud of your achievements']

In [229]:
df_train = pd.read_csv('emoji/train_emoji.csv', header=None)
docs_train = df_train[0]
Y_train = Variable(torch.LongTensor(df_train[1].values), requires_grad=False)
df_train

Unnamed: 0,0,1,2,3
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,
5,I love you mum,0,,
6,Stop saying bullshit,3,,
7,congratulations on your acceptance,2,,
8,The assignment is too long,3,,
9,I want to go play,1,,[3]


In [230]:
df_test = pd.read_csv('emoji/tesss.csv', header=None)
docs_test = df_test[0]
Y_test = Variable(torch.LongTensor(df_test[1].values), requires_grad=False)
df_test

Unnamed: 0,0,1
0,I want to eat\t,4
1,he did not answer\t,3
2,he got a very nice raise\t,2
3,she got me a nice present\t,2
4,ha ha ha it was so funny\t,2
5,he is a good friend\t,2
6,I am upset\t,3
7,We had such a lovely dinner tonight\t,2
8,where is the food\t,4
9,Stop making this joke ha ha ha\t,2


In [8]:
nlp_doc = nlp(docs_train[0])
for token in nlp_doc:
    if not token.is_punct and not token.is_space:
        print(token.text, token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ , token.pos_, token.ent_type_, token.ent_iob_)


never never ADV  O
talk talk VERB  O
to to ADP  O
me me PRON  O
again again ADV  O


In [9]:
docs_train_tokens = []
for doc in docs_train:
    nlp_doc = nlp(doc)
    #keep stop words 
    #spacy doesn't have stemming, only lemmatization
    if use_lemmatization:
        tokens = [token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    else:
        tokens = [token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    docs_train_tokens.append(tokens)

docs_train_tokens

[['never', 'talk', 'to', 'me', 'again'],
 ['i', 'am', 'proud', 'of', 'your', 'achievements'],
 ['it', 'is', 'the', 'worst', 'day', 'in', 'my', 'life'],
 ['miss', 'you', 'so', 'much'],
 ['food', 'is', 'life'],
 ['i', 'love', 'you', 'mum'],
 ['stop', 'saying', 'bullshit'],
 ['congratulations', 'on', 'your', 'acceptance'],
 ['the', 'assignment', 'is', 'too', 'long'],
 ['i', 'want', 'to', 'go', 'play'],
 ['she', 'did', 'not', 'answer', 'my', 'text'],
 ['your', 'stupidity', 'has', 'no', 'limit'],
 ['how', 'many', 'points', 'did', 'he', 'score'],
 ['my', 'algorithm', 'performs', 'poorly'],
 ['i', 'got', 'approved'],
 ['stop', 'shouting', 'at', 'me'],
 ['sounds', 'like', 'a', 'fun', 'plan', 'ha', 'ha'],
 ['no', 'one', 'likes', 'him'],
 ['the', 'game', 'just', 'finished'],
 ['i', 'will', 'celebrate', 'soon'],
 ['so', 'sad', 'you', 'are', 'not', 'coming'],
 ['she', 'is', 'my', 'dearest', 'love'],
 ['good', 'job'],
 ['it', 'was', 'funny', 'lol'],
 ['candy', 'is', 'life'],
 ['the', 'chicago', 'cu

In [10]:
docs_test_tokens = []
for doc in docs_test:
    nlp_doc = nlp(doc)
    if use_lemmatization:
        tokens = [token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    else:
        tokens = [token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    docs_test_tokens.append(tokens)

docs_test_tokens

[['i', 'want', 'to', 'eat'],
 ['he', 'did', 'not', 'answer'],
 ['he', 'got', 'a', 'very', 'nice', 'raise'],
 ['she', 'got', 'me', 'a', 'nice', 'present'],
 ['ha', 'ha', 'ha', 'it', 'was', 'so', 'funny'],
 ['he', 'is', 'a', 'good', 'friend'],
 ['i', 'am', 'upset'],
 ['we', 'had', 'such', 'a', 'lovely', 'dinner', 'tonight'],
 ['where', 'is', 'the', 'food'],
 ['stop', 'making', 'this', 'joke', 'ha', 'ha', 'ha'],
 ['where', 'is', 'the', 'ball'],
 ['work', 'is', 'hard'],
 ['this', 'girl', 'is', 'messing', 'with', 'me'],
 ['are', 'you', 'serious'],
 ['let', 'us', 'go', 'play', 'baseball'],
 ['this', 'stupid', 'grader', 'is', 'not', 'working'],
 ['work', 'is', 'horrible'],
 ['congratulation', 'for', 'having', 'a', 'baby'],
 ['stop', 'pissing', 'me', 'off'],
 ['any', 'suggestions', 'for', 'dinner'],
 ['i', 'love', 'taking', 'breaks'],
 ['you', 'brighten', 'my', 'day'],
 ['i', 'boiled', 'rice'],
 ['she', 'is', 'a', 'bully'],
 ['why', 'are', 'you', 'feeling', 'bad'],
 ['i', 'am', 'upset'],
 ['gi

In [11]:
all_docs = docs_train_tokens + docs_test_tokens
all_docs

[['never', 'talk', 'to', 'me', 'again'],
 ['i', 'am', 'proud', 'of', 'your', 'achievements'],
 ['it', 'is', 'the', 'worst', 'day', 'in', 'my', 'life'],
 ['miss', 'you', 'so', 'much'],
 ['food', 'is', 'life'],
 ['i', 'love', 'you', 'mum'],
 ['stop', 'saying', 'bullshit'],
 ['congratulations', 'on', 'your', 'acceptance'],
 ['the', 'assignment', 'is', 'too', 'long'],
 ['i', 'want', 'to', 'go', 'play'],
 ['she', 'did', 'not', 'answer', 'my', 'text'],
 ['your', 'stupidity', 'has', 'no', 'limit'],
 ['how', 'many', 'points', 'did', 'he', 'score'],
 ['my', 'algorithm', 'performs', 'poorly'],
 ['i', 'got', 'approved'],
 ['stop', 'shouting', 'at', 'me'],
 ['sounds', 'like', 'a', 'fun', 'plan', 'ha', 'ha'],
 ['no', 'one', 'likes', 'him'],
 ['the', 'game', 'just', 'finished'],
 ['i', 'will', 'celebrate', 'soon'],
 ['so', 'sad', 'you', 'are', 'not', 'coming'],
 ['she', 'is', 'my', 'dearest', 'love'],
 ['good', 'job'],
 ['it', 'was', 'funny', 'lol'],
 ['candy', 'is', 'life'],
 ['the', 'chicago', 'cu

In [12]:
#ignore words not in glove vocabulary
all_docs = docs_train_tokens + docs_test_tokens
vocab = list(set([word for doc in all_docs for word in doc if word in glove_vocab]))
vocab_ignored = set([word for doc in all_docs for word in doc if word not in glove_vocab])
vocab.insert(0, "<PAD>")
vocab.insert(1, "<UNK>")
print(vocab)
print(vocab_ignored)

['<PAD>', '<UNK>', 'brighten', 'congrats', 'was', 'the', 'grade', 'sushi', 'an', 'bullshit', 'she', 'shouting', 'chicago', 'we', 'her', 'kind', 'final', 'new', 'awesome', 'exercise', 'lovely', 'not', 'hard', 'exam', 'catcher', 'eat', 'traction', 'did', 'brunch', 'ball', 'hours', 'sounds', 'attractive', 'yesterday', 'stop', 'moment', 'enjoy', 'bully', 'raise', 'amazing', 'man', 'joke', 'dance', 'down', 'stupid', 'you', 'algorithm', 'smile', 'waiting', 'finished', 'is', 'pizza', 'code', 'horrible', 'score', 'when', 'a', 'ha', 'happy', 'lot', 'up', 'sucks', 'after', 'dear', 'date', 'rules', 'proud', 'two', 'grader', 'killing', 'humiliated', 'talented', 'have', 'cookies', 'together', 'back', 'zero', 'saying', 'adorable', 'bravo', 'one', 'dare', 'project', 'again', 'us', 'takes', 'much', 'home', 'my', 'excited', 'soon', 'with', 'poorly', 'him', 'people', 'can', 'give', 'limit', 'breakfast', 'for', 'cute', 'were', 'midterm', 'worst', 'impressed', 'tasty', 'baby', 'playing', 'but', 'of', 'lik

In [13]:
len(vocab)

313

In [14]:
docs_train_ids = token2id(docs_train_tokens, vocab, unk_id=1)
docs_train_ids

[[150, 139, 182, 244, 83],
 [282, 266, 66, 109, 151, 159],
 [132, 50, 5, 103, 242, 164, 88, 155],
 [124, 45, 264, 86],
 [211, 50, 155],
 [282, 199, 45, 200],
 [34, 77, 9],
 [228, 241, 151, 207],
 [5, 240, 50, 161, 177],
 [282, 251, 182, 224, 255],
 [10, 27, 21, 183, 88, 254],
 [151, 310, 311, 126, 97],
 [197, 268, 120, 27, 247, 54],
 [88, 46, 301, 92],
 [282, 312, 250],
 [34, 11, 138, 244],
 [31, 110, 56, 115, 235, 57, 57],
 [126, 80, 236, 93],
 [5, 146, 167, 49],
 [282, 131, 135, 90],
 [264, 190, 45, 160, 21, 272],
 [10, 50, 88, 169, 199],
 [261, 187],
 [132, 4, 271, 130],
 [215, 50, 155],
 [5, 12, 194, 141, 83],
 [282, 266, 299],
 [282, 266, 264, 89, 182, 225, 45, 62, 264, 177],
 [45, 27, 243, 241, 45, 23],
 [148, 28, 136, 242],
 [247, 50, 264, 100],
 [197, 81, 45, 213, 294],
 [198, 45, 251, 182, 295, 244, 99, 127],
 [282, 121, 201],
 [10, 50, 32],
 [45, 245],
 [10, 231, 56, 59],
 [247, 50, 179],
 [10, 85, 181, 182, 184, 278],
 [233, 178, 50, 264, 105],
 [13, 202, 132],
 [282, 266, 8

In [15]:
docs_test_ids = token2id(docs_test_tokens, vocab, unk_id=1)
docs_test_ids

[[282, 251, 182, 25],
 [247, 27, 21, 183],
 [247, 312, 56, 248, 218, 38],
 [10, 312, 244, 56, 218, 286],
 [57, 57, 57, 132, 4, 264, 271],
 [247, 50, 56, 261, 308],
 [282, 266, 298],
 [13, 305, 304, 56, 20, 127, 288],
 [209, 50, 5, 211],
 [34, 229, 142, 41, 57, 57, 57],
 [209, 50, 5, 29],
 [173, 50, 22],
 [142, 147, 50, 118, 91, 244],
 [160, 45, 265],
 [262, 84, 224, 255, 289],
 [142, 44, 68, 50, 21, 256],
 [173, 50, 53],
 [175, 99, 234, 56, 106],
 [34, 112, 244, 158],
 [133, 170, 99, 127],
 [282, 199, 149, 226],
 [45, 2, 88, 242],
 [282, 269, 180],
 [10, 50, 56, 37],
 [253, 160, 45, 113, 174],
 [282, 266, 298],
 [96, 244, 5, 29],
 [88, 239, 50, 5, 199, 109, 88, 155],
 [36, 151, 146],
 [196, 242, 50, 263],
 [282, 124, 45, 264, 86],
 [166, 5, 29],
 [88, 155, 50, 264, 129],
 [10, 121, 201],
 [131, 45, 237, 88, 196],
 [247, 95, 117, 221, 243],
 [42, 91, 244],
 [282, 266, 299],
 [225, 45, 138, 5, 220],
 [282, 110, 182, 232],
 [282, 131, 188],
 [282, 110, 151, 176],
 [282, 124, 14],
 [246, 5

In [16]:
embeddings = build_embedding(glove_file, glove_dim, vocab)
embeddings
#log.info('got embedding matrix for training.')

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.38919181,  0.66708405,  0.72470655, ..., -0.79198276,
         0.92856789,  0.8332264 ],
       [ 0.34118   ,  0.13465   , -0.056107  , ..., -0.71771   ,
         0.40332   , -0.038861  ],
       ..., 
       [-0.49179   , -0.12295   , -0.50197   , ...,  0.36601   ,
         0.43279   , -0.26038   ],
       [ 0.54822   ,  0.038847  ,  0.10127   , ...,  0.26588   ,
        -0.40267   , -0.17111   ],
       [-0.4097    , -0.37167   ,  0.38852   , ..., -0.25414   ,
         0.040372  ,  0.38652   ]])

In [17]:
embeddings[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [None]:
#TODO (preprocess): save vocab, embeddings in metadata file, docs_ids in csv file

In [249]:
X_train = torch.LongTensor (len(docs_train_ids), max([len(doc) for doc in docs_train_ids])).fill_(0)
X_train_mask = torch.LongTensor (len(docs_train_ids)).fill_(0)
for index, doc in enumerate(docs_train_ids):
    X_train[index, :len(doc)] = torch.LongTensor(doc)
    X_train_mask[index] = len(doc)
    
X_train_mask = X_train_mask.unsqueeze(1)

print(X_train)
print(X_train_mask)


  150   139   182  ...      0     0     0
  282   266    66  ...      0     0     0
  132    50     5  ...    155     0     0
       ...          ⋱          ...       
  246    50   192  ...      0     0     0
  282   199    45  ...      0     0     0
  258   187     0  ...      0     0     0
[torch.LongTensor of size 132x10]


    5
    6
    8
    4
    3
    4
    3
    4
    5
    5
    6
    5
    6
    4
    3
    4
    7
    4
    4
    4
    6
    5
    2
    4
    3
    5
    3
   10
    6
    4
    4
    5
    8
    3
    3
    2
    4
    3
    6
    5
    3
    3
    4
    1
    4
    4
   10
    6
    4
    5
    3
    7
    6
    3
    3
    4
    2
    5
    6
    6
    4
    5
    5
    3
    7
    3
    4
    3
    5
   10
    3
    5
    4
    8
    1
    6
    3
    6
    9
    3
   10
    6
    1
    4
    7
    4
    5
    1
    4
    4
    4
    6
    3
    4
    4
    7
    8
    6
    3
    7
    4
    6
    2
    4
    5
    6
    5
    6
    4
    3
    7
   

In [250]:
X_test = torch.LongTensor (len(docs_test_ids), max([len(doc) for doc in docs_test_ids])).fill_(0)
X_test_mask = torch.LongTensor (len(docs_test_ids)).fill_(0)
for index, doc in enumerate(docs_test_ids):
    X_test[index, :len(doc)] = torch.LongTensor (doc)
    X_test_mask[index] = len(doc)
    
X_test_mask = X_test_mask.unsqueeze(1)

print(X_test)
print(X_test_mask)


  282   251   182    25     0     0     0     0
  247    27    21   183     0     0     0     0
  247   312    56   248   218    38     0     0
   10   312   244    56   218   286     0     0
   57    57    57   132     4   264   271     0
  247    50    56   261   308     0     0     0
  282   266   298     0     0     0     0     0
   13   305   304    56    20   127   288     0
  209    50     5   211     0     0     0     0
   34   229   142    41    57    57    57     0
  209    50     5    29     0     0     0     0
  173    50    22     0     0     0     0     0
  142   147    50   118    91   244     0     0
  160    45   265     0     0     0     0     0
  262    84   224   255   289     0     0     0
  142    44    68    50    21   256     0     0
  173    50    53     0     0     0     0     0
  175    99   234    56   106     0     0     0
   34   112   244   158     0     0     0     0
  133   170    99   127     0     0     0     0
  282   199   149   226     0     0    

In [191]:
if cuda:
    X_train = X_train.cuda()
    X_train_mask = X_train_mask.cuda()
    Y_train = Y_train.cuda()

    X_test = X_test.cuda()
    X_test_mask = X_test_mask.cuda()
    Y_test = Y_test.cuda()

In [251]:
print(type(X_train))
print(type(X_train_mask))
print(X_train.size())
print(X_train_mask.size())

<class 'torch.LongTensor'>
<class 'torch.LongTensor'>
torch.Size([132, 10])
torch.Size([132, 1])


In [260]:
test1 = torch.cat((X_train, X_train_mask), 1)
test2 = torch.cat((X_test, X_test_mask), 1)
#dataset = torch.utils.data.TensorDataset(torch.cat((X_train, X_train_mask), 1), torch.cat((X_test, X_test_mask), 1))
dataset = data_utils.TensorDataset(test1, Y_train)
dataset

<torch.utils.data.dataset.TensorDataset at 0x209c1181898>

In [171]:
seed = 0
#random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed_all(seed)
#TODO: if load model, synchronize random seed

In [201]:
#word embeddings, average, 
class Model1_LR(nn.Module):
    def __init__(self, vocab, embeddings, num_classes):
        super(Model1_LR, self).__init__()    
    
        self.num_classes = num_classes
        
        #embedding layer
        self.embedding_dim = len(embeddings[0])
        self.embedding = nn.Embedding(len(vocab),         #vocab size
                                      self.embedding_dim, #embedding_dim
                                      padding_idx=0)
        self.embedding.weight.data = torch.Tensor(embeddings)
        #do not backprop into embeddings
        for p in self.embedding.parameters():
            p.requires_grad = False
            
        #linear layer
        self.linear = nn.Linear(self.embedding_dim, num_classes)
        #nn.init.xavier_normal(self.linear.weight)
        #self.linear.bias.data.zero_()
     
    def forward(self, X, X_mask):
        #X: [m, Tx] m = batch size, Tx = word count
        #print(X.size(), type(X))
        m = X.size()[0]
        Tx = X.size()[1]
        
        X = self.embedding(X)
        #X: [m, Tx, embedding_dim] m = batch size, Tx = word count
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, Tx, self.embedding_dim])
                
        #average words in doc. use mask so we average only words not padding
        X = torch.sum(X, 1)
        X = Variable(torch.div(X.data, X_mask))
        #X: [m, emb_dim]
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, self.embedding_dim])
        
        X = self.linear(X)
        #X: [m, 1]
        #print(X.size(), type(X))
        assert X.size() == torch.Size([m, self.num_classes])
        
        return F.softmax(X)

In [225]:
model = Model1_LR(vocab, embeddings, num_classes)
if cuda:
    model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-2)
epoch = 0

In [135]:
class Model2_LSTM(nn.Module):
    def __init__(self, vocab, embeddings, num_classes):
        super(Model2_LSTM, self).__init__()    
    
        self.num_classes = num_classes
        
        #embedding layer
        self.embedding_dim = len(embeddings[0])
        self.embedding = nn.Embedding(len(vocab),         #vocab size
                                      self.embedding_dim, #embedding_dim
                                      padding_idx=0)
        self.embedding.weight.data = torch.Tensor(embeddings)
        #do not backprop into embeddings
        for p in self.embedding.parameters():
            p.requires_grad = False
            
        #LSTM1, hidden_size = 128
        #TODO: try bidirectional=True
        self.LSTM1_hidden_size = 128
        self.LSTM1 = nn.LSTM(self.embedding_dim, self.LSTM1_hidden_size)
        
        #dropout
        self.dropout = nn.Dropout()
     
        #LSTM, hidden_size = 128
        #TODO: try bidirectional=True
        self.LSTM2_hidden_size = 128
        self.LSTM2 = nn.LSTM(self.LSTM1_hidden_size, self.LSTM2_hidden_size)
        
        #linear layer
        self.linear = nn.Linear(self.LSTM2_hidden_size, num_classes)

    def forward(self, X, X_mask):
        #X: [m, Tx] m = batch size, Tx = word count
        #print(X.size(), type(X))
        m = X.size()[0]
        Tx = X.size()[1]

        #embedding layer
        X = self.embedding(X)
        #X: [m, Tx, embedding_dim] 
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, Tx, self.embedding_dim])
           
        #LSTM1
        # Transpose batch and sequence dims
        X = X.transpose(0, 1)
        X, _ = self.LSTM1(X)
        # Transpose back
        X = X.transpose(0, 1)
        #X: [m, Tx, LSTM1_hidden_size] 
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, Tx, self.LSTM1_hidden_size])
        
        #dropout
        X = self.dropout(X)

        #LSTM2, reduce dimension
        # Transpose batch and sequence dims
        X = X.transpose(0, 1)
        _, X = self.LSTM2(X)
        X = X[0]
        # Transpose back
        X = X.transpose(0, 1)
        X = torch.squeeze(X)
        #X: [m, LSTM2_hidden_size] 
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, self.LSTM2_hidden_size])
        
        #dropout
        X = self.dropout(X)

        #linear
        X = self.linear(X)
        #X: [m, 1]
        #print(X.size(), type(X))
        assert X.size() == torch.Size([m, self.num_classes])
        
        return F.softmax(X)

In [152]:
model = Model2_LSTM(vocab, embeddings, num_classes)
if cuda:
    model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
epoch = 0

In [226]:
model

Model1_LR (
  (embedding): Embedding(313, 50, padding_idx=0)
  (linear): Linear (50 -> 5)
)

In [227]:
Y_predict.size()

torch.Size([132, 5])

In [231]:
#workaround: X_train_mask to float so div works. Must be done before .cuda() call
X_train_mask = torch.FloatTensor(X_train_mask.numpy().astype(float))
Y_predict = model(X_train, X_train_mask)
criterion = nn.CrossEntropyLoss()
loss = criterion(Y_predict, Y_train)
loss

Variable containing:
 1.6082
[torch.FloatTensor of size 1]

In [233]:
X_train_mask.size()

torch.Size([132, 1])

In [235]:
#TODO: use DataLoader, batches
#move workaround just before div
#workaround: X_train_mask to float so div works. Must be done before .cuda() call
X_train_mask = torch.FloatTensor(X_train_mask.numpy().astype(float))
X_test_mask = torch.FloatTensor(X_test_mask.numpy().astype(float))
for epoch_local in range(1000):
    #Forward pass
    model.train()

    Y_predict = model(X_train, X_train_mask)

    #Compute loss
    loss = criterion(Y_predict, Y_train)
    
    if epoch % 100 == 0:
        #Calculate train and test accuracy
        _, Y_predict = torch.max(Y_predict, 1)
        correct = (Y_predict == Y_train).sum()
        correct = correct.cpu().data.numpy()[0]
        accuracy_train = correct/Y_train.size(0)

        model.eval()
        Y_predict = model(X_test, X_test_mask)
        _, Y_predict = torch.max(Y_predict, 1)
        correct = (Y_predict == Y_test).sum()
        correct = correct.cpu().data.numpy()[0]
        accuracy_test = correct/Y_test.size(0)
        
        print("epoch {0:06d} loss {1:.4f} train acc {2:.4f} test acc {3:.4f}".format(epoch, loss.cpu().data.numpy()[0], accuracy_train, accuracy_test))

    #Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    epoch += 1

epoch 003300 loss 1.4388 train acc 0.5606 test acc 0.5357
epoch 003400 loss 1.4361 train acc 0.5606 test acc 0.5357
epoch 003500 loss 1.4335 train acc 0.5682 test acc 0.5357
epoch 003600 loss 1.4309 train acc 0.5682 test acc 0.5357
epoch 003700 loss 1.4284 train acc 0.5682 test acc 0.5357
epoch 003800 loss 1.4260 train acc 0.5758 test acc 0.5357
epoch 003900 loss 1.4236 train acc 0.5833 test acc 0.5357
epoch 004000 loss 1.4213 train acc 0.5833 test acc 0.5357
epoch 004100 loss 1.4191 train acc 0.5833 test acc 0.5536
epoch 004200 loss 1.4169 train acc 0.5833 test acc 0.5536


In [149]:
Y_predict = model(X_test, X_test_mask)
_, Y_predict = torch.max(Y_predict, 1)
Y_predict

Variable containing:
 4
 3
 2
 2
 2
 0
 3
 2
 1
 4
 1
 0
 0
 3
 1
 0
 0
 2
 0
 2
 0
 0
 4
 0
 3
 3
 2
 0
 1
 2
 0
 1
 3
 2
 0
 0
 2
 4
 1
 2
 1
 0
 0
 0
 2
 0
 0
 2
 3
 1
 1
 1
 3
 2
 2
 4
[torch.cuda.LongTensor of size 56 (GPU 0)]

In [151]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test.data.cpu().numpy(), Y_predict.data.cpu().numpy())

array([[ 6,  1,  0,  0,  0],
       [ 2,  5,  1,  0,  0],
       [ 4,  0, 13,  0,  1],
       [ 6,  2,  0,  8,  0],
       [ 0,  2,  1,  0,  4]], dtype=int64)

In [404]:
df_test[df_test[1] == 0]

Unnamed: 0,0,1
20,I love taking breaks\t,0
27,My grandmother is the love of my life\t,0
30,I miss you so much\t,0
41,I like your jacket \t,0
42,i miss her\t,0
45,I love you to the stars and back\t,0
51,family is all I have\t,0


In [405]:
for i in df_test[df_test[1] == 0].index:
    if Y_predict[i].data.numpy()[0] == 3:
        print(i, df_test.iloc[i][0])

20 I love taking breaks	
30 I miss you so much	
42 i miss her	
51 family is all I have	
