In [1]:
import numpy as np
import unicodedata
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch.autograd import Variable


In [2]:
import spacy
nlp = spacy.load('en')

In [23]:
num_classes = 5

#glove_file = 'glove/glove.6B.50d.txt'
#glove_dim = 50
glove_file = 'glove/glove.840B.300d.txt'
glove_dim = 300
use_lemmatization = False

cuda = True

In [4]:
def normalize_text(text):
    return unicodedata.normalize('NFD', text)

def load_glove_vocab(wv_file, wv_dim):
    vocab = set()
    with open(wv_file, encoding="utf8") as f:
        for line in f:
            elems = line.split()
            token = normalize_text(''.join(elems[0:-wv_dim]))
            vocab.add(token)
    return vocab

def build_embedding(wv_file, wv_dim, target_vocab):
    vocab_size = len(target_vocab)
    emb = np.random.uniform(-1, 1, (vocab_size, wv_dim))
    emb[0] = 0 # <PAD> should be all 0 (using broadcast)

    w2id = {w: i for i, w in enumerate(target_vocab)}
    with open(wv_file, encoding="utf8") as f:
        for line in f:
            elems = line.split()
            token = normalize_text(''.join(elems[0:-wv_dim]))
            if token == '<PAD>':
                print(token)
            if token in w2id:
                emb[w2id[token]] = [float(v) for v in elems[-wv_dim:]]
    return emb

def token2id(docs, vocab, unk_id=None):
    w2id = {w: i for i, w in enumerate(vocab)}
    ids = [[w2id[w] if w in w2id else unk_id for w in doc] for doc in docs]
    return ids

In [5]:
glove_vocab = load_glove_vocab(glove_file, glove_dim) 
print('glove loaded.')

glove loaded.


In [6]:
len(glove_vocab)

2195836

In [7]:
'sign in' in glove_vocab

False

In [8]:
count = 0
for x in glove_vocab:
    if '-' in x:
        print(x)
        count += 1
        if count == 100: break
count

finely-ground
HIGH-QUALITY
64-41
44-years-old
06-01-2009
Eight-channel
B-Flex
non-judgement
super-secretive
Euro-zone
-5.45
Procure-to-Pay
Verse-by-Verse
Sixty-Fifth
Double-Coated
02-08-2005
01-Mar-2011
14-01-2010
1-5x
WF-2540
dat-sick
now-destroyed
24-Apr-2005
THIRTY-FIVE
inter-element
q-o-q
2006-01-23
62-35
suitcase-style
non-renewables
360-Degree
co-organizer
-3177
30-Jul-2006
-4206
5-Yr
Tri-power
1878-10-25
1798-1800
e-code
101-106
candy-coat
Medi-Pak
0.93-1
well-rounded
RG-174
out-of-memory
Easy-to-learn
11-17-08
20th-Feb-2013
psycho-therapeutic
000-M16
Heart-Attack
time-pass
single-hit
Aug-10-2012
n-gon
anti-surge
Greensboro-High
Find-Locksmith
British-themed
2009-09-20
54-year-olds
408-410
Light-heavyweight
-4151
2-for-17
free-thought
Self-Immolation
one-subject
celui-ci
smart-alecky
TL-01
foster-care
X-Alp
League-Wide
above-average
4-core
backed-up
ragged-looking
Short-id
dma-mapping
bottle05-20-2013
US-6
173-186
www.china-led-fluorescent-tube.com/PRODUCT/LANG-af/led
Aug-10-09


100

In [311]:
docs_train = []
#docs_raw.append('Based on fully-aware attention, we propose an end-to-end architecture qaz123')
#docs_raw.append('Teaching machines to read, process and comprehend text and then answer questions is one of key problems in artificial intelligence')
docs_train.append('I am proud of your achievements')
docs_train

['I am proud of your achievements']

In [9]:
df_train = pd.read_csv('emoji/train_emoji.csv', header=None)
docs_train = df_train[0]
Y_train = Variable(torch.LongTensor(df_train[1].values), requires_grad=False)
df_train

Unnamed: 0,0,1,2,3
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,
5,I love you mum,0,,
6,Stop saying bullshit,3,,
7,congratulations on your acceptance,2,,
8,The assignment is too long,3,,
9,I want to go play,1,,[3]


In [10]:
df_test = pd.read_csv('emoji/tesss.csv', header=None)
docs_test = df_test[0]
Y_test = Variable(torch.LongTensor(df_test[1].values), requires_grad=False)
df_test

Unnamed: 0,0,1
0,I want to eat\t,4
1,he did not answer\t,3
2,he got a very nice raise\t,2
3,she got me a nice present\t,2
4,ha ha ha it was so funny\t,2
5,he is a good friend\t,2
6,I am upset\t,3
7,We had such a lovely dinner tonight\t,2
8,where is the food\t,4
9,Stop making this joke ha ha ha\t,2


In [8]:
nlp_doc = nlp(docs_train[0])
for token in nlp_doc:
    if not token.is_punct and not token.is_space:
        print(token.text, token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ , token.pos_, token.ent_type_, token.ent_iob_)


never never ADV  O
talk talk VERB  O
to to ADP  O
me me PRON  O
again again ADV  O


In [11]:
docs_train_tokens = []
for doc in docs_train:
    nlp_doc = nlp(doc)
    #keep stop words 
    #spacy doesn't have stemming, only lemmatization
    if use_lemmatization:
        tokens = [token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    else:
        tokens = [token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    docs_train_tokens.append(tokens)

docs_train_tokens

[['never', 'talk', 'to', 'me', 'again'],
 ['i', 'am', 'proud', 'of', 'your', 'achievements'],
 ['it', 'is', 'the', 'worst', 'day', 'in', 'my', 'life'],
 ['miss', 'you', 'so', 'much'],
 ['food', 'is', 'life'],
 ['i', 'love', 'you', 'mum'],
 ['stop', 'saying', 'bullshit'],
 ['congratulations', 'on', 'your', 'acceptance'],
 ['the', 'assignment', 'is', 'too', 'long'],
 ['i', 'want', 'to', 'go', 'play'],
 ['she', 'did', 'not', 'answer', 'my', 'text'],
 ['your', 'stupidity', 'has', 'no', 'limit'],
 ['how', 'many', 'points', 'did', 'he', 'score'],
 ['my', 'algorithm', 'performs', 'poorly'],
 ['i', 'got', 'approved'],
 ['stop', 'shouting', 'at', 'me'],
 ['sounds', 'like', 'a', 'fun', 'plan', 'ha', 'ha'],
 ['no', 'one', 'likes', 'him'],
 ['the', 'game', 'just', 'finished'],
 ['i', 'will', 'celebrate', 'soon'],
 ['so', 'sad', 'you', 'are', 'not', 'coming'],
 ['she', 'is', 'my', 'dearest', 'love'],
 ['good', 'job'],
 ['it', 'was', 'funny', 'lol'],
 ['candy', 'is', 'life'],
 ['the', 'chicago', 'cu

In [12]:
docs_test_tokens = []
for doc in docs_test:
    nlp_doc = nlp(doc)
    if use_lemmatization:
        tokens = [token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    else:
        tokens = [token.lower_ for token in nlp_doc if not token.is_punct and not token.is_space]
    docs_test_tokens.append(tokens)

docs_test_tokens

[['i', 'want', 'to', 'eat'],
 ['he', 'did', 'not', 'answer'],
 ['he', 'got', 'a', 'very', 'nice', 'raise'],
 ['she', 'got', 'me', 'a', 'nice', 'present'],
 ['ha', 'ha', 'ha', 'it', 'was', 'so', 'funny'],
 ['he', 'is', 'a', 'good', 'friend'],
 ['i', 'am', 'upset'],
 ['we', 'had', 'such', 'a', 'lovely', 'dinner', 'tonight'],
 ['where', 'is', 'the', 'food'],
 ['stop', 'making', 'this', 'joke', 'ha', 'ha', 'ha'],
 ['where', 'is', 'the', 'ball'],
 ['work', 'is', 'hard'],
 ['this', 'girl', 'is', 'messing', 'with', 'me'],
 ['are', 'you', 'serious'],
 ['let', 'us', 'go', 'play', 'baseball'],
 ['this', 'stupid', 'grader', 'is', 'not', 'working'],
 ['work', 'is', 'horrible'],
 ['congratulation', 'for', 'having', 'a', 'baby'],
 ['stop', 'pissing', 'me', 'off'],
 ['any', 'suggestions', 'for', 'dinner'],
 ['i', 'love', 'taking', 'breaks'],
 ['you', 'brighten', 'my', 'day'],
 ['i', 'boiled', 'rice'],
 ['she', 'is', 'a', 'bully'],
 ['why', 'are', 'you', 'feeling', 'bad'],
 ['i', 'am', 'upset'],
 ['gi

In [13]:
all_docs = docs_train_tokens + docs_test_tokens
all_docs

[['never', 'talk', 'to', 'me', 'again'],
 ['i', 'am', 'proud', 'of', 'your', 'achievements'],
 ['it', 'is', 'the', 'worst', 'day', 'in', 'my', 'life'],
 ['miss', 'you', 'so', 'much'],
 ['food', 'is', 'life'],
 ['i', 'love', 'you', 'mum'],
 ['stop', 'saying', 'bullshit'],
 ['congratulations', 'on', 'your', 'acceptance'],
 ['the', 'assignment', 'is', 'too', 'long'],
 ['i', 'want', 'to', 'go', 'play'],
 ['she', 'did', 'not', 'answer', 'my', 'text'],
 ['your', 'stupidity', 'has', 'no', 'limit'],
 ['how', 'many', 'points', 'did', 'he', 'score'],
 ['my', 'algorithm', 'performs', 'poorly'],
 ['i', 'got', 'approved'],
 ['stop', 'shouting', 'at', 'me'],
 ['sounds', 'like', 'a', 'fun', 'plan', 'ha', 'ha'],
 ['no', 'one', 'likes', 'him'],
 ['the', 'game', 'just', 'finished'],
 ['i', 'will', 'celebrate', 'soon'],
 ['so', 'sad', 'you', 'are', 'not', 'coming'],
 ['she', 'is', 'my', 'dearest', 'love'],
 ['good', 'job'],
 ['it', 'was', 'funny', 'lol'],
 ['candy', 'is', 'life'],
 ['the', 'chicago', 'cu

In [14]:
#ignore words not in glove vocabulary
all_docs = docs_train_tokens + docs_test_tokens
vocab = list(set([word for doc in all_docs for word in doc if word in glove_vocab]))
vocab_ignored = set([word for doc in all_docs for word in doc if word not in glove_vocab])
vocab.insert(0, "<PAD>")
vocab.insert(1, "<UNK>")
print(vocab)
print(vocab_ignored)

['<PAD>', '<UNK>', 'score', 'him', 'an', 'on', 'algorithm', 'funny', 'dogs', 'proud', 'grade', 'why', 'deserve', 'feeling', 'boiled', 'smile', 'brighten', 'her', 'taking', 'life', 'incredibly', 'had', 'restaurant', 'alone', 'ball', 'code', 'forever', 'who', 'friendly', 'shouting', 'ordering', 'let', 'off', 'valentine', 'pitch', 'yesterday', 'is', 'could', 'all', 'ever', 'hungry', 'by', 'us', 'prize', 'not', 'upset', 'stadium', 'bully', 'dance', 'smiles', 'mean', 'they', 'dearest', 'your', 'raise', 'cute', 'midterm', 'cheese', 'well', 'wallet', 'assignment', 'people', 'performs', 'fun', 'cookies', 'go', 'candy', 'text', 'have', 'like', 'family', 'failing', 'points', 'for', 'worst', 'when', 'congrats', 'are', 'how', 'hate', 'horrible', 'to', 'anything', 'guy', 'favorite', 'see', 'qualified', 'dare', 'suck', 'give', 'get', 'macaroon', 'new', 'sushi', 'my', 'cake', 'wrong', 'sounds', 'we', 'indian', 'with', 'grandmother', 'me', 'mum', 'again', 'lost', 'finished', 'adore', 'few', 'away', 'i

In [15]:
len(vocab)

313

In [16]:
docs_train_ids = token2id(docs_train_tokens, vocab, unk_id=1)
docs_train_ids

[[147, 120, 81, 102, 104],
 [280, 249, 9, 207, 53, 198],
 [202, 36, 145, 74, 236, 110, 94, 19],
 [199, 150, 118, 171],
 [143, 36, 19],
 [280, 212, 150, 103],
 [190, 220, 272],
 [213, 5, 53, 172],
 [145, 60, 36, 165, 170],
 [280, 250, 81, 65, 193],
 [189, 273, 44, 292, 94, 67],
 [53, 164, 234, 264, 262],
 [78, 271, 72, 273, 191, 2],
 [94, 6, 62, 225],
 [280, 231, 310],
 [190, 29, 203, 102],
 [97, 69, 287, 63, 132, 160, 160],
 [264, 200, 256, 3],
 [145, 274, 157, 106],
 [280, 214, 138, 298],
 [118, 227, 150, 77, 44, 257],
 [189, 36, 94, 52, 212],
 [149, 116],
 [202, 159, 7, 140],
 [66, 36, 19],
 [145, 206, 173, 255, 104],
 [280, 249, 40],
 [280, 249, 118, 268, 81, 85, 150, 237, 118, 170],
 [150, 273, 58, 5, 150, 269],
 [208, 288, 232, 236],
 [191, 36, 118, 55],
 [78, 87, 150, 278, 241],
 [254, 150, 250, 81, 146, 102, 73, 178],
 [280, 180, 222],
 [189, 36, 131],
 [150, 88],
 [189, 49, 287, 217],
 [191, 36, 294],
 [189, 174, 26, 81, 90, 136],
 [177, 91, 36, 118, 119],
 [98, 295, 202],
 [28

In [17]:
docs_test_ids = token2id(docs_test_tokens, vocab, unk_id=1)
docs_test_ids

[[280, 250, 81, 243],
 [191, 273, 44, 292],
 [191, 231, 287, 279, 307, 54],
 [189, 231, 102, 287, 307, 128],
 [160, 160, 160, 202, 159, 118, 7],
 [191, 36, 287, 149, 246],
 [280, 249, 45],
 [98, 21, 238, 287, 312, 178, 277],
 [182, 36, 145, 143],
 [190, 248, 228, 235, 160, 160, 160],
 [182, 36, 145, 24],
 [154, 36, 176],
 [228, 281, 36, 266, 100, 102],
 [77, 150, 215],
 [31, 42, 65, 193, 205],
 [228, 252, 305, 36, 44, 204],
 [154, 36, 80],
 [296, 73, 284, 287, 153],
 [190, 297, 102, 32],
 [152, 163, 73, 178],
 [280, 212, 18, 195],
 [150, 16, 94, 236],
 [280, 14, 187],
 [189, 36, 287, 47],
 [11, 77, 150, 13, 117],
 [280, 249, 45],
 [89, 102, 145, 24],
 [94, 101, 36, 145, 212, 207, 94, 19],
 [261, 53, 274],
 [33, 236, 36, 216],
 [280, 199, 150, 118, 171],
 [181, 145, 24],
 [94, 19, 36, 118, 142],
 [189, 180, 222],
 [214, 150, 133, 94, 33],
 [191, 239, 34, 226, 58],
 [48, 100, 102],
 [280, 249, 40],
 [85, 150, 203, 145, 22],
 [280, 69, 81, 123],
 [280, 214, 121],
 [280, 69, 53, 219],
 [28

In [18]:
embeddings = build_embedding(glove_file, glove_dim, vocab)
embeddings
#log.info('got embedding matrix for training.')

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.83393006,  0.05308072,  0.41082636, ...,  0.40094685,
         0.49901329, -0.23793404],
       [-0.42028   ,  0.693     , -0.056389  , ..., -0.029277  ,
         0.39451   , -0.17286   ],
       ..., 
       [ 0.20393   , -0.44596   ,  0.16273   , ..., -0.34378   ,
         0.32614   ,  0.49843   ],
       [-0.19102   ,  0.29065   , -0.016854  , ...,  0.43286   ,
         0.13729   ,  0.1192    ],
       [ 0.32202   ,  0.4238    , -0.6269    , ..., -0.11275   ,
        -0.25161   ,  0.45554   ]])

In [19]:
embeddings[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [None]:
#TODO (preprocess): save vocab, embeddings in metadata file, docs_ids in csv file

In [20]:
X_train = torch.LongTensor (len(docs_train_ids), max([len(doc) for doc in docs_train_ids])).fill_(0)
X_train_mask = torch.LongTensor (len(docs_train_ids)).fill_(0)
for index, doc in enumerate(docs_train_ids):
    X_train[index, :len(doc)] = torch.LongTensor(doc)
    X_train_mask[index] = len(doc)
    
X_train_mask = X_train_mask.unsqueeze(1)

print(X_train)
print(X_train_mask)


  147   120    81  ...      0     0     0
  280   249     9  ...      0     0     0
  202    36   145  ...     19     0     0
       ...          ⋱          ...       
  183    36    96  ...      0     0     0
  280   212   150  ...      0     0     0
  192   116     0  ...      0     0     0
[torch.LongTensor of size 132x10]


    5
    6
    8
    4
    3
    4
    3
    4
    5
    5
    6
    5
    6
    4
    3
    4
    7
    4
    4
    4
    6
    5
    2
    4
    3
    5
    3
   10
    6
    4
    4
    5
    8
    3
    3
    2
    4
    3
    6
    5
    3
    3
    4
    1
    4
    4
   10
    6
    4
    5
    3
    7
    6
    3
    3
    4
    2
    5
    6
    6
    4
    5
    5
    3
    7
    3
    4
    3
    5
   10
    3
    5
    4
    8
    1
    6
    3
    6
    9
    3
   10
    6
    1
    4
    7
    4
    5
    1
    4
    4
    4
    6
    3
    4
    4
    7
    8
    6
    3
    7
    4
    6
    2
    4
    5
    6
    5
    6
    4
    3
    7
   

In [21]:
X_test = torch.LongTensor (len(docs_test_ids), max([len(doc) for doc in docs_test_ids])).fill_(0)
X_test_mask = torch.LongTensor (len(docs_test_ids)).fill_(0)
for index, doc in enumerate(docs_test_ids):
    X_test[index, :len(doc)] = torch.LongTensor (doc)
    X_test_mask[index] = len(doc)
    
X_test_mask = X_test_mask.unsqueeze(1)

print(X_test)
print(X_test_mask)


  280   250    81   243     0     0     0     0
  191   273    44   292     0     0     0     0
  191   231   287   279   307    54     0     0
  189   231   102   287   307   128     0     0
  160   160   160   202   159   118     7     0
  191    36   287   149   246     0     0     0
  280   249    45     0     0     0     0     0
   98    21   238   287   312   178   277     0
  182    36   145   143     0     0     0     0
  190   248   228   235   160   160   160     0
  182    36   145    24     0     0     0     0
  154    36   176     0     0     0     0     0
  228   281    36   266   100   102     0     0
   77   150   215     0     0     0     0     0
   31    42    65   193   205     0     0     0
  228   252   305    36    44   204     0     0
  154    36    80     0     0     0     0     0
  296    73   284   287   153     0     0     0
  190   297   102    32     0     0     0     0
  152   163    73   178     0     0     0     0
  280   212    18   195     0     0    

In [24]:
if cuda:
    X_train = X_train.cuda()
    X_train_mask = X_train_mask.cuda()
    Y_train = Y_train.cuda()

    X_test = X_test.cuda()
    X_test_mask = X_test_mask.cuda()
    Y_test = Y_test.cuda()

In [25]:
print(type(X_train))
print(type(X_train_mask))
print(X_train.size())
print(X_train_mask.size())

<class 'torch.cuda.LongTensor'>
<class 'torch.cuda.LongTensor'>
torch.Size([132, 10])
torch.Size([132, 1])


In [260]:
test1 = torch.cat((X_train, X_train_mask), 1)
test2 = torch.cat((X_test, X_test_mask), 1)
#dataset = torch.utils.data.TensorDataset(torch.cat((X_train, X_train_mask), 1), torch.cat((X_test, X_test_mask), 1))
dataset = data_utils.TensorDataset(test1, Y_train)
dataset

<torch.utils.data.dataset.TensorDataset at 0x209c1181898>

In [171]:
seed = 0
#random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed_all(seed)
#TODO: if load model, synchronize random seed

In [201]:
#word embeddings, average, 
class Model1_LR(nn.Module):
    def __init__(self, vocab, embeddings, num_classes):
        super(Model1_LR, self).__init__()    
    
        self.num_classes = num_classes
        
        #embedding layer
        self.embedding_dim = len(embeddings[0])
        self.embedding = nn.Embedding(len(vocab),         #vocab size
                                      self.embedding_dim, #embedding_dim
                                      padding_idx=0)
        self.embedding.weight.data = torch.Tensor(embeddings)
        #do not backprop into embeddings
        for p in self.embedding.parameters():
            p.requires_grad = False
            
        #linear layer
        self.linear = nn.Linear(self.embedding_dim, num_classes)
        #nn.init.xavier_normal(self.linear.weight)
        #self.linear.bias.data.zero_()
     
    def forward(self, X, X_mask):
        #X: [m, Tx] m = batch size, Tx = word count
        #print(X.size(), type(X))
        m = X.size()[0]
        Tx = X.size()[1]
        
        X = self.embedding(X)
        #X: [m, Tx, embedding_dim] m = batch size, Tx = word count
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, Tx, self.embedding_dim])
                
        #average words in doc. use mask so we average only words not padding
        X = torch.sum(X, 1)
        X = Variable(torch.div(X.data, X_mask))
        #X: [m, emb_dim]
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, self.embedding_dim])
        
        X = self.linear(X)
        #X: [m, 1]
        #print(X.size(), type(X))
        assert X.size() == torch.Size([m, self.num_classes])
        
        return F.softmax(X)

In [225]:
model = Model1_LR(vocab, embeddings, num_classes)
if cuda:
    model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-2)
epoch = 0

In [26]:
class Model2_LSTM(nn.Module):
    def __init__(self, vocab, embeddings, num_classes):
        super(Model2_LSTM, self).__init__()    
    
        self.num_classes = num_classes
        
        #embedding layer
        self.embedding_dim = len(embeddings[0])
        self.embedding = nn.Embedding(len(vocab),         #vocab size
                                      self.embedding_dim, #embedding_dim
                                      padding_idx=0)
        self.embedding.weight.data = torch.Tensor(embeddings)
        #do not backprop into embeddings
        for p in self.embedding.parameters():
            p.requires_grad = False
            
        #LSTM1, hidden_size = 128
        #TODO: try bidirectional=True
        self.LSTM1_hidden_size = 128
        self.LSTM1 = nn.LSTM(self.embedding_dim, self.LSTM1_hidden_size)
        
        #dropout
        self.dropout = nn.Dropout()
     
        #LSTM, hidden_size = 128
        #TODO: try bidirectional=True
        self.LSTM2_hidden_size = 128
        self.LSTM2 = nn.LSTM(self.LSTM1_hidden_size, self.LSTM2_hidden_size)
        
        #linear layer
        self.linear = nn.Linear(self.LSTM2_hidden_size, num_classes)

    def forward(self, X, X_mask):
        #X: [m, Tx] m = batch size, Tx = word count
        #print(X.size(), type(X))
        m = X.size()[0]
        Tx = X.size()[1]

        #embedding layer
        X = self.embedding(X)
        #X: [m, Tx, embedding_dim] 
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, Tx, self.embedding_dim])
           
        #LSTM1
        # Transpose batch and sequence dims
        X = X.transpose(0, 1)
        X, _ = self.LSTM1(X)
        # Transpose back
        X = X.transpose(0, 1)
        #X: [m, Tx, LSTM1_hidden_size] 
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, Tx, self.LSTM1_hidden_size])
        
        #dropout
        X = self.dropout(X)

        #LSTM2, reduce dimension
        # Transpose batch and sequence dims
        X = X.transpose(0, 1)
        _, X = self.LSTM2(X)
        X = X[0]
        # Transpose back
        X = X.transpose(0, 1)
        X = torch.squeeze(X)
        #X: [m, LSTM2_hidden_size] 
        #print(X.size(), type(X.data))
        assert X.size() == torch.Size([m, self.LSTM2_hidden_size])
        
        #dropout
        X = self.dropout(X)

        #linear
        X = self.linear(X)
        #X: [m, 1]
        #print(X.size(), type(X))
        assert X.size() == torch.Size([m, self.num_classes])
        
        return F.softmax(X)

In [27]:
model = Model2_LSTM(vocab, embeddings, num_classes)
if cuda:
    model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
epoch = 0

In [28]:
model

Model2_LSTM (
  (embedding): Embedding(313, 300, padding_idx=0)
  (LSTM1): LSTM(300, 128)
  (dropout): Dropout (p = 0.5)
  (LSTM2): LSTM(128, 128)
  (linear): Linear (128 -> 5)
)

In [30]:
#workaround: X_train_mask to float so div works. Must be done before .cuda() call
X_train_mask = torch.FloatTensor(X_train_mask.numpy().astype(float))
Y_predict = model(X_train, X_train_mask)
criterion = nn.CrossEntropyLoss()
loss = criterion(Y_predict, Y_train)
loss

RuntimeError: can't convert CUDA tensor to numpy (it doesn't support GPU arrays). Use .cpu() to move the tensor to host memory first.

In [233]:
X_train_mask.size()

torch.Size([132, 1])

In [32]:
#TODO: use DataLoader, batches
#move workaround just before div
#workaround: X_train_mask to float so div works. Must be done before .cuda() call
#X_train_mask = torch.FloatTensor(X_train_mask.numpy().astype(float))
#X_test_mask = torch.FloatTensor(X_test_mask.numpy().astype(float))
for epoch_local in range(1000):
    #Forward pass
    model.train()

    Y_predict = model(X_train, X_train_mask)

    #Compute loss
    loss = criterion(Y_predict, Y_train)
    
    if epoch % 100 == 0:
        #Calculate train and test accuracy
        _, Y_predict = torch.max(Y_predict, 1)
        correct = (Y_predict == Y_train).sum()
        correct = correct.cpu().data.numpy()[0]
        accuracy_train = correct/Y_train.size(0)

        model.eval()
        Y_predict = model(X_test, X_test_mask)
        _, Y_predict = torch.max(Y_predict, 1)
        correct = (Y_predict == Y_test).sum()
        correct = correct.cpu().data.numpy()[0]
        accuracy_test = correct/Y_test.size(0)
        
        print("epoch {0:06d} loss {1:.4f} train acc {2:.4f} test acc {3:.4f}".format(epoch, loss.cpu().data.numpy()[0], accuracy_train, accuracy_test))

    #Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    epoch += 1

epoch 000000 loss 1.6110 train acc 0.1364 test acc 0.1250
epoch 000100 loss 0.9233 train acc 0.9848 test acc 0.8929
epoch 000200 loss 0.9205 train acc 0.9848 test acc 0.9107
epoch 000300 loss 0.9203 train acc 0.9848 test acc 0.9107
epoch 000400 loss 0.9202 train acc 0.9848 test acc 0.9107
epoch 000500 loss 0.9201 train acc 0.9848 test acc 0.9107
epoch 000600 loss 0.9050 train acc 1.0000 test acc 0.8214
epoch 000700 loss 0.9050 train acc 1.0000 test acc 0.8393
epoch 000800 loss 0.9049 train acc 1.0000 test acc 0.8571
epoch 000900 loss 0.9049 train acc 1.0000 test acc 0.8571


In [33]:
Y_predict = model(X_test, X_test_mask)
_, Y_predict = torch.max(Y_predict, 1)
Y_predict

Variable containing:
 4
 3
 0
 0
 2
 0
 3
 4
 4
 2
 1
 3
 3
 3
 1
 3
 3
 2
 3
 4
 0
 2
 4
 3
 3
 3
 1
 0
 1
 0
 0
 1
 3
 2
 0
 1
 2
 4
 4
 2
 1
 0
 0
 1
 2
 0
 2
 2
 0
 1
 2
 0
 3
 2
 2
 4
[torch.cuda.LongTensor of size 56 (GPU 0)]

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test.data.cpu().numpy(), Y_predict.data.cpu().numpy())

array([[ 7,  0,  0,  0,  0],
       [ 0,  8,  0,  0,  0],
       [ 5,  0, 12,  0,  1],
       [ 1,  1,  1, 13,  0],
       [ 0,  0,  0,  0,  7]], dtype=int64)

In [404]:
df_test[df_test[1] == 0]

Unnamed: 0,0,1
20,I love taking breaks\t,0
27,My grandmother is the love of my life\t,0
30,I miss you so much\t,0
41,I like your jacket \t,0
42,i miss her\t,0
45,I love you to the stars and back\t,0
51,family is all I have\t,0


In [405]:
for i in df_test[df_test[1] == 0].index:
    if Y_predict[i].data.numpy()[0] == 3:
        print(i, df_test.iloc[i][0])

20 I love taking breaks	
30 I miss you so much	
42 i miss her	
51 family is all I have	
