In [19]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from gensim.models import Word2Vec
import torch
import time

In [12]:
# Placeholder token added to the word vocabulary; getEmb falls back to it for
# words that are not in the vocabulary.
UNK = "<unk>"
# Target label meaning "the next token is not an emoji"; also seeded into the emoji vocabulary.
empty = "<empty>"
# Embedding dimensionality handed to Word2Vec.
wordEmbSize = 64
# Raw dataset; preprocessing() iterates over its "texts" column.
data = pd.read_csv("data3.csv")

# Data Preprocessing

+ Cleaning by using nltk word tokenizer and lemmatizer
+ Pads emojis with spaces, using the emoji library's regex, so that each emoji becomes its own token
+ Add a start and end token
+ Build vocab for words
+ Build vocab for emojis
+ Assigns each word a 0/1 label; a label of 1 means the word is immediately followed by an emoji

In [18]:
# Compiled emoji pattern from the emoji library; used below both to split emojis
# out of raw text and to test whether a single token is an emoji.
# NOTE(review): get_emoji_regexp() is not available in newer emoji releases —
# confirm the pinned emoji version before re-running.
RE_EMOJI = emoji.get_emoji_regexp()
# Alias for NLTK's word tokenizer (LemNormalize calls nltk.word_tokenize directly
# rather than going through this alias).
tokenizer = nltk.word_tokenize
# WordNet lemmatizer instance used by LemTokens.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Lemmatize every token with the shared WordNet lemmatizer.

    Args:
        tokens: iterable of word strings.

    Returns:
        A new list with the lemma of each token, in the original order.
    """
    return list(map(lemmer.lemmatize, tokens))

# Translation table for str.translate that deletes every character in
# string.punctuation (ASCII punctuation only; other characters pass through).
remove_punct_dict = str.maketrans("", "", string.punctuation)

# tokens normalized
def LemNormalize(text):
    """Turn raw text into a list of lemmatized, lower-cased tokens.

    Steps, in order: lower-case the text, delete the characters listed in
    remove_punct_dict, tokenize with nltk.word_tokenize, lemmatize with LemTokens.

    Args:
        text: the string to normalize.

    Returns:
        List of lemmatized tokens.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    raw_tokens = nltk.word_tokenize(without_punct)
    return LemTokens(raw_tokens)

#converting text to words
def preprocessing(data, train=True):
    """Tokenize every text and build per-token "followed by an emoji" labels.

    Args:
        data: mapping/DataFrame whose "texts" entry is an iterable of strings.
        train: when True, labels are derived from the text; when False, every
            token gets label 0.

    Returns:
        pandas.DataFrame with two columns:
          * "words"  - token list wrapped in "<s>" ... "</s>".
          * "labels" - list of 0/1 with the same length as "words"; 1 means the
            token at that position is immediately followed by an emoji token.
    """
    newData = {"words": [], "labels": []}
    for text in data["texts"]:
        # Split around emojis and re-join with spaces so the tokenizer sees each
        # emoji as a separate word; empty/None pieces from the split are dropped.
        pieces = [piece.strip() for piece in RE_EMOJI.split(text) if piece]
        textWords = LemNormalize(" ".join(pieces))
        textWords.insert(0, "<s>")
        textWords.append("</s>")
        newData["words"].append(textWords)

        if train:
            # Label for position i looks at token i + 1; the final token has no
            # successor, so it always gets 0.
            labels = [1 if RE_EMOJI.match(following) else 0 for following in textWords[1:]]
            labels.append(0)
        else:
            # One zero per token. (The previous `[0 * len(textWords)]` evaluated
            # to the single-element list `[0]`.)
            labels = [0] * len(textWords)
        newData["labels"].append(labels)
    return pd.DataFrame(newData)

def make_vocabs(data):
    """Collect the word vocabulary and the emoji vocabulary.

    Args:
        data: mapping/DataFrame whose "words" entry is an iterable of token lists.

    Returns:
        (vocab, emojiVocab): two sets. `vocab` holds every token plus UNK;
        `emojiVocab` holds every token matched by RE_EMOJI plus `empty`.
    """
    all_tokens = {token for sentence in data["words"] for token in sentence}
    emoji_tokens = {token for token in all_tokens if RE_EMOJI.match(token)}
    return all_tokens | {UNK}, emoji_tokens | {empty}
        
train = preprocessing(data, True)
vocab, emojiVocab = make_vocabs(train)
# Enumerate in sorted order: iterating a set of strings directly can yield a
# different order on every interpreter run, which would make the indices (and
# anything trained or saved against them) irreproducible.
vocabIdx = {word: i for i, word in enumerate(sorted(vocab))}
eVocabIdx = {symbol: i for i, symbol in enumerate(sorted(emojiVocab))}

# Word Embeddings

+ Using gensim's Word2Vec
+ Builds model with word embedding size specified earlier

In [5]:
def getEmbModel(data, vocab):
    """Train a Word2Vec model on the tokenized sentences and print its summary.

    A one-token sentence containing UNK is prepended to the corpus so the model
    also learns a vector for UNK (min_count=1 keeps every token).

    Args:
        data: mapping/DataFrame whose "words" entry is an iterable of token lists.
        vocab: not used by this function.

    Returns:
        The trained Word2Vec model.
    """
    # NOTE(review): `size=` is the keyword this notebook's gensim accepts; newer
    # gensim releases may name it differently — confirm the pinned version.
    corpus = [[UNK], *data["words"]]
    w2v = Word2Vec(corpus, min_count=1, size=wordEmbSize)
    print(w2v)
    return w2v

def getEmb(data, model, vocab):
    """Convert every sentence into a tensor of word embeddings.

    Args:
        data: mapping/DataFrame with parallel "words" (token lists) and "labels"
            (per-token label lists) entries.
        model: trained Word2Vec model, or any object indexable by token. When the
            object has a `wv` attribute the vectors are read from there, so the
            deprecated `Word2Vec.__getitem__` is not relied on.
        vocab: container of known tokens from training; tokens outside it are
            embedded as UNK.

    Returns:
        List of (embeddings, labels) tuples, one per sentence. `embeddings` is a
        float32 tensor with one row per token; `labels` is passed through as-is.

    Raises:
        KeyError: if a token in `vocab` (or UNK, when needed) has no vector.
    """
    vectors = getattr(model, "wv", model)
    vecData = []
    for text, y in zip(data["words"], data["labels"]):
        rows = [vectors[word] if word in vocab else vectors[UNK] for word in text]
        # Stack once in numpy: building a tensor from a Python list of arrays is
        # far slower than converting a single contiguous array.
        wordEmb = torch.from_numpy(np.asarray(rows, dtype=np.float32))
        vecData.append((wordEmb, y))
    return vecData

# Train Word2Vec on the preprocessed training sentences, then convert every
# sentence into an (embedding tensor, labels) pair.
# NOTE(review): the name `model` is rebound to the feed-forward network in a
# later cell, so re-running this cell out of order changes what `model` refers to.
model = getEmbModel(train, vocab)
trainEmb = getEmb(train, model, vocab)

Word2Vec(vocab=19720, size=64, alpha=0.025)




In [8]:
# Spot-check five random training rows: tokens, labels, and the matching
# embedding tensor (trainEmb is looked up by the row's index label).
sample = train.sample(5)
for index, words, labels in zip(sample.index, sample["words"], sample["labels"]):
    print(words)
    print(labels)
    print(trainEmb[index][0])

['<s>', '“', 'damnn', 'gurl', 'you', 'fine', 'a', 'fuck', '💦', '💦', '💦', 'did', 'you', 'fall', 'from', 'heaven', 'because', 'you', 'have', 'the', 'phattest', 'as', 'on', 'god', '🍑', '🍑', 'you', 'do', 'track', 'cool', 'because', 'i', 'can', 'track', 'dat', 'as', 'a', 'u', 'and', 'me', 'go', 'crazy', 'babey', '😎', 'hey', 'you', 'forgot', 'to', 'fill', 'out', 'this', 'survey', 'haha', 'just', 'kidding', 'that', '’', 's', 'my', 'phone', '😎', 'gim', 'me', 'ur', 'number', 'so', 'we', 'can', 'talk', 'all', 'night', '😉', 'what', '’', 's', 'that', 'you', '’', 're', 'single', 'haha', 'i', 'never', 'knew', 'but', 'what', 'a', 'coincidence', 'let', '’', 's', 'get', 'not', 'single', 'together', '😎', 'i', 'may', 'not', 'run', 'but', 'i', '’', 'm', 'boutta', 'run', 'up', 'on', 'dat', 'azz', '😤', 'haha', 'what', '’', 's', 'that', 'you', 'aren', '’', 't', 'looking', 'for', 'a', 'boyfriend', 'well', 'that', '’', 's', 'perfect', 'because', 'i', '’', 'm', 'no', 'boy', '😎', '</s>']
[0, 0, 0, 0, 0, 0, 0, 1,

# Building supervised models to predict next emoji

+ RNN based architecture where we look at the hidden layer for every word
  + From each hidden state, predict whether the next token is an emoji and, if so, which emoji it is
+ According to the TA from our NLP class, this is similar to a language-modelling problem in which predictions are restricted to the emoji vocabulary
  + Could also view as sequence labelling where tag is next emoji or no emoji

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import random

# Feedforward NN

+ https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
+ Built using previous couple of words

In [31]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram model that scores emoji labels from context words.

    Architecture: embedding lookup -> flatten the context -> Linear -> ReLU ->
    Linear -> LogSoftmax over the emoji vocabulary.

    Args:
        vocab_size: number of rows in the word-embedding table.
        eVocab_size: number of output classes (emoji vocabulary size).
        embedding_dim: size of each word embedding.
        context_size: number of context words per example.
        hidden: width of the hidden layer.
    """

    def __init__(self, vocab_size, eVocab_size, embedding_dim, context_size, hidden):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden)
        self.activation = nn.ReLU()
        # Normalise over the class dimension explicitly; relying on the implicit
        # dimension choice is deprecated.
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()
        self.linear2 = nn.Linear(hidden, eVocab_size)

    def compute_loss(self, predicted_vector, label):
        """Return the negative log-likelihood of `label` under `predicted_vector`.

        Args:
            predicted_vector: log-probabilities of shape (batch, eVocab_size).
            label: LongTensor of shape (batch,) with target class indices.
        """
        return self.loss(predicted_vector, label)

    def forward(self, inputs):
        """Compute log-probabilities over the emoji vocabulary.

        Args:
            inputs: LongTensor of word indices, either shape (context_size,) for
                one example or (batch, context_size) for several.

        Returns:
            Tensor of shape (batch, eVocab_size); batch is 1 for a 1-D input.
        """
        # Flatten each example's context embeddings into one row. Using the
        # layer's own input width lets batched inputs work, while a single 1-D
        # context still yields exactly one row.
        embeds = self.embeddings(inputs).reshape(-1, self.linear1.in_features)
        out = self.activation(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = self.softmax(out)
        return log_probs

In [32]:
def FFfeatures(data):
    """Build (two-word context, target) examples for the feed-forward model.

    For every run of three consecutive tokens, the first two form the context
    and the third is the target. A target not matched by RE_EMOJI is replaced
    with the `empty` label.

    Args:
        data: mapping/DataFrame whose "words" entry is an iterable of token lists.

    Returns:
        One list per sentence, each containing `[[word_i, word_i+1], target]`
        entries (empty for sentences with fewer than three tokens).
    """
    def as_target(token):
        # Keep emojis as-is; everything else collapses to the "no emoji" label.
        return token if RE_EMOJI.match(token) else empty

    return [
        [
            [[first, second], as_target(third)]
            for first, second, third in zip(text, text[1:], text[2:])
        ]
        for text in data["words"]
    ]

In [33]:
# Per training sentence, the list of ([word_i, word_i+1], target) examples
# consumed by the feed-forward training loop.
train_feats = FFfeatures(train)

In [35]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
HIDDEN_SIZE = 128
NUM_EPOCHS = 10

# Not read by the loop below (updates are per example); kept defined because
# other cells may refer to them.
minibatch_size = 10
N = len(train_feats)

losses = []  # summed training loss per epoch
model = NGramLanguageModeler(len(vocab), len(emojiVocab), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Encode every (context, target) pair as index tensors once, instead of
# rebuilding the same tensors on every epoch.
encoded_feats = [
    [
        (
            torch.tensor([vocabIdx[w] for w in context], dtype=torch.long),
            torch.tensor([eVocabIdx[target]], dtype=torch.long),
        )
        for context, target in text
    ]
    for text in train_feats
]

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    print("Training started for epoch:{}".format(epoch))
    # Shuffle at sentence level; examples inside a sentence keep their order.
    random.shuffle(encoded_feats)
    start_time = time.time()
    correct = 0
    total = 0
    for text in tqdm(encoded_feats):
        for context_idx, target_idx in text:
            # One SGD step per example; gradients are cleared before each step.
            optimizer.zero_grad()
            log_probs = model(context_idx)
            loss = model.compute_loss(log_probs, target_idx)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            predicted_label = torch.argmax(log_probs)
            correct += int(predicted_label.item() == target_idx.item())
            total += 1
    losses.append(total_loss)
    print("Training completed for epoch:{}".format(epoch))
    print("Time for train:{}".format(time.time() - start_time))
    # Guard against an empty training set rather than dividing by zero.
    print("Accuracy:{}".format(correct / total if total else 0.0))

  0%|          | 0/1223 [00:00<?, ?it/s]

Training started for epoch:0


 10%|▉         | 121/1223 [01:51<25:03,  1.36s/it] 

KeyboardInterrupt: 