In [55]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from gensim.models import Word2Vec
import torch

In [53]:
# Placeholder token; getEmb() below substitutes its vector for any word
# that is not in the vocabulary it is given.
UNK = "<unk>"
# Dimensionality of each word vector (passed to Word2Vec in getEmbModel below).
wordEmbSize = 64
# Raw dataset, read relative to the notebook's working directory.
# preprocessing() below reads its "texts" column.
data = pd.read_csv("data2.csv")

# Data Preprocessing

+ Cleans the text with NLTK's word tokenizer and WordNet lemmatizer (lower-casing and stripping punctuation first)
+ Adds spaces around emojis, using the emoji library's regular expression, so that each emoji becomes a separate token
+ Adds a start token (`<s>`) and an end token (`</s>`) to every text
+ Builds a vocabulary of all tokens
+ Builds a vocabulary of emojis
+ Labels each token 0 or 1. A label of 1 means the token is immediately followed by an emoji

In [52]:
# Compiled pattern from the emoji library. Used below both to split emojis
# out of the raw text and to test whether a token is an emoji.
# NOTE(review): get_emoji_regexp() is reportedly unavailable in recent emoji
# releases -- confirm the pinned version before re-running.
RE_EMOJI = emoji.get_emoji_regexp()
# Alias for NLTK's word tokenizer.
# NOTE(review): not referenced in the visible cells; LemNormalize calls
# nltk.word_tokenize directly.
tokenizer = nltk.word_tokenize
# Lemmatizer instance shared by LemTokens.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Lemmatize every token with the shared ``lemmer``.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list holding ``lemmer.lemmatize(token)`` for each input token,
        in the original order.
    """
    return list(map(lemmer.lemmatize, tokens))

# str.translate table mapping every ASCII punctuation code point to None,
# which makes translate() delete those characters.
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))

# tokens normalized
def LemNormalize(text):
    """Normalize a raw string into a list of lemmatized tokens.

    The text is lower-cased, stripped of the characters listed in
    ``remove_punct_dict``, tokenized with ``nltk.word_tokenize`` and finally
    lemmatized through ``LemTokens``.

    Args:
        text: the string to normalize.

    Returns:
        The list of tokens produced by ``LemTokens``.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    raw_tokens = nltk.word_tokenize(without_punct)
    return LemTokens(raw_tokens)

#converting text to words
def preprocessing(data, train=True):
    """Tokenize every text and build one label per token.

    Each entry of ``data["texts"]`` is split on ``RE_EMOJI``, the pieces are
    re-joined with single spaces, and the result is normalized with
    ``LemNormalize``. The token list is then wrapped in ``"<s>"`` and
    ``"</s>"``.

    Args:
        data: object whose ``"texts"`` entry iterates over strings
            (here, the DataFrame read from the CSV).
        train: when true, a token's label is 1 if the *next* token matches
            ``RE_EMOJI`` and 0 otherwise (the last token is always 0).
            When false, every label is 0.

    Returns:
        ``pandas.DataFrame`` with two columns: ``"words"`` (list of tokens per
        text) and ``"labels"`` (list of ints, same length as the tokens).
    """
    newData = {"words": [], "labels": []}
    for text in data["texts"]:
        # Split around emojis and re-join with spaces so that each emoji ends
        # up as its own whitespace-delimited token.
        # NOTE(review): the emojis survive the split (see the recorded sample
        # output), presumably because the pattern has a capturing group.
        pieces = [piece.strip() for piece in RE_EMOJI.split(text) if piece]
        textWords = LemNormalize(" ".join(pieces))
        textWords.insert(0, "<s>")
        textWords.append("</s>")
        newData["words"].append(textWords)

        if train:
            # Label i describes token i by looking at token i + 1.
            labels = [1 if RE_EMOJI.match(following) else 0
                      for following in textWords[1:]]
            # The final token has no successor.
            labels.append(0)
        else:
            # One zero per token. The previous `[0 * len(textWords)]`
            # evaluated to `[0]`, a single label whatever the text length.
            labels = [0] * len(textWords)
        newData["labels"].append(labels)
    return pd.DataFrame(newData)

def make_vocabs(data):
    """Collect the token vocabulary and the emoji vocabulary.

    Args:
        data: object whose ``"words"`` entry iterates over token lists.

    Returns:
        Tuple ``(vocab, emojiVocab)``: ``vocab`` is the set of every token
        seen plus ``UNK``; ``emojiVocab`` is the set of tokens that match
        ``RE_EMOJI``.
    """
    vocab = {UNK}
    emojiVocab = set()
    for tokens in data["words"]:
        vocab.update(tokens)
        emojiVocab.update(tok for tok in tokens if RE_EMOJI.match(tok))
    return vocab, emojiVocab
        
# Tokenize the raw texts and build the per-token emoji labels (training mode).
train = preprocessing(data, True)
# Token and emoji vocabularies derived from the training tokens.
vocab, emojiVocab = make_vocabs(train)

# Word Embeddings

+ Uses gensim's Word2Vec
+ Builds a model whose embedding size is `wordEmbSize`, set in the configuration cell above

In [58]:
def getEmbModel(data, vocab):
    """Train a Word2Vec model on the tokenized texts.

    A one-token document containing ``UNK`` is placed in front of the corpus
    so the model also learns a vector for the unknown-word placeholder.

    Args:
        data: object whose ``"words"`` entry iterates over token lists.
        vocab: unused by this function; kept in the signature for callers.

    Returns:
        The trained ``Word2Vec`` model (its summary is also printed).
    """
    # NOTE(review): `size` is the keyword accepted by the gensim release that
    # produced the recorded output; newer releases reportedly rename it to
    # `vector_size` -- confirm before upgrading.
    corpus = [[UNK], *data["words"]]
    w2v = Word2Vec(corpus, min_count=1, size=wordEmbSize)
    print(w2v)
    return w2v

# Takes: a dataset, a word2vec model, and the vocabulary built from training.
# Returns: a list of tuples, one per text. The first value is a torch tensor of
# the word embeddings for that text; the second value is the labels for each word.
def getEmb(data, model, vocab):
    """Convert each tokenized text into a tensor of word vectors.

    Args:
        data: object whose ``"words"`` and ``"labels"`` entries iterate in
            parallel over token lists and label lists.
        model: a trained Word2Vec model (vectors are read from ``model.wv``),
            or any object that maps a word directly to its vector.
        vocab: container of known words; any token not in it is replaced by
            the vector stored for ``UNK``.

    Returns:
        List of ``(embeddings, labels)`` tuples, one per text. ``embeddings``
        is a ``torch.FloatTensor`` built from that text's word vectors;
        ``labels`` is the label list passed through unchanged.
    """
    # Indexing a Word2Vec model directly (`model[word]`) is deprecated in
    # gensim 3.x and removed in 4.x; the vectors live on `model.wv`.
    # Fall back to `model` itself so KeyedVectors or a dict also work.
    vectors = getattr(model, "wv", model)
    vecData = []
    for text, y in zip(data["words"], data["labels"]):
        wordEmb = [vectors[word] if word in vocab else vectors[UNK]
                   for word in text]
        vecData.append((torch.FloatTensor(wordEmb), y))
    return vecData

# Train the embedding model on the training tokens (prints the model summary).
model = getEmbModel(train, vocab)
# One (embedding tensor, labels) tuple per training text, in the same order as `train`.
trainEmb = getEmb(train, model, vocab)

Word2Vec(vocab=28845, size=64, alpha=0.025)


  


In [72]:
# Spot-check five random training rows: tokens, labels and the embedding tensor.
# NOTE(review): no random_state is passed, so a re-run shows different rows.
sample = train.sample(5)
for index,row in sample.iterrows():
    print(row["words"])
    print(row["labels"])
    # `index` is the row's DataFrame index label, used here as a list position;
    # presumably valid because `train` keeps its default 0..n-1 index -- confirm
    # if `train` is ever filtered or re-indexed.
    print(trainEmb[index][0])

['<s>', 'shut', 'the', 'fuck', 'up', 'im', 'so', 'tired', 'of', 'that', 'shit', 'it', 'not', 'funnyyyyyyyyyy', 'who', 'joe', 'shut', 'the', 'fuck', 'up', 'im', 'so', 'fucking', 'tired', 'of', 'these', 'cringey', 'normie', 'memers', 'who', 'think', 'this', 'shit', 'sooooo', 'funny', 'it', 'not', 'eat', 'my', 'as', 'joe', 'for', 'all', 'i', 'care', 'i', 'dont', 'care', 'who', 'you', 'are', 'ugh', 'fuck', 'im', 'gon', 'na', 'go', 'jack', 'off', 'to', 'some', 'gay', 'furry', 'porn', 'im', 'so', 'pissed', 'off', '😑', '😭', '😫', '😡', '😤', '😤', '😤', '</s>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
tensor([[-0.6602,  0.9093, -0.5563,  ...,  0.2494, -0.5998, -0.1178],
        [-0.0802,  0.0947, -0.1015,  ...,  0.0599,  0.0133,  0.1009],
        [-1.4945, -0.6106,  0.5469,  ..., -0.3596, -1.3443, -0.7936],
     