In [1]:
%load_ext autoreload
%autoreload 2

import torch
import re
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

In [2]:
import gensim.downloader as api

embeddings_index = api.load("glove-wiki-gigaword-100")

In [3]:
pd_train_origin = pd.read_csv("../data/raw/train.csv")

In [4]:
import operator 
from tqdm import tqdm
tqdm.pandas()

# Count word in the dataset
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab


def check_coverage(vocab,embeddings_index):
    a = {} # Word in vocab
    oov = {}
    k = 0 # Total number of word in the embedding index
    i = 0 # Total number of word not in the embedding index
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word] # Find word in pre-trained embedding index
            k += vocab[word] 
        except:
            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

def remove_url(x):
    x = re.sub(r'http\S+', '', x)
    return x

# Remove url and lowercase

In [5]:
# Create function for lowercasing since glove is containing all lowercase token
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: x.lower())
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: remove_url(x))
sentences = pd_train_origin["text"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 260229.66it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 183276.72it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 174176.08it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 151604.01it/s]


In [6]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 41095/41095 [00:00<00:00, 196261.98it/s]


Found embeddings for 36.87% of vocab
Found embeddings for  81.87% of all text


In [7]:
oov[:10]

[('i`m', 1959),
 ('it`s', 1068),
 ('don`t', 764),
 ('****', 719),
 ('can`t', 675),
 ('i`ll', 387),
 ('that`s', 353),
 ('mother`s', 328),
 ('didn`t', 317),
 ('i`ve', 305)]

# Handle emoji

In [8]:
# Handling with emoji
import emot

emot_obj = emot.core.emot() 

# get and count emoji from corpus
def build_vocab_emoji(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        # get emoticons and emoji
        emojis = emot_obj.emoticons(sentence)['value']
        if len(emojis) != 0:
            for emoji in emojis:
                try:
                    vocab[emoji] += 1
                except KeyError:
                    vocab[emoji] = 1
    return vocab
                                          
vocab_emoji = build_vocab_emoji(pd_train_origin["text"])
vocab_emoji

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 150061.37it/s]


{';)': 120,
 ':]': 18,
 ':/': 33,
 ':3': 53,
 'd:': 37,
 ':@': 12,
 '=/': 8,
 '*)': 5,
 ':|': 16,
 '=]': 7,
 ':>': 3,
 ':o': 30,
 '0:3': 8,
 ':o)': 2,
 ':$': 1,
 '=p': 9,
 ':[': 6,
 ';;': 5,
 '8-)': 2,
 ':*': 5,
 ':-0': 1,
 ":'(": 1,
 ':b': 2,
 '=3': 1,
 ':{': 1,
 ":')": 1,
 '%)': 3,
 ':x': 1,
 ':-*': 1}

In [38]:
# remove *), 0:3, :o) - not an emoji following the context of sentence

# vocab_emoji.pop('*)')
# vocab_emoji.pop('0:3')
# vocab_emoji.pop(':3')
# vocab_emoji.pop(':o)')
# vocab_emoji.pop(';;')
# vocab_emoji.pop('%)')
# vocab_emoji.pop('=3')

vocab_emoji = {
    ';)': 'smirk',
    ':]': 'smiley',
    ':/': 'skeptical',
    'd:': 'cheeky',
    ':@': 'sad',
    '=/': 'annoyed',
    ':|': 'neutral',
    '=]': 'happy',
    ':>': 'happy',
    ':o': 'surprise',
    ':$': 'blushing',
    '=p': 'cheeky',
    ':[': 'sad',
    '8-)': 'happy',
    ':*': 'kiss',
    ':-0': 'shock',
    ":'(": 'crying',
    ":b": 'cheeky',
    ":{": 'sad',
    ":')": 'sad',
    ':x': 'mute',
    ':-*': 'kiss',
}
        
def fix_emoji(x):
    emojis = emot_obj.emoticons(x)
    for value in emojis['value']:
        if value in vocab_emoji:
            x = x.replace(value, vocab_emoji[value])
        else:
            continue
    return x

In [39]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: fix_emoji(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 105377.74it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 148127.27it/s]
100%|█████████████████████████████████████████████████████████████████████████| 41076/41076 [00:00<00:00, 213933.35it/s]

Found embeddings for 36.88% of vocab
Found embeddings for  81.90% of all text





In [40]:
vocab_emoji = build_vocab_emoji(pd_train_origin["text"])
vocab_emoji

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 141710.73it/s]


{':3': 53, '*)': 5, '0:3': 8, ':o)': 2, ';;': 5, '=3': 1, '%)': 3}

# Remove punctuation

In [26]:
# Remove the punnctuation and other stuff
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [17]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: clean_text(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 65494.17it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 156191.32it/s]


In [18]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 25608/25608 [00:00<00:00, 184326.41it/s]


Found embeddings for 68.20% of vocab
Found embeddings for  96.73% of all text


In [20]:
# special characters and internet slang are not vectorize in Glove
# Some words can be fix? hahahah is one example which we can reduce it down to ha or haha
# And fix some common misspellings 
# Needs to find a way to solve abbreviated words
oov[:10]

[('hahaha', 94),
 ('lmao', 65),
 ('bday', 48),
 ('youve', 45),
 ('itll', 36),
 ('idk', 36),
 ('hahah', 28),
 ('followfriday', 26),
 ('iï¿½m', 26),
 ('thanx', 25)]

# Fix laughing words

In [41]:
def fix_duplicate_words(x):
    # Change hahahahaha or lolololo to haha
    x = re.sub(r'\b(?:a*(?:ha)+h?|(?:l+o+)+l+)\b', 'haha', x)
    return x

In [42]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: fix_duplicate_words(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 60710.81it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 155814.07it/s]


In [23]:
oov = check_coverage(vocab,embeddings_index)
oov[:10]

100%|█████████████████████████████████████████████████████████████████████████| 25577/25577 [00:00<00:00, 168623.68it/s]

Found embeddings for 68.26% of vocab
Found embeddings for  96.79% of all text





[('lmao', 65),
 ('bday', 48),
 ('youve', 45),
 ('itll', 36),
 ('idk', 36),
 ('followfriday', 26),
 ('iï¿½m', 26),
 ('thanx', 25),
 ('ï¿½', 20),
 ('shouldnt', 19)]

# Fix misspelling word

In [24]:
misspelling = {
    'bday': 'birthday',
    'itll': 'it will',
    'youve': 'you have',
    'idk': 'i do not know',
    'followfriday': 'follow friday',
    'shouldnt': 'should not',
    'tonights': 'tonight',
    'sux': 'suck',
    'mommys': 'mommy',
    'werent': 'were not',
    'everyones': 'everyone',
    'theyve': 'they have',
    'lmao': 'haha',
    'LMAO': 'haha',
    'awsome': 'awesome',
}

In [25]:
# Fix soooooo to so and Lmao
def fix_word(x):
    # fix soooooo to so    
    x = re.sub(r'\b(?:s+o+)+\b', 'so', x)
    # fix lmao and LMAO to haha
    for word in x.split():
        if word in misspelling.keys():
            x = x.replace(word, misspelling[word])
    return x

In [26]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: fix_word(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 52490.38it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 152609.45it/s]


In [28]:
oov[:10]

[('lmao', 65),
 ('bday', 48),
 ('youve', 45),
 ('itll', 36),
 ('idk', 36),
 ('followfriday', 26),
 ('iï¿½m', 26),
 ('thanx', 25),
 ('ï¿½', 20),
 ('shouldnt', 19)]