In [1]:
%load_ext autoreload
%autoreload 2

import torch
import re
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

In [32]:
import gensim.downloader as api

embeddings_index = api.load("glove-wiki-gigaword-100")

In [103]:
pd_train_origin = pd.read_csv("../data/raw/train.csv")

In [104]:
import operator 
from tqdm import tqdm
tqdm.pandas()

# Count word in the dataset
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab


def check_coverage(vocab,embeddings_index):
    a = {} # Word in vocab
    oov = {}
    k = 0 # Total number of word in the embedding index
    i = 0 # Total number of word not in the embedding index
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word] # Find word in pre-trained embedding index
            k += vocab[word] 
        except:
            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

def remove_url(x):
    x = re.sub(r'http\S+', '', x)
    return x

# Remove url and lowercase

In [105]:
# Create function for lowercasing since glove is containing all lowercase token
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: x.lower())
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: remove_url(x))
sentences = pd_train_origin["text"].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 360030.84it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 275697.13it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 213360.06it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 149744.89it/s]


In [106]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 41095/41095 [00:00<00:00, 266370.14it/s]

Found embeddings for 36.87% of vocab
Found embeddings for  81.87% of all text





In [107]:
oov[:10]

[('i`m', 1959),
 ('it`s', 1068),
 ('don`t', 764),
 ('****', 719),
 ('can`t', 675),
 ('i`ll', 387),
 ('that`s', 353),
 ('mother`s', 328),
 ('didn`t', 317),
 ('i`ve', 305)]

# Remove time

In [108]:
def remove_time(x):
    x = re.sub('\d+:\d+', '', x)
    return x

In [109]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: remove_time(x))

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 149057.41it/s]


# Handle emoji

In [110]:
# Handling with emoji
import emot

emot_obj = emot.core.emot() 

# get and count emoji from corpus
def build_vocab_emoji(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        # get emoticons and emoji
        emojis = emot_obj.emoticons(sentence)['value']
        if len(emojis) != 0:
            for emoji in emojis:
                try:
                    vocab[emoji] += 1
                except KeyError:
                    vocab[emoji] = 1
    return vocab
                                          
vocab_emoji = build_vocab_emoji(pd_train_origin["text"])
vocab_emoji

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 172675.09it/s]


{';)': 120,
 ':]': 18,
 ':/': 33,
 'd:': 37,
 ':@': 12,
 '=/': 8,
 '*)': 5,
 ':|': 16,
 '=]': 7,
 ':>': 3,
 ':o': 30,
 ':o)': 2,
 ':$': 1,
 '=p': 9,
 ':[': 6,
 ';;': 5,
 '8-)': 2,
 ':*': 5,
 ':3': 4,
 ':-0': 1,
 ":'(": 1,
 ':b': 2,
 '=3': 1,
 ':{': 1,
 ":')": 1,
 '%)': 3,
 ':x': 1,
 ':-*': 1}

In [111]:
emot_obj.emoticons('<3')

{'value': [], 'location': [], 'mean': [], 'flag': False}

In [112]:
# remove *), 0:3, :o) - not an emoji following the context of sentence

# vocab_emoji.pop('*)')
# vocab_emoji.pop('0:3')
# vocab_emoji.pop(':o)')
# vocab_emoji.pop(';;')
# vocab_emoji.pop('%)')
# vocab_emoji.pop('=3')

vocab_emoji = {
    ';)': 'smirk',
    ':]': 'smiley',
    ':/': 'skeptical',
    'd:': 'cheeky',
    ':@': 'sad',
    '=/': 'annoyed',
    ':|': 'neutral',
    '=]': 'happy',
    ':>': 'happy',
    ':o': 'surprise',
    ':$': 'blushing',
    '=p': 'cheeky',
    ':[': 'sad',
    '8-)': 'happy',
    ':*': 'kiss',
    ':-0': 'shock',
    ":'(": 'crying',
    ":b": 'cheeky',
    ":{": 'sad',
    ":')": 'sad',
    ':x': 'mute',
    ':-*': 'kiss',
    ':3': 'happy'
}
        
def fix_emoji(x):
    emojis = emot_obj.emoticons(x)
    for value in emojis['value']:
        if value in vocab_emoji:
            x = x.replace(value, vocab_emoji[value])
        else:
            continue
    return x

In [113]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: fix_emoji(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 130354.20it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 170652.81it/s]
100%|█████████████████████████████████████████████████████████████████████████| 40963/40963 [00:00<00:00, 280314.16it/s]


Found embeddings for 36.84% of vocab
Found embeddings for  81.91% of all text


In [114]:
vocab_emoji = build_vocab_emoji(pd_train_origin["text"])
vocab_emoji

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 166332.12it/s]


{'*)': 5, ':o)': 2, ';;': 5, '=3': 1, '%)': 3}

# Fix heart emoji

In [115]:
def fix_heart(x):
    x = re.sub('<3+', 'love', x)
    return x

In [117]:
pd_train_origin['text'] = pd_train_origin['text'].progress_apply(lambda x: fix_heart(x))

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 258362.37it/s]


# Resolve contraction and slang

In [118]:
import contractions
from textacy.preprocessing.normalize import quotation_marks

pd_train_origin['text'] = pd_train_origin['text'].progress_apply(lambda x: quotation_marks(x))
pd_train_origin['text'] = pd_train_origin['text'].progress_apply(lambda x: contractions.fix(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 168958.94it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 56694.06it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 163335.96it/s]


In [119]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 40585/40585 [00:00<00:00, 274404.21it/s]

Found embeddings for 37.02% of vocab
Found embeddings for  85.14% of all text





In [120]:
oov[:20]

[('****', 719),
 ("mother's", 328),
 ('it.', 240),
 ('day!', 173),
 ('now.', 165),
 ('today.', 154),
 ('you!', 153),
 ('you.', 149),
 ('it!', 145),
 ('day.', 125),
 ('..', 117),
 ('yeah,', 115),
 ('too.', 112),
 ('though.', 108),
 ('well,', 103),
 ('you?', 98),
 ('lol.', 96),
 ('it,', 95),
 ('me!', 94),
 ('tomorrow.', 90)]

# Remove symbol

In [121]:
# Remove the punnctuation and other stuff
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [122]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: clean_text(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 99596.20it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 167276.26it/s]


In [123]:
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 25168/25168 [00:00<00:00, 226080.62it/s]

Found embeddings for 68.78% of vocab
Found embeddings for  97.06% of all text





In [124]:
# special characters and internet slang are not vectorize in Glove
# Some words can be fix? hahahah is one example which we can reduce it down to ha or haha
# And fix some common misspellings 
# Needs to find a way to solve abbreviated words
oov[:10]

[('hahaha', 94),
 ('lmao', 65),
 ('hahah', 28),
 ('followfriday', 26),
 ('iï¿½m', 26),
 ('thanx', 25),
 ('ï¿½', 20),
 ('hahahaha', 20),
 ('tweeps', 15),
 ('2moro', 14)]

In [29]:
vocab_emoji = build_vocab_emoji(pd_train_origin["text"])
vocab_emoji

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 144233.60it/s]


{}

# Fix laughing words

In [125]:
pd_train_origin["text"][44]

'hahahahaha i rember when i riped that william picture out of one of claires mags i beated zoe to it'

In [126]:
def fix_duplicate_words(x):
    # Change hahahahaha or lolololo to haha
    x = re.sub(r'\b(?:a*(?:ha)+h?|(?:l+o+)+l+)\b', 'haha', x)
    return x

In [127]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: fix_duplicate_words(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 84208.04it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 175681.97it/s]


In [128]:
oov = check_coverage(vocab,embeddings_index)
oov[:10]

100%|█████████████████████████████████████████████████████████████████████████| 25137/25137 [00:00<00:00, 262327.28it/s]

Found embeddings for 68.84% of vocab
Found embeddings for  97.12% of all text





[('lmao', 65),
 ('followfriday', 26),
 ('iï¿½m', 26),
 ('thanx', 25),
 ('ï¿½', 20),
 ('tweeps', 15),
 ('2moro', 14),
 ('awsome', 14),
 ('soooooo', 14),
 ('tooo', 13)]

# Fix misspelling word

In [129]:
misspelling = {
    'bday': 'birthday',
    'itll': 'it will',
    'youve': 'you have',
    'idk': 'i do not know',
    'followfriday': 'follow friday',
    'shouldnt': 'should not',
    'tonights': 'tonight',
    'sux': 'suck',
    'mommys': 'mommy',
    'werent': 'were not',
    'everyones': 'everyone',
    'theyve': 'they have',
    'lmao': 'haha',
    'LMAO': 'haha',
    'awsome': 'awesome',
}

In [130]:
# Fix soooooo to so and Lmao
def fix_word(x):
    # fix soooooo to so    
    x = re.sub(r'\b(?:s+o+)+\b', 'so', x)
    # fix lmao and LMAO to haha
    for word in x.split():
        if word in misspelling.keys():
            x = x.replace(word, misspelling[word])
    return x

In [131]:
pd_train_origin["text"] = pd_train_origin["text"].progress_apply(lambda x: fix_word(x))
sentences = pd_train_origin["text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 67733.24it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 179766.48it/s]


In [134]:
oov[:10]

[('lmao', 65),
 ('followfriday', 26),
 ('iï¿½m', 26),
 ('thanx', 25),
 ('ï¿½', 20),
 ('tweeps', 15),
 ('2moro', 14),
 ('awsome', 14),
 ('soooooo', 14),
 ('tooo', 13)]

In [140]:
contractions.fix("lmao", slang=True)

'lmao'