In [1]:
%load_ext autoreload
%autoreload 2

import torch
import re
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

In [2]:
import gensim.downloader as api

# Load the pre-trained "glove-wiki-gigaword-100" word vectors through gensim's
# downloader API. Later cells look words up with embeddings_index[word].
embeddings_index = api.load("glove-wiki-gigaword-100")

In [3]:
# Read the raw training CSV; the cells below work on its "text" column.
train_df = pd.read_csv("../data/raw/train.csv")

In [4]:
import operator 
from tqdm import tqdm
# Hook tqdm into pandas so the Series.progress_apply calls in later cells show a progress bar.
tqdm.pandas()

# Count how many times each word occurs in the dataset
def build_vocab(sentences, verbose=True):
    """Tally how often every token occurs across ``sentences``.

    Args:
        sentences: Iterable of token sequences (for example lists of words).
        verbose: When False the tqdm progress bar is disabled.

    Returns:
        dict mapping each token to its total number of occurrences, with
        keys in first-seen order.
    """
    counts = {}
    hide_bar = not verbose
    for tokens in tqdm(sentences, disable=hide_bar):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def check_coverage(vocab, embeddings_index):
    """Report how much of ``vocab`` is covered by the pre-trained embeddings.

    Prints two percentages: the share of distinct words that have an
    embedding, and the share of all word occurrences they account for.
    An empty ``vocab`` is reported as 0.00% instead of raising
    ``ZeroDivisionError``.

    Args:
        vocab: Mapping of word -> occurrence count.
        embeddings_index: Object supporting ``embeddings_index[word]`` that
            is expected to raise ``KeyError`` for unknown words (a dict, or
            the gensim vectors loaded earlier in the notebook).

    Returns:
        List of ``(word, count)`` tuples for the words without an embedding,
        most frequent first.
    """
    known_words = 0      # distinct words that have an embedding
    covered_tokens = 0   # occurrences of those words in the text
    missing_tokens = 0   # occurrences of words without an embedding
    oov = {}
    for word in tqdm(vocab):
        count = vocab[word]
        try:
            embeddings_index[word]
        except KeyError:
            # Only "word not present" counts as out-of-vocabulary; any other
            # error (or Ctrl-C) is a real problem and must propagate.
            oov[word] = count
            missing_tokens += count
        else:
            known_words += 1
            covered_tokens += count

    total_tokens = covered_tokens + missing_tokens
    vocab_share = known_words / len(vocab) if vocab else 0.0
    text_share = covered_tokens / total_tokens if total_tokens else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    # Stable ascending sort followed by a reversal: keeps the historical
    # ordering of words that share the same count.
    return sorted(oov.items(), key=operator.itemgetter(1))[::-1]

In [5]:
# Split each raw tweet on whitespace, then count the resulting words.
sentences = train_df["text"].progress_apply(lambda tweet: tweet.split()).values
vocab = build_vocab(sentences)
# Preview the first five (word, count) pairs in insertion order.
print(dict(list(vocab.items())[:5]))

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 124364.03it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 141187.26it/s]

{'good': 931, 'luck': 83, 'with': 1811, 'your': 969, 'auction': 1}





In [6]:
# Measure embedding coverage on the still-uncleaned text; `oov` receives the
# (word, count) pairs that have no embedding.
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 48435/48435 [00:00<00:00, 213246.19it/s]

Found embeddings for 25.98% of vocab
Found embeddings for  70.20% of all text





In [7]:
# Show the ten most frequent out-of-vocabulary words.
oov[:10]

[('I', 7902),
 ('I`m', 1533),
 ('****', 719),
 ('it`s', 695),
 ('don`t', 681),
 ('can`t', 562),
 ('My', 457),
 ('Just', 438),
 ('The', 416),
 ('i`m', 414)]

In [8]:
def remove_url(x):
    """Delete every ``http...`` token (up to the next whitespace) from ``x``."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', x)

# Remove punctuation and other special symbols
def clean_text(x):
    """Normalise punctuation in ``x`` and return the cleaned string.

    ``x`` is first converted with ``str()``.  Then, in a single pass:
      * ``/``, ``-`` and ``'`` each become one space,
      * ``&`` is padded with a space on both sides,
      * every other listed punctuation mark (including the curly quotes)
        is dropped.
    """
    # Start with the characters that are simply deleted.
    mapping = dict.fromkeys(
        map(ord, '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'), None)
    # Separator-like characters turn into a space instead; this overrides
    # the deletion entry for the ones that also appear in the list above.
    for sep in "/-'":
        mapping[ord(sep)] = ' '
    for sym in '&':
        mapping[ord(sym)] = f' {sym} '
    return str(x).translate(mapping)

def fix_laughing_words(x):
    """Collapse laughter words to ``haha``.

    A whole word made of repeated ``ha`` (optionally led by ``a``s and
    followed by one ``h``), or a ``lol``-style word such as ``lololol``,
    is replaced by the literal ``haha``.
    """
    laugh_pattern = re.compile(r'\b(?:a*(?:ha)+h?|(?:l+o+)+l+)\b')
    return laugh_pattern.sub('haha', x)

In [9]:
# Clean the tweets one pass per step: lowercase -> strip URLs -> normalise
# punctuation -> collapse laughter words, then rebuild the vocabulary.
# NOTE: this overwrites train_df["text"]; re-run the read_csv cell to get the raw text back.
train_df["text"] = train_df["text"].progress_apply(lambda tweet: tweet.lower())
train_df["text"] = train_df["text"].progress_apply(remove_url)
train_df["text"] = train_df["text"].progress_apply(clean_text)
train_df["text"] = train_df["text"].progress_apply(fix_laughing_words)
sentences = train_df["text"].apply(lambda tweet: tweet.split())
vocab = build_vocab(sentences)

100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 346844.39it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 206961.24it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 90430.24it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 84493.78it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 163627.10it/s]


In [10]:
# Re-measure embedding coverage now that the text has been cleaned.
oov = check_coverage(vocab,embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 25577/25577 [00:00<00:00, 235439.89it/s]

Found embeddings for 68.26% of vocab
Found embeddings for  96.79% of all text





In [11]:
# Show the ten most frequent words that are still out-of-vocabulary.
oov[:10]

[('lmao', 65),
 ('bday', 48),
 ('youve', 45),
 ('itll', 36),
 ('idk', 36),
 ('followfriday', 26),
 ('iï¿½m', 26),
 ('thanx', 25),
 ('ï¿½', 20),
 ('shouldnt', 19)]

In [12]:
# Slang / misspelled tokens (as they look after punctuation removal) and the
# text each one is rewritten to.
misspelling = {
    'bday': 'birthday',
    'itll': 'it will',
    'youve': 'you have',
    'idk': 'i do not know',
    'followfriday': 'follow friday',
    'shouldnt': 'should not',
    'tonights': 'tonight',
    'sux': 'suck',
    'mommys': 'mommy',
    'werent': 'were not',
    'everyones': 'everyone',
    'theyve': 'they have',
    'lmao': 'haha',
    'LMAO': 'haha',
    'awsome': 'awesome',
}


def fix_word(x):
    """Shorten stretched "so" and rewrite known misspellings in ``x``.

    Two steps:
      1. A whole word made only of repeated ``s``/``o`` runs (``soooo``,
         ``soso``) becomes ``so``.
      2. Every whitespace-delimited token that is a key of ``misspelling``
         is swapped for its replacement.

    Only complete tokens are rewritten, so a key that merely occurs inside
    a longer word (``awsome`` in ``awsomeness``) leaves that word alone.
    The whitespace between tokens is preserved.

    Args:
        x: Text to fix.

    Returns:
        The corrected text.
    """
    x = re.sub(r'\b(?:s+o+)+\b', 'so', x)

    def _swap(match):
        token = match.group(0)
        return misspelling.get(token, token)

    # \S+ yields the same tokens as str.split() but keeps the separators.
    return re.sub(r'\S+', _swap, x)

In [13]:
# Apply the "so" / misspelling fixes in place, then recount the vocabulary.
train_df["text"] = train_df["text"].progress_apply(fix_word)
sentences = train_df["text"].apply(lambda tweet: tweet.split())
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 65319.18it/s]
100%|█████████████████████████████████████████████████████████████████████████| 24732/24732 [00:00<00:00, 159297.42it/s]


In [14]:
# Re-check coverage after the misspelling fixes and show the ten most
# frequent words that still have no embedding.
oov = check_coverage(vocab,embeddings_index)
oov[:10]

100%|█████████████████████████████████████████████████████████████████████████| 25548/25548 [00:00<00:00, 201836.25it/s]

Found embeddings for 68.31% of vocab
Found embeddings for  96.93% of all text





[('iï¿½m', 26),
 ('thanx', 25),
 ('ï¿½', 20),
 ('tweeps', 15),
 ('2moro', 14),
 ('hadnt', 13),
 ('tooo', 13),
 ('rofl', 12),
 ('hehehe', 12),
 ('gunna', 11)]

In [15]:
# Remove the remaining OOV words
def delete_oov(x, oov):
    """Blank out every whole word of ``x`` that is listed in ``oov``.

    Only complete whitespace-delimited tokens are removed; an OOV word that
    merely occurs inside a longer word leaves that word intact.  The
    surrounding whitespace is kept, so a removed word leaves its separators
    behind.

    Args:
        x: Text to filter.
        oov: Iterable of ``(word, count)`` pairs, e.g. the list returned by
            ``check_coverage``; only the first item of each pair is used.

    Returns:
        ``x`` with the OOV tokens replaced by the empty string.
    """
    # One set lookup per token instead of scanning the whole OOV list.
    oov_words = {entry[0] for entry in oov}
    if not oov_words:
        return x

    def _drop(match):
        token = match.group(0)
        return '' if token in oov_words else token

    return re.sub(r'\S+', _drop, x)

def preprocess(text):
    """Run the full cleaning pipeline on one text and return the cleaned string.

    Steps, in order: lowercase, strip URLs, normalise punctuation, collapse
    laughter words, fix known misspellings.  The result is still a single
    string; splitting into tokens is left to the caller.
    """
    cleaned = text.lower()
    for step in (remove_url, clean_text, fix_laughing_words, fix_word):
        cleaned = step(cleaned)
    return cleaned

def tokenize_fn(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split(sep=None)
    return tokens

def embedding_fn(sentence, max_sentence=10, embedding_dim=100):
    """Turn a token list into one fixed-length vector of word embeddings.

    The first ``max_sentence`` tokens are looked up in the global
    ``embeddings_index``; each vector fills one row of a
    ``(max_sentence, embedding_dim)`` matrix.  Rows for unknown words, and
    rows past the end of a short sentence, stay zero.  The matrix is then
    flattened row by row.

    Args:
        sentence: Sequence of tokens (supports slicing, e.g. a list).
        max_sentence: Number of leading tokens to embed; longer sentences
            are truncated, shorter ones zero-padded.
        embedding_dim: Length of each embedding vector; must match the
            vectors stored in ``embeddings_index``.

    Returns:
        1-D ``numpy.ndarray`` of length ``max_sentence * embedding_dim``
        with NumPy's default float dtype.
    """
    result = np.zeros((max_sentence, embedding_dim))
    for idx, word in enumerate(sentence[:max_sentence]):
        try:
            result[idx, :] = embeddings_index[word]
        except KeyError:
            # Unknown word: leave its row as zeros.  Anything else (for
            # example a vector of the wrong size) is a real error.
            continue
    return result.flatten()

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class TextDataset(Dataset):
    """Map-style dataset that reads a CSV of texts and vectorises rows on demand.

    The CSV must have a ``text`` column.  If it also has a ``sentiment``
    column, a numeric ``label`` column is derived from it
    (neutral=0, positive=1, negative=2) and items are ``(x, label)`` pairs;
    otherwise items are just ``x``.

    Args:
        dataset_file: Path (or file-like object) handed to ``pd.read_csv``.
        tokenize_fn: Callable turning a text string into a token sequence.
        vectorizer_fn: Callable turning the tokens into a ``numpy.ndarray``.
        preprocess_fn: Optional callable applied to the text before
            tokenising.
        sparse: When True, each tensor is converted with ``to_sparse()``.
    """

    def __init__(self, dataset_file,
                 tokenize_fn,
                 vectorizer_fn,
                 preprocess_fn=None,
                 sparse=False):
        self.dataset_file = dataset_file

        self.preprocess_fn = preprocess_fn
        self.tokenize_fn = tokenize_fn
        self.vectorizer_fn = vectorizer_fn
        self.sparse = sparse

        self.df = pd.read_csv(dataset_file)

        target_conversion = {
            'neutral': 0,
            'positive': 1,
            'negative': 2
        }
        if 'sentiment' in self.df:
            self.df["label"] = self.df['sentiment'].map(target_conversion)

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the vectorised sample at position ``idx``.

        Returns:
            ``x`` (a tensor built from ``vectorizer_fn``'s array) when the
            data has no label column, otherwise the pair ``(x, label)``.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        # Positional access raises IndexError past the last row, which is
        # what lets ``for sample in ds`` / ``list(ds)`` stop cleanly.
        text = self.df["text"].iloc[idx]
        label = self.df["label"].iloc[idx] if 'label' in self.df else None

        # pandas reads an empty CSV field as NaN (a float); treat it as an
        # empty text so the string-based callables below do not crash.
        if pd.isna(text):
            text = ""

        if self.preprocess_fn is not None:
            text = self.preprocess_fn(text)

        tokens = self.tokenize_fn(text)
        x = self.vectorizer_fn(tokens)

        x = torch.from_numpy(x)
        if self.sparse:
            x = x.to_sparse()

        if label is None:
            return x
        else:
            return x, label

In [17]:
# Build the dataset from the raw CSV. Cleaning, tokenising and embedding are
# not done here; they run per item inside TextDataset.__getitem__.
ds = TextDataset("../data/raw/train.csv",
                 preprocess_fn=preprocess,
                 tokenize_fn=tokenize_fn,
                 vectorizer_fn=embedding_fn)

In [18]:
# Inspect the first sample produced by the dataset.
ds[0]

(tensor([-3.0769e-02,  1.1993e-01,  5.3909e-01, -4.3696e-01, -7.3937e-01,
         -1.5345e-01,  8.1126e-02, -3.8559e-01, -6.8797e-01, -4.1632e-01,
         -1.3183e-01, -2.4922e-01,  4.4100e-01,  8.5919e-02,  2.0871e-01,
         -6.3582e-02,  6.2228e-02, -5.1234e-02, -1.3398e-01,  1.1418e+00,
          3.6526e-02,  4.9029e-01, -2.4567e-01, -4.1200e-01,  1.2349e-01,
          4.1336e-01, -4.8397e-01, -5.4243e-01, -2.7787e-01, -2.6015e-01,
         -3.8485e-01,  7.8656e-01,  1.0230e-01, -2.0712e-01,  4.0751e-01,
          3.2026e-01, -5.1052e-01,  4.8362e-01, -9.9498e-03, -3.8685e-01,
          3.4975e-02, -1.6700e-01,  4.2370e-01, -5.4164e-01, -3.0323e-01,
         -3.6983e-01,  8.2836e-02, -5.2538e-01, -6.4531e-02, -1.3980e+00,
         -1.4873e-01, -3.5327e-01, -1.1180e-01,  1.0912e+00,  9.5864e-02,
         -2.8129e+00,  4.5238e-01,  4.6213e-01,  1.6012e+00, -2.0837e-01,
         -2.7377e-01,  7.1197e-01, -1.0754e+00, -4.6974e-02,  6.7479e-01,
         -6.5839e-02,  7.5824e-01,  3.