In [66]:
# import libraries
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import nltk 
import spacy
import string
from gensim.models import KeyedVectors
from tqdm import tqdm_notebook

In [43]:
# Seed every random number generator this notebook imports, not only Python's
# `random`, so that a Restart & Run All reproduces the same results.
SEED = 134
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Reserved vocabulary indices; they match the positions of '<pad>' and '<unk>'
# at the front of the `id2token` list returned by `build_vocab`.
PAD_IDX = 0
UNK_IDX = 1

# Batch size (presumably consumed by data loaders defined later; not used in the visible cells).
BATCH_SIZE = 32

# Data Processing

### Build vocab from pretrained word embeddings

In [109]:
# build vocab from fasttext embeddings 

def build_vocab(word2vec_source, max_vocab_size):
    """Build a vocabulary and an embedding lookup from a pretrained word2vec-format file.

    At most ``max_vocab_size`` vectors are read from ``word2vec_source``. Index 0 is
    reserved for '<pad>' and index 1 for '<unk>'; pretrained tokens are numbered from 2
    upwards in the order the file lists them.

    Args:
        word2vec_source: path to an embeddings file in word2vec format.
        max_vocab_size: maximum number of vectors to load (forwarded as ``limit``).

    Returns:
        id2token: list of tokens, where ``id2token[i]`` is the token with index ``i``
            (positions 0 and 1 hold '<pad>' and '<unk>').
        token2id: dict mapping each token to its index. The two special tokens are
            included so the mapping is the inverse of ``id2token``.
        word_emb: dict mapping token *index* to its pretrained vector. The special
            tokens have no pretrained vector and therefore no entry.
    """
    special_tokens = ['<pad>', '<unk>']
    offset = len(special_tokens)

    word2vec_model = KeyedVectors.load_word2vec_format(word2vec_source, limit=max_vocab_size)

    # gensim 4 renamed `index2word` to `index_to_key` and removed `.vocab`;
    # support both so the notebook runs on either major version.
    if hasattr(word2vec_model, 'index_to_key'):
        pretrained_tokens = list(word2vec_model.index_to_key)
    else:
        pretrained_tokens = list(word2vec_model.index2word)

    id2token = special_tokens + pretrained_tokens

    token2id = {token: idx for idx, token in enumerate(pretrained_tokens, start=offset)}
    # Register the special tokens as well. `setdefault` keeps the pretrained index in
    # the unlikely case that the embeddings file itself contains '<pad>' or '<unk>'.
    for idx, token in enumerate(special_tokens):
        token2id.setdefault(token, idx)

    # Iterate the ordered token list rather than `.vocab` (same set of words).
    word_emb = {token2id[token]: word2vec_model[token] for token in pretrained_tokens}

    return id2token, token2id, word_emb

In [107]:
# Build the vocabulary and embedding lookup from the pretrained vectors file,
# keeping only the first 50,000 entries.
id2token, token2id, word_emb = build_vocab(
    word2vec_source='fasttext_word2vec/wiki-news-300d-1M.vec',
    max_vocab_size=50000,
)

In [63]:
# Read the tab-separated training file into a DataFrame and preview its first rows.
train = pd.read_csv('hw2_data/snli_train.tsv', sep='\t')
train.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [64]:
# Pull both sentence columns out of the training frame as plain Python lists.
train_sent1, train_sent2 = (
    train[column].tolist() for column in ('sentence1', 'sentence2')
)

# Tokenize on whitespace only (the sample rows above appear to be pre-tokenized,
# with punctuation already separated by spaces).
train_sent1_tokens, train_sent2_tokens = (
    [sentence.split() for sentence in sentences]
    for sentences in (train_sent1, train_sent2)
)

# Flatten to a single token stream: every sentence1 token first, then every sentence2 token.
all_train_tokens = [
    token
    for tokenized_sentences in (train_sent1_tokens, train_sent2_tokens)
    for sentence_tokens in tokenized_sentences
    for token in sentence_tokens
]
len(all_train_tokens)

2228065

In [65]:
# Stale cell, disabled. It targets an older `build_vocab(tokens, max_vocab_size)` that
# returned two values. The `build_vocab` defined in this notebook expects a path to a
# word2vec-format file and returns three values (id2token, token2id, word_emb), so this
# call cannot run against it, and its swapped unpacking order would overwrite the
# vocabulary already built from the pretrained embeddings.
# token2id, id2token = build_vocab(all_train_tokens, max_vocab_size = 10000)

In [84]:
# Display the embedding vector stored for the token 'the'.
# NOTE(review): `ft_word2vec` is not assigned in any visible cell; presumably it is a
# gensim KeyedVectors model loaded in a cell that was later removed. Confirm it is
# defined before this cell runs on a fresh kernel.
ft_word2vec['the']

array([ 8.970e-02,  1.600e-02, -5.710e-02,  4.050e-02, -6.960e-02,
       -1.237e-01,  3.010e-02,  2.480e-02, -3.030e-02,  1.740e-02,
        6.300e-03,  1.840e-02,  2.170e-02, -2.570e-02,  3.500e-02,
       -2.420e-02,  2.900e-03,  1.880e-02, -5.700e-02,  2.520e-02,
       -2.100e-02, -8.000e-04,  3.600e-02, -7.290e-02, -6.650e-02,
        9.890e-02,  6.760e-02,  8.520e-02, -8.900e-03,  3.130e-02,
       -6.900e-03, -3.200e-03, -4.620e-02,  4.970e-02,  2.610e-02,
        2.680e-02, -3.100e-02, -1.361e-01, -6.200e-03,  3.750e-02,
       -3.200e-02, -1.060e-02,  5.340e-02, -1.870e-02,  6.380e-02,
        9.400e-03,  4.700e-03, -5.300e-02,  9.300e-03, -8.700e-03,
        4.000e-04,  4.930e-02, -6.296e-01,  2.220e-02,  1.900e-02,
        2.680e-02, -4.260e-02,  5.700e-03, -1.683e-01,  2.440e-02,
       -2.130e-02, -1.810e-02,  4.210e-02, -3.090e-02, -8.900e-03,
        3.200e-03,  1.080e-02, -4.900e-03,  2.580e-02,  2.780e-02,
       -1.630e-02,  2.000e-02,  1.640e-02, -9.540e-02, -3.200e

In [76]:
# Collect the embedding vocabulary in its stored order so that `ft_words` actually
# holds tokens (it was previously left as an empty list even though a later cell
# slices it).
# NOTE(review): `ft_word2vec` is not assigned in any visible cell; presumably it is a
# gensim (< 4) KeyedVectors model, since `.vocab` is used. Confirm before relying on this.
ft_words = list(ft_word2vec.vocab)

# Print only the first vector as a quick sanity check.
if ft_words:
    print(ft_word2vec[ft_words[0]])

[ 1.0730e-01  8.9000e-03  6.0000e-04  5.5000e-03 -6.4600e-02 -6.0000e-02
  4.5000e-02 -1.3300e-02 -3.5700e-02  4.3000e-02 -3.5600e-02 -3.2000e-03
  7.3000e-03 -1.0000e-04  2.5800e-02 -1.6600e-02  7.5000e-03  6.8600e-02
  3.9200e-02  7.5300e-02  1.1500e-02 -8.7000e-03  4.2100e-02  2.6500e-02
 -6.0100e-02  2.4200e-01  1.9900e-02 -7.3900e-02 -3.1000e-03 -2.6300e-02
 -6.2000e-03  1.6800e-02 -3.5700e-02 -2.4900e-02  1.9000e-02 -1.8400e-02
 -5.3700e-02  1.4200e-01  6.0000e-02  2.2600e-02 -3.8000e-03 -6.7500e-02
 -3.6000e-03 -8.0000e-03  5.7000e-02  2.0800e-02  2.2300e-02 -2.5600e-02
 -1.5300e-02  2.2000e-03 -4.8200e-02  1.3100e-02 -6.0160e-01 -8.8000e-03
  1.0600e-02  2.2900e-02  3.3600e-02  7.1000e-03  8.8700e-02  2.3700e-02
 -2.9000e-02 -4.0500e-02 -1.2500e-02  1.4700e-02  4.7500e-02  6.4700e-02
  4.7400e-02  1.9900e-02  4.0800e-02  3.2200e-02  3.6000e-03  3.5000e-02
 -7.2300e-02 -3.0500e-02  1.8400e-02 -2.6000e-03  2.4000e-02 -1.6000e-02
 -3.0800e-02  4.3400e-02  1.4700e-02 -4.5700e-02 -2

In [73]:
# Show the first five entries of `ft_words`.
# NOTE(review): what this displays depends on how `ft_words` was populated; confirm the
# cell that fills it runs before this one on a fresh kernel.
ft_words[:5]

[',', 'the', '.', 'and', 'of']