In [66]:
# import libraries
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import nltk 
import spacy
import string
from gensim.models import KeyedVectors
from tqdm import tqdm_notebook

In [43]:
# Reserved vocabulary ids: 0 marks padding, 1 marks an unknown token
PAD_IDX, UNK_IDX = 0, 1
# Number of examples per mini-batch
BATCH_SIZE = 32
# Fix the seed of Python's built-in RNG so runs are repeatable
random.seed(134)

# Data Processing

### Build vocab from pretrained word embeddings

In [109]:
# build vocab from fasttext embeddings 

def build_vocab(word2vec_source, max_vocab_size):
    """ Build vocabulary lookups and an embedding table from a pretrained word2vec-format file.

        Args:
        - word2vec_source: path to the pretrained vectors in word2vec text format
        - max_vocab_size: maximum number of vectors to read from the file (passed as `limit`)

        Returns:
        - id2token: list of tokens, where id2token[i] is the token with index i;
          indices 0 and 1 hold the special tokens '<pad>' and '<unk>'
        - token2id: dictionary mapping each pretrained token to its index (indices start at 2;
          the special tokens are not included)
        - word_emb: dictionary mapping token index -> pretrained embedding vector
          (no entries for the '<pad>' / '<unk>' indices)
    """
    word2vec_model = KeyedVectors.load_word2vec_format(word2vec_source, limit=max_vocab_size)

    # gensim 4 renamed `index2word` to `index_to_key` and removed `.vocab`;
    # prefer the new attribute and fall back to the old one so both versions work.
    if hasattr(word2vec_model, 'index_to_key'):
        pretrained_tokens = list(word2vec_model.index_to_key)
    else:
        pretrained_tokens = list(word2vec_model.index2word)

    # Indices 0 and 1 are reserved for '<pad>' and '<unk>', so real tokens start at 2.
    token2id = dict(zip(pretrained_tokens, range(2, 2 + len(pretrained_tokens))))
    id2token = ['<pad>', '<unk>'] + pretrained_tokens

    # Iterate over token2id rather than `model.vocab` so no version-specific attribute is needed.
    word_emb = {idx: word2vec_model[token] for token, idx in token2id.items()}

    return id2token, token2id, word_emb

In [107]:
# Read at most 50000 vectors from the pretrained file and build the lookups
id2token, token2id, word_emb = build_vocab(
    word2vec_source='fasttext_word2vec/wiki-news-300d-1M.vec',
    max_vocab_size=50000,
)

### Load data and convert to indices

In [114]:
# Read the four tab-separated splits (SNLI / MNLI, train / val) into DataFrames
snli_train, snli_val, mnli_train, mnli_val = map(
    pd.read_table,
    (
        'hw2_data/snli_train.tsv',
        'hw2_data/snli_val.tsv',
        'hw2_data/mnli_train.tsv',
        'hw2_data/mnli_val.tsv',
    ),
)
# Report the number of rows in each split
print(*map(len, (snli_train, snli_val, mnli_train, mnli_val)))

100000 1000 20000 5000


In [115]:
# Preview the first rows of the SNLI training frame to check its columns
snli_train.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [143]:
# helper methods to convert pandas df to lists of word indices and labels 

# Maps the textual label of each sentence pair to an integer code.
# NOTE(review): 'contradiction' is coded as -1; if these codes are later used
# as class indices (which must be non-negative) they will need remapping — confirm downstream.
label_dict = {
    'neutral': 0,
    'entailment': 1,
    'contradiction': -1,
}

def token_to_index_datum(datum_tokens, token2id):
    """ Maps the tokens of one datum to their vocabulary indices.

        Tokens that are not keys of token2id are replaced by UNK_IDX.
        Returns a new list with one index per input token, in the same order.
    """
    indices = []
    for tok in datum_tokens:
        if tok in token2id:
            indices.append(token2id[tok])
        else:
            # out-of-vocabulary token
            indices.append(UNK_IDX)
    return indices

def df_to_list(data_df, token2id):
    """ Converts a train/val DataFrame into index lists and integer labels.

        Reads the 'sentence1', 'sentence2' and 'label' columns of data_df and returns a tuple of:
        - list of lists of word indices for the first sentence of each row
        - list of lists of word indices for the second sentence of each row
        - list of integer labels obtained by looking each 'label' value up in label_dict

        Sentences are tokenized by splitting on whitespace.
    """
    def encode_column(column):
        # whitespace-tokenize every sentence in the column, then map tokens to indices
        return [token_to_index_datum(text.split(), token2id) for text in data_df[column].tolist()]

    sent1 = encode_column('sentence1')
    sent2 = encode_column('sentence2')

    labels = []
    for raw_label in data_df['label'].tolist():
        labels.append(label_dict[raw_label])

    return sent1, sent2, labels

In [144]:
# Turn every split into (sentence1 indices, sentence2 indices, labels)
snli_train_sent1, snli_train_sent2, snli_train_labels = df_to_list(
    data_df=snli_train, token2id=token2id)
snli_val_sent1, snli_val_sent2, snli_val_labels = df_to_list(
    data_df=snli_val, token2id=token2id)
mnli_train_sent1, mnli_train_sent2, mnli_train_labels = df_to_list(
    data_df=mnli_train, token2id=token2id)
mnli_val_sent1, mnli_val_sent2, mnli_val_labels = df_to_list(
    data_df=mnli_val, token2id=token2id)