In [16]:
import csv
import json
import os
import requests
import io
import numpy as np
from zipfile import ZipFile

In [14]:
# Directory (relative to the notebook) that holds the input data files.
DATASET_DIR = '../datafiles'
# Remote location of the zipped GloVe vectors.
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
# File name of the GloVe zip archive.
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
# File name of the GloVe text file.
GLOVE_FILE = 'glove.840B.300d.txt'

### Extraction of Question Pair Data

In [3]:
def makedataset(filepath, delimiter, trainfile = False):
    """Read the question-pair columns from a delimited text file.

    Parameters
    ----------
    filepath : str
        Path of the UTF-8 encoded file. It must have a header row containing
        ``question1`` and ``question2`` (and ``is_duplicate`` when
        ``trainfile`` is true).
    delimiter : str
        Single-character field separator handed to ``csv.DictReader``.
    trainfile : bool, optional
        When true, the ``is_duplicate`` column is collected as well.

    Returns
    -------
    tuple of (list, list, list)
        ``question1``, ``question2`` and ``is_duplicate`` values in file
        order. All values are the raw strings produced by the csv reader;
        ``is_duplicate`` is an empty list unless ``trainfile`` is true.

    Raises
    ------
    KeyError
        If a required column is missing from the header.
    """
    print('Preprocessing {} file'.format(filepath))
    question1 = []
    question2 = []
    is_duplicate = []
    # newline='' is what the csv module asks for: it lets the reader handle
    # line endings itself, so line breaks embedded in quoted fields are
    # returned unchanged. The with-block closes the file on exit.
    with open(filepath, encoding='utf-8', newline='') as questionfile:
        reader = csv.DictReader(questionfile, delimiter = delimiter)
        for row in reader:
            question1.append(row['question1'])
            question2.append(row['question2'])

            if trainfile:
                is_duplicate.append(row['is_duplicate'])

    print('Processed number of Questions : {}'.format(len(question1) + len(question2)))
    return question1, question2, is_duplicate

In [4]:
# trainfile=True is required for the is_duplicate labels to be collected;
# without it the third return value is always an empty list.
question1, question2, is_duplicate = makedataset(os.path.join(DATASET_DIR, 'train.csv'), delimiter = ',', trainfile = True)

Preprocessing ../datafiles\train.csv file
Processed number of Questions : 808580


In [5]:
import nltk

In [6]:
class vocab_builder():
    """Word <-> integer index vocabulary built from a collection of sentences.

    Sentences are lower-cased and tokenized with ``nltk.tokenize.word_tokenize``.
    Each distinct token gets the next free integer index, starting at 0, in
    order of first appearance.

    Attributes are plain containers kept in sync by ``add_word``:
    ``word2ind`` (token -> index), ``ind2word`` (index -> token),
    ``word_bag`` (tokens in index order) and ``index`` (next free index).
    """

    def __init__(self, sentences):
        """Create the vocabulary from an iterable of sentence strings."""
        self.word2ind = {}
        self.ind2word = {}
        self.word_bag = []
        self.index = 0
        self.build_vocab(sentences)

    @staticmethod
    def _tokenize(sentence):
        """Lower-case ``sentence`` and split it into tokens with NLTK."""
        return nltk.tokenize.word_tokenize(sentence.lower())

    def build_vocab(self, sentences):
        """Add every token of every sentence to the vocabulary.

        Tokens are registered sentence by sentence, so no list of all tokens
        in the corpus has to be held in memory; the resulting indices are the
        same as when adding all tokens in reading order.
        """
        for sentence in sentences:
            for word in self._tokenize(sentence):
                self.add_word(word)

    def add_word(self, word):
        """Register ``word`` under the next free index; known words are ignored."""
        if word not in self.word2ind:
            self.word2ind[word] = self.index
            self.ind2word[self.index] = word
            self.index += 1
            self.word_bag.append(word)

    def get_index(self, word):
        """Return the index of ``word``.

        For an unknown word a message is printed and ``None`` is returned.
        """
        if word in self.word2ind:
            return self.word2ind[word]
        print('Word not in Vocabulary')
        return None

    def get_word(self, index):
        """Return the token stored at ``index``; raises ``KeyError`` if unknown."""
        return self.ind2word[index]

    def get_sentence(self, index_list):
        """Join the tokens for ``index_list`` with single spaces.

        The result has no leading or trailing space; an empty ``index_list``
        gives an empty string. Raises ``KeyError`` for an unknown index.
        """
        return " ".join(self.get_word(curr_ind) for curr_ind in index_list)

    def get_tokenizers(self):
        """Return the list of known tokens in index order (not a copy)."""
        return self.word_bag

    def text_to_index(self, sentences):
        """Convert each sentence to a list of token indices.

        Returns one list per sentence. Tokens missing from the vocabulary
        appear as ``None`` (see ``get_index``).
        """
        return [
            [self.get_index(word) for word in self._tokenize(sent)]
            for sent in sentences
        ]

### Building tokenized Word Index

In [7]:
# Pool both question columns so a single vocabulary covers the tokens of either column.
questions = question1 + question2
vocab = vocab_builder(questions)

In [8]:
# Convert every question into a list of vocabulary indices (one list per question).
question1_word_tokenizer = vocab.text_to_index(question1)
question2_word_tokenizer = vocab.text_to_index(question2)

In [9]:
# Index sequence produced for the first entry of question1.
question1_word_tokenizer[0]

[0, 1, 2, 3, 4, 3, 5, 6, 7, 8, 9, 10, 8, 11, 12]

In [10]:
# Number of tokens in the first entry of question1.
len(question1_word_tokenizer[0])

15

In [11]:
# Index sequence produced for the second entry of question1.
question1_word_tokenizer[1]

[0, 1, 2, 13, 14, 15, 16, 17, 18, 19, 12]

In [12]:
# Number of tokens in the second entry of question1.
len(question1_word_tokenizer[1])

11

In [13]:
# Peek at the first five word -> index pairs, in the order the words were added.
dict(list(vocab.word2ind.items())[0:5])

{'what': 0, 'is': 1, 'the': 2, 'step': 3, 'by': 4}

### Downloading and Processing of GloVe Embeddings

In [21]:
def gloveEmbedding(filepath, filename, fileURL=None, embedding_dim=None):
    """Load a GloVe-style text file into a ``{token: vector}`` dictionary.

    If the file is not present and ``fileURL`` is given, the zip archive at
    that URL is downloaded and extracted into ``filepath`` first.

    Parameters
    ----------
    filepath : str
        Directory that contains (or should receive) the embedding file.
    filename : str
        Name of the embedding text file inside ``filepath``.
    fileURL : str, optional
        URL of a zip archive containing ``filename``. Only used when the
        file does not exist yet.
    embedding_dim : int, optional
        Number of values per vector. When omitted it is inferred from the
        first non-empty line, assuming that line's token contains no space.

    Returns
    -------
    dict
        Maps each token to a 1-D ``numpy.float32`` array.

    Raises
    ------
    FileNotFoundError
        If the file is missing and no ``fileURL`` was supplied (or the
        archive did not contain ``filename``).
    requests.HTTPError
        If the download answers with an error status.
    """
    glove_path = os.path.join(filepath, filename)
    if not os.path.exists(glove_path):
        if fileURL:
            print('-' * 100)
            print('File Not Exists!! Downloading it from the Server.....')
            # NOTE: the whole archive is held in memory before extraction.
            req = requests.get(fileURL)
            # Fail here on an HTTP error instead of handing an error page to ZipFile.
            req.raise_for_status()
            print('File Downloaded! Extracting it .....')
            with ZipFile(io.BytesIO(req.content)) as zipcontent:
                # Extract next to the expected file, not into the current
                # working directory, so the open() below can find it.
                zipcontent.extractall(filepath)
            print('-' * 100)
        else:
            print('No Path Specified')
            raise FileNotFoundError(
                '{} does not exist and no download URL was given'.format(glove_path))
    print('Processing the GLoVE File .....')
    print('-' * 100)
    embeddings_index = {}

    with open(glove_path, encoding = 'utf-8') as encoding_file:
        for line in encoding_file:
            # Drop the trailing newline so it does not end up in the last value.
            line = line.rstrip()
            if not line:
                continue
            if embedding_dim is None:
                embedding_dim = line.count(' ')
            # Split from the right: the token itself may contain spaces, the
            # trailing embedding_dim fields are always the vector values.
            value = line.rsplit(' ', embedding_dim)
            word = value[0]
            embedding_val = np.asarray(value[1:], dtype = np.float32)
            embeddings_index[word] = embedding_val
    print('Length of Word Embedding : {}'.format(len(embeddings_index)))
    print('-' * 100)
    return embeddings_index

In [None]:
# Arguments are (directory, file name, download URL). The result is bound to a
# name so the notebook does not try to render the whole dictionary as output.
embeddings_index = gloveEmbedding(DATASET_DIR, GLOVE_FILE, GLOVE_ZIP_FILE_URL)