In [13]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 18 15:06:40 2017

@author: nileshbhoyar
"""
import numpy as np
import pandas as pd
import nltk
import random
import sys
import itertools
from collections import defaultdict
import pickle

EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz ' # space is included in whitelist
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''
MAX_REVIEWS = 10
FILENAME = '/Users/nileshbhoyar/Documents/W266Project/data/finefoods.txt'

limit = {
        'maxreview' : 200,
        'minreview' : 0,
        'maxsummary' : 100,
        'minsummary' : 3
        }
UNK = 'unk'
VOCAB_SIZE = 20000000
##
def __crawl_review(raw_data_file):
    
        """
        Crawl review
        :return: review [numpy array]
        """
        review_list = []
        print 'Crawling Reviews....'
        num_lines = 0
        with open(raw_data_file) as infile:
            for line in infile:
                if line.startswith('review/text'):
                    if num_lines >= MAX_REVIEWS:
                        break
                    num_lines += 1
                    _,review = line.split('/text: ')
                    review_list.append(review)
        return np.array(review_list)
    
    
def __crawl_summary(raw_data_file):
        """
        Crawl summary
        :return: summary [numpy array]
        """
        summary_list = []
        print 'Crawling Summary....'
        num_lines = 0
        with open(raw_data_file) as infile:
            for line in infile:
                if line.startswith('review/summary'):
                    if num_lines >= MAX_REVIEWS:
                        break
                    num_lines += 1
                    _,summary = line.split('/summary: ')
                    summary_list.append(summary)
        return np.array(summary_list)
    
#tokenize sentenses here both review + summary
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
        freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        # get vocabulary of 'vocab_size' most used words
        vocab = freq_dist.most_common(vocab_size)
        # index2word
        index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
        word2index = dict([(w,i) for i,w in enumerate(index2word)] )
        return index2word, word2index, freq_dist

def pad_seq(seq, lookup, maxlen):
       indices = []
       for word in seq:
           if word in lookup:
               indices.append(lookup[word])
           else:
               indices.append(lookup[UNK])
       return indices + [0]*(maxlen - len(seq))
#zero pad
def zero_pad(qtokenized, atokenized, w2idx):
    # num of rows
        data_len = len(qtokenized)

    # numpy arrays to store indices
        idx_review = np.zeros([data_len, limit['maxreview']], dtype=np.int32) 
        idx_summary = np.zeros([data_len, limit['maxsummary']], dtype=np.int32)

        for i in range(data_len):
            q_indices = pad_seq(qtokenized[i], w2idx, limit['maxreview'])
            a_indices = pad_seq(atokenized[i], w2idx, limit['maxsummary'])

        #print(len(idx_q[i]), len(q_indices))
        #print(len(idx_a[i]), len(a_indices))
            idx_review[i] = np.array(q_indices)
            idx_summary[i] = np.array(a_indices)

        return idx_review, idx_summary



In [14]:
df = pd.DataFrame()
df['Review'] = __crawl_review(FILENAME)
df['Summary'] =__crawl_summary(FILENAME)
qtokenized = [ wordlist.split(' ') for wordlist in df['Review'] ]
atokenized = [ wordlist.split(' ') for wordlist in df['Summary'] ]
print('\n >> Index words')
idx2w, w2idx, freq_dist = index_( qtokenized + atokenized, vocab_size=VOCAB_SIZE)
print('\n >> Zero Padding')
idx_q, idx_a = zero_pad(qtokenized, atokenized, w2idx)
print idx_q[1]

    

Crawling Reviews....
Crawling Summary....

 >> Index words

 >> Zero Padding
[254 270 231  16 111 276 273  90  46  86  39 124 138  23 265  61   5   9
 246 235 184  61   7  97 266  17  65   7  31  16 245   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [17]:
def split_dataset(x, y, ratio = [0.7, 0.15, 0.15] ):
    # number of examples
    data_len = len(x)
    lens = [ int(data_len*item) for item in ratio ]

    trainX, trainY = x[:lens[0]], y[:lens[0]]
    testX, testY = x[lens[0]:lens[0]+lens[1]], y[lens[0]:lens[0]+lens[1]]
    validX, validY = x[-lens[-1]:], y[-lens[-1]:]

    return (trainX,trainY), (testX,testY), (validX,validY)

In [18]:
def load_data(PATH=''):
    # read data control dictionaries
    with open(PATH + '/Users/nileshbhoyar/Documents/W266Project/datasets/metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    # read numpy arrays
    idx_q = np.load(PATH + '/Users/nileshbhoyar/Documents/W266Project/datasets/idx_review.npy')
    idx_a = np.load(PATH + '/Users/nileshbhoyar/Documents/W266Project/datasets/idx_summary.npy')
    return metadata, idx_q, idx_a


In [21]:
metadata, idx_q, idx_a = load_data()
(trainX, trainY), (testX, testY), (validX, validY) = split_dataset(idx_q, idx_a)

In [22]:
trainX.shape

(7, 200)