In [17]:
# 0. import packages
import spacy
import os
from collections import Counter
import glob

from spacy import attrs
import numpy as np
vocab_size = 50000
batch_size = 1000

In [18]:
# 1. load nlp model & files to read
nlp = spacy.load('en') # loads default English object
cnn_dir = '../cnn_stories_tokenized/'
cnn_pre_dir = '../cnn_stories_final/'
file_list = [os.path.join(cnn_dir,file) for file in os.listdir(cnn_dir)]

In [19]:
import numpy as np
import spacy
import torch
from torch.autograd import Variable
import os
from collections import Counter
import torch
import glob
from spacy import attrs


In [20]:

def parse_cnn(file_dir, nlp):
	with open(file_dir,encoding='utf-8') as f:
		text = f.read()
		text = text.lower()
		text = text.replace('\n\n',' ')
		text = text.split("@highlight")
		body = nlp(text[0])
		body_words = [x.text for x in list(body)]
		summaries = text[1:]
		summaries = ' '.join([x+'.' for x in summaries])
		summaries = nlp(summaries)
		summary_words = [x.text for x in list(summaries)]
		return body_words, summary_words

def word_list_to_idx_list(word_list, word2idx, vocab_size):
	out = []
	oov2idx = dict()
	oov_words = []
	for word in word_list:
		try:
			out.append(word2idx[word])
		except KeyError:
			if word not in oov2idx:
				oov2idx[word]=vocab_size+len(oov2idx)
			out.append(oov2idx[word])
	return out

def calc_running_avg_loss(loss, running_avg_loss, step, decay=0.99):
	if running_avg_loss==0:
		running_avg_loss = loss
	else:
		running_avg_loss = running_avg_loss * decay + (1-decay) * loss
	running_avg_loss = min(running_avg_loss,12) # clip
	return running_avg_loss

def to_cuda(item):
	if torch.cuda.is_available():
		return item.cuda()
	else:
		return item

def num_to_var(item):
	# numpy array to Variable
	if item.dtype==int:
		out = Variable(torch.LongTensor(item))
	else:
		out = Variable(torch.Tensor(item))
	return to_cuda(out)
		
body_list = []
summary_list = []
counter = Counter()
batch_no = 0


In [None]:
while batch_no<len(file_list):
    batch = file_list[batch_no:min(batch_no + batch_size,len(file_list))]
    count = 0
    for file in batch:
        print(count/len(batch),end="\r")
        count+=1
        body_words, summary_words = parse_cnn(file,nlp)
        body_list.extend(body_words)
        summary_list.extend(summary_words)
    print("summary created")
    c = Counter(body_list+summary_list)
    counter = counter + c
    vocab_list = counter.most_common(vocab_size)
    print("most common")
    word2idx = dict()
    word2idx['<PAD>']=0
    word2idx['<S>']=1
    word2idx['</S>']=2
    word2idx['<UNK>']=3
    idx2word = dict()
    idx2word[0] = '<PAD>'
    idx2word[1] = '<S>'
    idx2word[2] = '</S>'
    idx2word[3] = '<UNK>'
    for i,(word,_) in enumerate(vocab_list):
        if len(word2idx)>vocab_size:
            break
        word2idx[word] = i+4
        idx2word[i+4] = word
    np.save('word2idx.npy',word2idx)
    np.save('idx2word.npy',idx2word)
    batch_no+=batch_size
    print("Vocabulary created from %d/%d files, top %d words saved" 
          %(batch_no,len(file_list),len(word2idx)))

summary created
most common
Vocabulary created from 1000/92579 files, top 32730 words saved
summary created
most common
Vocabulary created from 2000/92579 files, top 45516 words saved
summary created
most common
Vocabulary created from 3000/92579 files, top 50001 words saved
summary created
most common
Vocabulary created from 4000/92579 files, top 50001 words saved
summary created
most common
Vocabulary created from 5000/92579 files, top 50001 words saved
0.995

In [28]:
w2i = np.load('word2idx.npy').item()
i2w = np.load('idx2word.npy').item()
v = len(w2i)
# 3. preprocess each document in CNN so that we get a form where a text is seen in vectors
out_file_list = [os.path.join(cnn_pre_dir,file) for file in os.listdir(cnn_dir)]
in_out_zip = zip(file_list, out_file_list)
cnt = 0
for in_file, out_file in in_out_zip:
    body_words, summary_words = parse_cnn(in_file, nlp)
    body_idx = word_list_to_idx_list(body_words, w2i, v)
    body_idx = [str(x) for x in body_idx]
    summary_idx = word_list_to_idx_list(summary_words,w2i,v)
    summary_idx = [str(x) for x in summary_idx]
    out = ' '.join(body_idx)+"::"+' '.join(summary_idx)
    with open(out_file,'w') as f:
        f.write(out)
    cnt+=1
    if cnt%1000==0:
        print('%d files processed so far' %(cnt))

1000 files processed so far
2000 files processed so far
3000 files processed so far
4000 files processed so far
5000 files processed so far
6000 files processed so far
7000 files processed so far
8000 files processed so far
9000 files processed so far
10000 files processed so far
11000 files processed so far
12000 files processed so far
13000 files processed so far
14000 files processed so far
15000 files processed so far
16000 files processed so far
17000 files processed so far
18000 files processed so far
19000 files processed so far
20000 files processed so far
21000 files processed so far
22000 files processed so far
23000 files processed so far
24000 files processed so far
25000 files processed so far
26000 files processed so far
27000 files processed so far
28000 files processed so far
29000 files processed so far
30000 files processed so far
31000 files processed so far
32000 files processed so far
33000 files processed so far
34000 files processed so far
35000 files processed s

In [30]:
summary_words

[' ',
 'a',
 'south',
 'korean',
 'official',
 'says',
 'jill',
 'kelley',
 "'s",
 'use',
 'of',
 'her',
 'honorary',
 'title',
 'was',
 '"',
 'not',
 'suitable',
 '"',
 '.',
 ' ',
 'a',
 'new',
 'york',
 'businessman',
 'accused',
 'her',
 'of',
 'using',
 'that',
 'designation',
 'to',
 'solicit',
 'business',
 '.',
 ' ',
 'kelley',
 "'s",
 'complaint',
 'about',
 'harassing',
 'e',
 '-',
 'mails',
 'led',
 'to',
 'the',
 'resignation',
 'of',
 'cia',
 'chief',
 'david',
 'petraeus',
 '.']

In [25]:
summary_words

[' ',
 'tv',
 'personality',
 'star',
 'jones',
 'was',
 'diagnosed',
 'with',
 'heart',
 'disease',
 'in',
 '2010',
 '.',
 ' ',
 'heart',
 'disease',
 'is',
 'the',
 'leading',
 'cause',
 'of',
 'death',
 'for',
 'men',
 'and',
 'women',
 ',',
 'but',
 'it',
 "'s",
 'preventable',
 '.',
 ' ',
 'february',
 'is',
 'american',
 'heart',
 'month',
 ',',
 'and',
 'friday',
 'is',
 'national',
 'wear',
 'red',
 'day',
 '.']

In [8]:
tmp = []
for i,j in a:
    tmp.append(i)

In [None]:
with open(file_list[0]) as f:
    text = f.read()
    text = text.lower()
    text = text.replace('\n\n',' ')
    text = text.replace('(cnn)','')
    text = text.split("@highlight")
    body = text[0]
    body_tokens = nlp(body)
    summaries = text[1:]
    summary_tokens = nlp(' '.join([x.strip()+'.' for x in summaries])+'.')

In [None]:
w2i = dict()
w2i['<PAD>']=0
w2i['<S>']=1
w2i['</S>']=2

i2w = dict()
i2w[0]='<PAD>'
i2w[1]='<S>'
i2w[2]='</S>'

for i,word in enumerate(word2idx):
    if len(w2i)>50000:
        break
    w2i[word] = i+3
    i2w[i+3] = word

In [None]:
i2w

In [None]:
def nlp_to_tokens(token_list,word2idx):
    out = []
    oov2idx = dict()
    oov_idx = 0
    for token in token_list:
        word = token.text
        try:
            out.append(word2idx[word])
        except KeyError:
            if word not in oov2idx:
                oov_idx+=1
                oov2idx[word]=vocab_size+oov_idx
            out.append(oov2idx[word])
    return out, oov2idx

In [None]:
out, oov2idx = nlp_to_tokens(list(body_tokens),word2idx)

In [None]:
out

In [None]:
word2idx[l[1].text]

In [None]:
doc = nlp(body)
lst = list(doc)

In [None]:
words = list(set(lst))

In [None]:
out= []
oov_dict = dict()
for x in words:
    try:
        out.append(word2idx[x])
    except KeyError:
        oov_dict[x]

In [None]:
word2idx['oifdjherht']

In [None]:
c.most_common(300)[0][0]

In [None]:
import numpy as np
a = list(np.arange(32))

In [None]:
i=0
while (i<10):
    i+=1
    print(i)

In [None]:
word_list = []
i = 0
for file_name in file_list:
    with open(file_name) as f:
        text = f.read()
        text = text.lower()
        text = text.replace('\n\n',' ')
        text = text.replace('(cnn)','')
        text = text.split("@highlight")
        body = text[0]
        doc = list(nlp(body))
        word_list.extend([x.text for x in doc])
    if i%1000==0:
        print(i)
    i+=1

In [None]:
c = c + Counter(['a','b','a','b'])

In [None]:
c.most_common(100)

In [None]:
len(list(set(word_list)))

In [None]:
from torch import nn
import numpy as np
from torch.autograd import Variable
a = Variable(torch.LongTensor(np.arange(40).reshape(4,10)))
emb = nn.Embedding(40,20)

In [None]:
help(nn.LSTM)

In [None]:
lstm = nn.LSTM(hidden_size=100,input_size=20, batch_first=True)

In [None]:
A=np.arange(24).reshape(4,6)
A=A*(-1)
A=A+15
A = np.maximum(A,0)
A = Variable(torch.LongTensor(A))

In [None]:
B = A==0
B.float().data

In [None]:
c = Variable(torch.Tensor(1,4,100))
out=lstm(emb(a[:,0].unsqueeze(1)), (c,c))

In [None]:
out[0].size()

In [None]:
emb(a).size()

In [None]:
c = Counter(['a','a','a','a','a','b'])

In [None]:
import numpy as np
import spacy
import os
from collections import Counter
import torch
import glob
from spacy import attrs


word2idx = np.load('word2idx.npy').item()
vocab_size = len(word2idx)
batch_size = 1000

nlp = spacy.load('en') # loads default English object
cnn_dir = '/home/mjc/datasets/CNN_DailyMail/cnn/stories/'
cnn_pre_dir = '/home/mjc/datasets/CNN_DailyMail/cnn/preprocessed_stories/'

file_list = [os.path.join(cnn_dir,file) for file in os.listdir(cnn_dir)]
total_files = len(file_list)
files_read = 0
count = 0
for file in file_list[0:1]:
    with open(file) as f:
        text = f.read()
        print(text)
        text = text.lower()
        text = text.replace('\n\n',' ')
        text = text.replace('(cnn)','')
        text = text.split("@highlight")
        body = text[0]
        body_words = body.split(' ')
        summaries = ' . '.join(text[1:])+' .'
        summary_words = summaries.split(' ')
        unique_words = list(set(body_words+summary_words))
        temp_dict = dict()
        oovs = 0
        for w in unique_words:
            try:
                temp_dict[w] = word2idx[w]
            except KeyError:
                oovs+=1
                temp_dict[w] = oovs+vocab_size
        body_idx = [str(temp_dict[x]) for x in body_words]
        summary_idx = [str(temp_dict[x]) for x in summary_words]
        out = ' '.join(body_idx)+'::'+' '.join(summary_idx)
        out_file = file.replace('/stories/','/preprocessed_stories/')
    with open(out_file,'w') as f:
        f.write(out)
    count+=1
    if count%100==0:
        print(count)


# 		doc = nlp(text)


# counter = Counter()
# while (files_read<total_files):
#     word_list = []
#     batch_files = file_list[files_read:min(files_read+1000,total_files)]
#     for file_name in batch_files:
#         with open(file_name) as f:
#             text = f.read()
#             text = text.lower()
#             text = text.replace('\n\n',' ')
#             text = text.replace('(cnn)','')
#             text = text.split("@highlight")
#             body = text[0]
#             doc = list(nlp(body))
#             word_list.extend([x.text for x in doc])

#     counter = counter + Counter(word_list)
#     files_read+=len(batch_files)
#     print("%d files read so far..." % files_read)
#     word2idx = {tup[0]: i for i,tup in enumerate(counter.most_common(vocab_size))}
#     np.save('word2idx.npy',word2idx)
# print("All merged!")
# word2idx = {tup[0]: i for i,tup in enumerate(counter.most_common(vocab_size))}
# np.save('word2idx.npy',word2idx)