In [16]:
import torch
import torch.nn as nn
import numpy as np
# from skimage import io, transform
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import unicodedata # ??
import nltk
from nltk.tokenize import TweetTokenizer
import csv
import json
from torchvision import transforms
from torch.autograd import Variable
np.random.seed(1)
random.seed(1)


# define directory structure needed for data processing
RAW_DATA_DIR = os.path.join('..', 'data/', 'raw_data/')
FORMAL_DATA_DIR = os.path.join('..', 'data/', 'formal_data/')
UNKNOWN_TOKEN = "unk"

## Split data into `train`, `val` and `test`
Split and write raw data as `acsii` format. 

In [18]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
def normalizeString(s):
    s = unicodeToAscii(s.strip()).replace("\t", " ")
    return s


# precondition: two fields with name: "headline" and "text"
def splitData(fname, test_size=0.2, val_size=0.2): 
    df = pd.read_csv('../data/' + fname, encoding='latin-1')
    df = df[["headlines", "text"]] # summary text, not the entire article
    df["headlines"] = df["headlines"].apply(normalizeString)
    df["text"] = df["text"].apply(normalizeString)

    df = df.sample(frac=1).reset_index(drop=True) # shuffle data

    df_train, df_test = train_test_split(df, test_size=test_size, random_state=1)
    df_train, df_val = train_test_split(df_train, test_size=val_size, random_state=1)
    df_train.to_csv(RAW_DATA_DIR + "train.csv", index=False, sep="\t", header=False)
    df_val.to_csv(RAW_DATA_DIR + "val.csv", index=False, sep="\t", header=False)
    df_test.to_csv(RAW_DATA_DIR + "test.csv", index=False, sep="\t", header=False)
    
# # TODO: check whether "text" is in fact the summary and corresponds to the headline

splitData("news_summary.csv")

ModuleNotFoundError: No module named 'pandas.io.formats.csvs'

## Preprocess `raw_data` to `formal_data`

In [3]:
# class to store string transformation
class Transform(object):
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    @staticmethod
    def word_tokenize(s): 
        return json.dumps(Transform.tokenizer.tokenize(s))
    
    @staticmethod
    def cap(s): 
        return s.upper()
    
# tf: a transformation apply to each individual headline and text
# Apply a transformation to the dataframe in "raw_data" 
# Results of the transformation is in "formal_data"
def preprocess(fname, tf, chunksize=1000, num_chunk=-1): 
    header = True
    mode = "w"
    i = 0
    for df in pd.read_csv(RAW_DATA_DIR + fname, sep="\t", header = 0, chunksize=chunksize):
        df["headlines"] = df["headlines"].apply(tf)
        df["text"] = df["text"].apply(tf)
        df.to_csv(FORMAL_DATA_DIR+fname, columns = ['headlines','text'], mode=mode,index=False, header=header, sep="\t")
        if header == True:  
            # no header, and write in append mode from the 2nd chunk
            header = False; 
            mode = "a"
        
        i = i + 1 if i >= 0 else -1
        if i == num_chunk: 
            break; 
    

# preprocess("train.csv", tf=Transform.word_tokenize)
# preprocess("val.csv", tf=Transform.word_tokenize)
# preprocess("test.csv", tf=Transform.word_tokenize)

## Language model from train data

In [4]:
class GloVe():
    def __init__(self, path, dim):
        self.dim = dim
        self.word_embedding_dict = {}
        with open(path) as f:
            for line in f:
                values = line.split()
                embedding = values[-dim:]
                word = ''.join(values[:-dim])
                self.word_embedding_dict[word] = np.asarray(embedding, dtype=np.float32)
    
    def get_word_vector(self, word):
        if word not in self.word_embedding_dict.keys():
            embedding = np.random.uniform(low=-1, high=1, size=self.dim).astype(np.float32)
            self.word_embedding_dict[word] = embedding
            return embedding
        else:
            return self.word_embedding_dict[word]

In [5]:
glvmodel = GloVe(os.path.join('..', 'models', 'glove.twitter.27B.200d.txt'), dim=EMBEDDING_DIM)

In [6]:

# Modified from: Sean Robertson <https://github.com/spro/practical-pytorch>
class Lang: # language model
    def __init__(self, glvmodel, fname=None):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.gloveEmbed = []
        self.glvmodel = glvmodel
        self.size = 0  
        self.embed_dim = EMBEDDING_DIM
        
        self.addWord("<PAD>")
        self.addWord("<SOS>")
        self.addWord("<EOS>")
        
        self.addWord(UNKNOWN_TOKEN)
        if fname is not None: 
            self.addCSV(fname)
        
    # add words of a sentence into the language model
    # split by ' '
    def addSentence(self, sentence):
        for word in sentence: 
            self.addWord(word)
            
    def addGlove(self, glove): 
        self.glove = glove

    def getGloveLayer(self): 
        # Initialize word embeddings from our pre-training GloVe embeddings
        glove_embeddings = torch.from_numpy(np.asarray(self.gloveEmbed))
        return nn.Embedding(self.size, self.embed_dim).from_pretrained(glove_embeddings, freeze=False)
        
    # add a word to language model
    def addWord(self, word):
        if word not in self.word2index:
            # assign id for word
            self.word2index[word] = self.size
            
            # count word
            self.word2count[word] = 1
            
            self.index2word[self.size] = word
            self.size += 1
            
            self.gloveEmbed.append(glvmodel.get_word_vector(word))
        else:
            self.word2count[word] += 1
            
    def addDataFrame(self, df): 
        # df: headline, text
        for index, row in df.iterrows():
            self.addSentence(json.loads(row['headlines']))
            self.addSentence(json.loads(row['text']))
            
    def addCSV(self, fname): 
        # construct language model based on a file
        # file is a dataframe csv file with "headlines" and "text"
        for df in pd.read_csv(FORMAL_DATA_DIR + fname, sep="\t", header = 0, chunksize=2000):
            self.addDataFrame(df)     
    
    def size(self): 
        return len(self.word2index)
    
    def wordSeq2IdxSeq(self, word_seqs):
        default_idx = lang.word2index[UNKNOWN_TOKEN]
        idxs = []
        for word_seq in word_seqs:
            idxs.append([lang.word2index.get(w, default_idx) for w in word_seq])
#             idxs.append(torch.LongTensor([lang.word2index.get(w, default_idx) for w in word_seq]))
        return idxs

lang = Lang([])
lang.addCSV("train.csv")

## Neural Network model

In [7]:
def init_param(params, bias_std=0.0, weight_std=0.05): 
    for name, param in params:
        if 'bias' in name:
            nn.init.constant_(param, bias_std)
        elif 'weight' in name:
            nn.init.normal_(param, std = weight_std)
            
# class EncoderRNN(nn.Module):
#     def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
#         super(EncoderRNN, self).__init__()
        
#         self.input_size = input_size
#         self.hidden_size = hidden_size
#         self.n_layers = n_layers
#         self.dropout = dropout
        
#         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)
        
#     def forward(self, input_seqs, input_lengths, hidden=None):
#         # Note: we run this all at once (over multiple batches of multiple sequences)
#         embedded = self.embedding(input_seqs)
#         packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
#         outputs, hidden = self.gru(packed, hidden)
#         outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs) # unpack (back to padded)
#         outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs
#         return outputs, hidden
    
class Encoder(nn.Module):
    def __init__(self, hidden_dim, batch_size, num_layers, lang):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.word_embeddings = lang.getGloveLayer()
        
        # Initialize a Gated Recurrent Unit RNN
        self.gru = nn.GRU(input_size=lang.embed_dim, hidden_size=hidden_dim, num_layers=num_layers)
        
#         self.hidden2label = nn.Linear(hidden_dim, n_classes)
        self.hidden = self.init_hidden()
        
        # Custom initialization of the weights and biases
        init_param(self.gru.named_parameters())
#         init_param(self.hidden2label.named_parameters())


    def init_hidden(self):
        return Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_dim).cuda())

    
    def forward(self, input_seqs):
        
        embedded = self.word_embeddings(input_seqs)
        packed_embeds = nn.utils.rnn.PackedSequence(input_embed, packed_sequence.batch_sizes)
        output, hidden = self.gru(packed_embeds, self.hidden)
        return output, hidden
    
#     def forward(self, input_seqs, input_lengths, hidden=None):
#         # Note: we run this all at once (over multiple batches of multiple sequences)
#         embedded = self.word_embeddings(input_seqs)
#         packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
#         outputs, hidden = self.gru(packed, hidden)
#         outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs) # unpack (back to padded)
#         outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs
#         return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        
        self.gru = nn.GRU(hidden_size, hidden_size)
        
        # map to output space
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1) # ? what is LogSoftMax, dim = 1??

    def forward(self, input, hidden):
        # ? why the input is is 'output_size' dimension? 
        # ? why need embedding? 
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    
HIDDEN_DIM = 128
N_EPOCHS = 5
BATCH_SIZE = 4
NUM_LAYERS = 1
encoder = Encoder(HIDDEN_DIM, BATCH_SIZE, NUM_LAYERS, lang)


## Prepare Data
### Dataloader

In [8]:
class SummaryDataset(Dataset):
    def __init__(self, fname, transform=None):
        self.df = pd.read_csv(FORMAL_DATA_DIR + fname, sep="\t", header = 0)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        headline = self.df["headlines"][idx]
        text = self.df["text"][idx]
        
        sample = {'headlines': headline, 'text': text}

        if self.transform:
            sample = self.transform(sample)

        return sample

dataset = SummaryDataset(fname="train.csv", transform=None)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

### Padding

In [9]:
# Pad a with the PAD symbol
def pad_seq(seq, max_length):
    seq += [PAD_token for i in range(max_length - len(seq))]
    return seq

"""
Create a batch ready for feeding into the network. 
Paddings are added to ensure all sequences are the same length. 
Precondition: 
    input_seqs: a list of sequences; each element is a sequencce which is a list of words
    target_seqs: same. 
    lang: the language model
Postcondition: 
    indices version of the input and target in tensor form. 
"""
def batch(input_seqs, target_seqs, lang):
    # input_seqs and target_seqs are in string format
    
    input_seqs = lang.wordSeq2IdxSeq(input_seqs)
    target_seqs = lang.wordSeq2IdxSeq(target_seqs)
    
    # Zip into pairs, sort by length (descending), unzip
    seq_pairs = sorted(zip(input_seqs, target_seqs), key=lambda p: len(p[0]), reverse=True)
    input_seqs, target_seqs = zip(*seq_pairs)
    
    # For input and target sequences, get array of lengths and pad with 0s to max length
    input_lengths = [len(s) for s in input_seqs]
    input_padded = [pad_seq(s, max(input_lengths)) for s in input_seqs]
    target_lengths = [len(s) for s in target_seqs]
    target_padded = [pad_seq(s, max(target_lengths)) for s in target_seqs]
    
    # Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
    input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    target_var = Variable(torch.LongTensor(target_padded)).transpose(0, 1)
    
#     input_var = input_var.cuda()
#     target_var = target_var.cuda()
    
    return input_seqs, input_lengths, target_seqs, target_lengths

## Training start

In [10]:
for i_batch, sample_batched in enumerate(dataloader):
    # clear gradients, clear hidden state from the last timestep
#     gru_model.zero_grad()
#     gru_model.hidden = gru_model.init_hidden()

    input_seqs = sample_batched["text"]
    target_seqs = sample_batched["headlines"]
    
    input_seqs = [json.loads(s) for s in input_seqs]
    target_seqs = [json.loads(s) for s in target_seqs]
    
    input_seqs, target_seqs = batch(input_seqs, target_seqs, lang)
    
    input_seqs = nn.utils.rnn.pack_sequence(input_seqs) # .cuda()
    target_seqs = nn.utils.rnn.pack_sequence(target_seqs) #.cuda(); 

    

NameError: name 'PAD_token' is not defined

In [None]:



sb = next(iter(dataloader))
input_seqs = sb["text"]
target_seqs = sb["headlines"]
input_seqs = [json.loads(s) for s in input_seqs]
target_seqs = [json.loads(s) for s in target_seqs]
input_seqs, input_lengths, target_seqs, target_lengths = batch(input_seqs, target_seqs, lang)


for i_batch, sample_batched in enumerate(dataloader):
    # clear gradients, clear hidden state from the last timestep
#     gru_model.zero_grad()
#     gru_model.hidden = gru_model.init_hidden()

    input_seqs = sample_batched["text"]
    target_seqs = sample_batched["headlines"]
    
    input_seqs = [json.loads(s) for s in input_seqs]
    target_seqs = [json.loads(s) for s in target_seqs]
    
    input_seqs, input_lengths, target_seqs, target_lengths = batch(input_seqs, target_seqs, lang)
    
    input_seqs = nn.utils.rnn.pack_sequence(input_seqs) # .cuda()
    target_seqs = nn.utils.rnn.pack_sequence(target_seqs) #.cuda()

#     # forward pass
#     label_scores = gru_model(tweet_in)

#     # compute loss against true labels
#     loss = loss_function(label_scores, target)

#     # backprop the gradients and update the model parameters
#     loss.backward()
#     optimizer.step()

#     # keep track of the loss
#     running_loss += loss.item()
#     i += BATCH_SIZE
#     if i % 2000 == 0:
#         average_loss = running_loss/2000
#         if average_loss < lowest_loss:
#             lowest_loss = running_loss
#             # save our checkpoint if it is the current best
#             torch.save(gru_model.state_dict(), CHECKPOINT_FILE)
#         logging.info("running loss: %.3f @ batch %d", average_loss, batch_ind)
#         running_loss = 0.0

In [None]:
print(target_seqs)

In [None]:
# tf = transforms.Compose([ToTensor(lang)])
tf = None
dataset = SummaryDataset(fname="train.csv", transform=tf)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)


sb = next(iter(dataloader))
input_seqs = sb["text"]
target_seqs = sb["headlines"]
input_seqs = [json.loads(s) for s in input_seqs]
target_seqs = [json.loads(s) for s in target_seqs]
input_seqs, input_lengths, target_seqs, target_lengths = batch(input_seqs, target_seqs, lang)

In [None]:

def train(input_batches, input_lengths, target_batches, target_lengths, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Run words through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

    max_target_length = max(target_lengths)
    all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))

    # Move new Variables to CUDA
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()

    # Run through decoder one time step at a time
    for t in range(max_target_length):
        decoder_output, decoder_hidden, decoder_attn = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )

        all_decoder_outputs[t] = decoder_output
        decoder_input = target_batches[t] # Next input is current target

    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        target_batches.transpose(0, 1).contiguous(), # -> batch x seq
        target_lengths
    )
    loss.backward()
    
    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0], ec, dc

In [None]:
DF = []
for df in pd.read_csv(RAW_DATA_DIR + 'train.csv', sep="\t", header = 0, chunksize=10):
    DF  = df
    break