!pip install python-Levenshtein
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget
%cd ctcdecode
!pip install .
%cd ..

In [434]:
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import * # for pad_sequence and whatnot
from torch.utils.data import Dataset, DataLoader, TensorDataset

from torch.utils import data
from torchvision import transforms

import matplotlib.pyplot as plt
import time

import json

# True when PyTorch can see a CUDA device; the bare name below echoes the flag as the cell result.
cuda = torch.cuda.is_available()
cuda

False

!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget
%cd ctcdecode
!pip install .
%cd ..

### DATA PROCESSING

In [435]:
ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
NUMBERS = '0123456789'


class PreprocessedData(object):
    """Load JSON corpora and expose integer-encoded train/dev splits.

    Creating an instance immediately builds the character vocabulary from
    ``test_file_path`` and the encoded dataset from ``train_file_path``.
    Results are stored on ``VOCAB``, ``VOCAB_SIZE``, ``char2index``,
    ``train_data``, ``dev_data``, ``train_labels`` and ``dev_labels``.
    """

    def __init__(self, train_file_path, test_file_path):
        """
        train_file_path = list with paths of JSON files whose entries have
                          'text' and 'text-tags' keys
        test_file_path = list with paths of JSON files whose entries have a
                         'text' key (used only to build the vocabulary)
        """
        self.train_path = train_file_path
        self.test_path = test_file_path

        self.VOCAB = None
        self.VOCAB_SIZE = None

        # Dictionary to convert char to integers
        self.char2index = None

        # Dataset
        self.train_data = None
        self.dev_data = None
        self.train_labels = None
        self.dev_labels = None

        # Automatically run it when making an instance
        self.RUN_for_vocab()
        self.RUN_for_dataset()

    def get_file(self, path):
        """
        Read one JSON file. The original data is double encoded (a JSON
        string that itself contains JSON); a file holding plain JSON is
        returned as parsed instead of failing.
        """
        with open(path, encoding='utf-8') as f:
            payload = json.load(f)
        if isinstance(payload, str):
            return json.loads(payload)
        return payload

    def text_from_json(self, json_file):
        """
        Flatten a list of parsed files into one list of lower-cased sentences.
        """
        return [
            sentence.lower()
            for file in json_file
            for sample in file
            for sentence in sample['text']
        ]

    ################# VOCABULARY ##############

    def get_all_chars(self, text_list):
        """
        Return the sorted list of unique characters found in text_list.
        """
        return sorted({char for sentence in text_list for char in sentence})

    def get_vocabulary(self, json_files):
        """
        from test json file (includes elements in training), get all unique chars
        """
        return self.get_all_chars(self.text_from_json(json_files))

    def word_2_index(self, VOCAB):
        """
        Store on self.char2index a dict mapping each char to its position in VOCAB.
        """
        self.char2index = {char: idx for idx, char in enumerate(VOCAB)}

    def RUN_for_vocab(self):
        """
        Build VOCAB, VOCAB_SIZE and char2index from the files in self.test_path.
        """
        # 1) Get json for vocabulary (again, testpath includes train samples)
        train_and_test_samples = [self.get_file(path) for path in self.test_path]

        # 2) Get vocabulary
        self.VOCAB = self.get_vocabulary(train_and_test_samples)
        self.VOCAB_SIZE = len(self.VOCAB)

        # 3) Get dictionary
        self.word_2_index(self.VOCAB)

    ############## PROCESSING DATA ##############

    def remove_all_letters_in_text_tags_from_alphabet(self, alphabet, positive_tags, all_text):
        """
        Takes in an alphabet, a list of tags and a list of sentences. Returns the
        alphabet cropped to len(all_text) with every positive tag removed, i.e.
        the tags of the negative samples.

        Any empty positive_tags (list, tuple, string, ...) returns the cropped
        alphabet unchanged.
        """
        # 1) Get alphabet cropped to the length of sentences
        cropped_alphabet = alphabet[:len(all_text)]

        # 2) Remove each positive tag; with no tags the loop is a no-op, so the
        #    result is always bound.
        for tag in positive_tags:
            cropped_alphabet = cropped_alphabet.replace(tag, "")

        # 3) The result is the negative tags! :)
        return cropped_alphabet

    def get_all_text(self, files):
        """
        Parse the json files and output train_data (lists of lower-cased
        sentences) and a numpy array of one-hot labels for binary classification.

        Each entry contributes up to two samples, in this order: the sentences
        whose letter is NOT in 'text-tags' (label [0, 1]) and the sentences whose
        letter IS in 'text-tags' (label [1, 0]). Empty groups are skipped.
        """
        train_data = []
        train_labels = []

        for file in files:
            for entry in file:
                # elements from dictionary
                positive_tags = entry['text-tags']
                text_list = entry['text']

                # valid text: sentences addressed by the tag letters
                valid_text = [text_list[ALPHABET.index(letter)].lower()
                              for letter in positive_tags]

                # nonvalid text: every remaining sentence
                negative_tags = self.remove_all_letters_in_text_tags_from_alphabet(
                    ALPHABET, positive_tags, text_list)
                nonvalid_text = [text_list[ALPHABET.index(letter)].lower()
                                 for letter in negative_tags]

                # NOTE(review): the original code named [0, 1] "pos_label" yet
                # attached it to the untagged group; the label values are kept
                # exactly as they were - confirm the intended polarity.
                if nonvalid_text:
                    train_data.append(nonvalid_text)
                    train_labels.append(np.array([0, 1]))

                if valid_text:
                    train_data.append(valid_text)
                    train_labels.append(np.array([1, 0]))

        return train_data, np.array(train_labels)

    def convert_text_to_int_array(self, text, dic):
        """
        Convert text dataset to int arrays.

        Returns a 1-D object array with one integer array per sample, so its
        length always equals the number of samples (and of labels), whether or
        not the sequences share a length.

        NOTE(review): as in the original implementation, only the LAST sentence
        of each sample is encoded; earlier sentences are dropped. This keeps the
        data aligned with the one-label-per-sample output of get_all_text -
        confirm whether every sentence was meant to be used.
        """
        encoded = []
        for sample in text:
            sentence = sample[-1] if len(sample) else ""
            encoded.append(np.fromiter((dic[char] for char in sentence),
                                       dtype=int, count=len(sentence)))

        # Fill an object array explicitly: np.array() on ragged sequences
        # raises on recent NumPy and collapses equal-length ones into 2-D.
        all_ints = np.empty(len(encoded), dtype=object)
        for idx, sent_array in enumerate(encoded):
            all_ints[idx] = sent_array
        return all_ints

    def partition_data(self, data_set, label_set, train_percentage):
        """
        Split data and labels into a leading train part and a trailing dev part.
        train_percentage is the fraction of samples (0..1) kept for training.
        """
        # Count samples, not array elements, so a 2-D input splits correctly too.
        train_len = int(train_percentage * len(data_set))

        # train
        train_set = data_set[:train_len]
        train_labels = label_set[:train_len]

        # development
        dev_set = data_set[train_len:]
        dev_labels = label_set[train_len:]

        return train_set, dev_set, train_labels, dev_labels

    def RUN_for_dataset(self):
        """
        Load every training file, encode it and store an 80/20 train/dev split.
        """
        # list with all training data from different sections
        train_raw = [self.get_file(path) for path in self.train_path]

        raw_dataset, labels_dataset = self.get_all_text(train_raw)
        data_set = self.convert_text_to_int_array(raw_dataset, self.char2index)
        self.train_data, self.dev_data, self.train_labels, self.dev_labels = self.partition_data(data_set, labels_dataset, .8)

In [436]:
# Build the vocabulary and the encoded train/dev splits: the "-tagged" files
# are passed as training input, the untagged ones as vocabulary input.
dataset = PreprocessedData(
    train_file_path=[
        "./data/architecture_dz-cleaned-tagged.json",
        "./data/design_dz-cleaned-tagged.json",
        "./data/technology_dz-cleaned-tagged.json",
    ],
    test_file_path=[
        "./data/architecture_dz-cleaned.json",
        "./data/design_dz-cleaned.json",
        "./data/technology_dz-cleaned.json",
    ],
)

In [437]:
# Show the shapes of the train and dev splits as the cell result.
dataset.train_data.shape, dataset.dev_data.shape

((1782,), (446,))

### TextDataset and Collate Functions

#### Train and Development

#### Chris' code

In [438]:
class TextDataset(Dataset):
    """Index-aligned pairing of samples and labels for use with a DataLoader."""

    def __init__(self, data, labels):
        # The length is captured once, from the samples, at construction time.
        self.data, self.labels = data, labels
        self.length = len(data)

    def __len__(self):
        """Return the number of samples recorded at construction."""
        return self.length

    def __getitem__(self, i):
        """Return the ``(sample, label)`` pair stored at position ``i``."""
        return self.data[i], self.labels[i]

def collate(seq_list):
    """Batch a list of ``(sequence, label)`` pairs for the DataLoader.

    Args:
        seq_list: list of ``(sequence, label)`` pairs, where ``sequence`` is a
            1-D array of integer character ids and ``label`` is an iterable of
            integers (e.g. a one-hot vector).

    Returns:
        x: zero-padded sequences in time-major layout ``(max_len, batch)``,
            the default layout of ``pad_sequence``.
        lengths: LongTensor holding the unpadded length of every sequence.
        targets: integer tensor ``(batch, label_width)`` with every label
            component copied through unchanged.

    All tensors are left on the CPU: the training/evaluation loops move the
    batch to the target device, and creating CUDA tensors here conflicts with
    ``pin_memory=True`` and DataLoader worker processes.
    """
    sequences = [torch.tensor(seq) for seq, _ in seq_list]
    x = pad_sequence(sequences)
    lengths = torch.LongTensor([len(seq) for seq in sequences])

    # Copy the whole label; the previous version repeated component 0 twice,
    # which turned every one-hot target into [0, 0] or [1, 1].
    targets = torch.tensor([[int(value) for value in label] for _, label in seq_list])

    return x, lengths, targets

### Definition of RNN

In [439]:
class classifier(nn.Module):
    """Character-level LSTM classifier: embedding -> LSTM -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each character embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout between LSTM layers (only applied when n_layers > 1).
    """

    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super(classifier, self).__init__()

        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm layer
        # Inter-layer dropout has no effect on a single layer and only makes
        # nn.LSTM emit a warning, so it is disabled in that case.
        # batch_first is kept for parity with the original; forward() feeds a
        # PackedSequence built from time-major input.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        #dense layer: one hidden vector per direction is fed to it
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        #activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid scores of shape ``(batch, output_dim)``.

        Args:
            text: integer tensor ``(sent_len, batch)`` - time-major, because
                the sequence is packed without ``batch_first``.
            text_lengths: unpadded length of every sequence (tensor or list).
        """
        #text = [sent_len, batch size]
        embedded = self.embedding(text)
        #embedded = [sent_len, batch size, emb dim]

        #packed sequence; pack_padded_sequence needs the lengths on the CPU
        lengths = torch.as_tensor(text_lengths).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)

        _, (hidden, _) = self.lstm(packed_embedded)
        #hidden = [num layers * num directions, batch size, hid dim]

        if self.lstm.bidirectional:
            #concat the final forward and backward hidden state of the top layer
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            #single direction: the top layer's final hidden state is enough
            hidden = hidden[-1, :, :]
        #hidden = [batch size, hid dim * num directions]

        dense_outputs = self.fc(hidden)

        #Final activation function
        return self.act(dense_outputs)



### MODEL SETUP 

In [440]:
# ---- Device -----------------------------------------------------------------
DEVICE = torch.device("cuda" if cuda else "cpu")
print(DEVICE)
num_workers = 8 if cuda else 0

# ---- Hyper-parameters ---------------------------------------------------------
batch_size_gpu = 64
batch_size_cpu = 64

size_of_vocab = dataset.VOCAB_SIZE
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 2
num_layers = 2
bidirection = True
dropout = 0.2
nepochs = 20
lr = 1e-4

# ---- Training data (shuffled) ---------------------------------------------------
train_dataset = TextDataset(dataset.train_data, dataset.train_labels)

if cuda:
    train_loader_args = dict(shuffle=True, batch_size=batch_size_gpu,
                             num_workers=num_workers, pin_memory=True,
                             collate_fn=collate)
else:
    train_loader_args = dict(shuffle=True, batch_size=batch_size_cpu,
                             collate_fn=collate)
train_loader = data.DataLoader(train_dataset, **train_loader_args)

# ---- Development data (fixed order) -------------------------------------------
dev_dataset = TextDataset(dataset.dev_data, dataset.dev_labels)

if cuda:
    dev_loader_args = dict(shuffle=False, batch_size=batch_size_gpu,
                           num_workers=num_workers, pin_memory=True,
                           collate_fn=collate)
else:
    dev_loader_args = dict(shuffle=False, collate_fn=collate,
                           batch_size=batch_size_cpu)
dev_loader = data.DataLoader(dev_dataset, **dev_loader_args)

# ---- Model, loss and optimizer -------------------------------------------------
model = classifier(
    size_of_vocab,
    embedding_dim,
    num_hidden_nodes,
    num_output_nodes,
    num_layers,
    bidirectional=bidirection,
    dropout=dropout,
)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

cpu


In [461]:
def binary_accuracy(outs, target):
    """Return the fraction of rows whose arg-max column matches between ``outs`` and ``target``.

    Both arguments are 2-D tensors with one row per sample; the position of
    the largest value in each row is taken as the class of that row.
    """
    predicted_class = outs.argmax(dim=1)
    expected_class = target.argmax(dim=1)
    hits = (predicted_class == expected_class).sum().item()
    return hits / len(expected_class)

In [462]:
def train_lstm(loader, model, criterion, optimizer):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Args:
        loader: iterable yielding ``(data, lengths, target)`` batches.
        model: module called as ``model(data, lengths)``.
        criterion: loss called as ``criterion(outputs, target)``.
        optimizer: optimizer stepping the model parameters.

    Both returned values are averages over batches (each batch weighs the
    same, whatever its size). An empty loader returns ``(0.0, 0.0)``.
    """
    # Place model into mode and onto correct device
    start_time = time.time()
    model.train()
    model.to(DEVICE)

    running_loss = 0.0
    running_acc = 0.0

    for (batch, lengths, target) in loader:
        # Zero gradients
        optimizer.zero_grad()

        # Use correct types for data. The lengths are deliberately NOT moved
        # to DEVICE: pack_padded_sequence requires them on the CPU.
        batch = batch.to(DEVICE).long()
        target = target.to(DEVICE).float()

        # Get model outputs
        outputs = model(batch, lengths)

        # Calculate loss
        loss = criterion(outputs, target)
        running_loss += loss.item()

        running_acc += binary_accuracy(outputs, target)

        # Compute gradients and take step
        loss.backward()
        optimizer.step()

    # Guard against an empty loader instead of dividing by zero
    num_batches = max(len(loader), 1)
    running_loss /= num_batches
    running_acc /= num_batches
    end_time = time.time()
    print('Time: ',end_time - start_time, 's')
    return running_loss, running_acc

In [463]:
def test_lstm(loader, model, criterion):
    """Evaluate the model without gradients and return ``(mean_loss, mean_accuracy)``.

    Args:
        loader: iterable yielding ``(data, lengths, target)`` batches.
        model: module called as ``model(data, lengths)``.
        criterion: loss called as ``criterion(outputs, target)``.

    Both returned values are averages over batches (each batch weighs the
    same, whatever its size). An empty loader returns ``(0.0, 0.0)``.
    """
    running_loss = 0.0
    running_acc = 0.0

    with torch.no_grad():
        # Place into eval mode
        model.eval()
        model.to(DEVICE)

        for (batch, lengths, target) in loader:
            # Use correct types for data. The lengths are deliberately NOT
            # moved to DEVICE: pack_padded_sequence requires them on the CPU.
            batch = batch.to(DEVICE).long()
            target = target.to(DEVICE).float()

            # Get model outputs
            outputs = model(batch, lengths)

            # Calculate loss
            loss = criterion(outputs, target)
            running_loss += loss.item()

            running_acc += binary_accuracy(outputs, target)

    # Guard against an empty loader instead of dividing by zero
    num_batches = max(len(loader), 1)
    running_loss /= num_batches
    running_acc /= num_batches

    return running_loss, running_acc

In [464]:
def make_graph(epochs, train, test, train_name, val_name, name_long, name_short):
    """Plot a training curve and a validation curve against the epochs and show the figure.

    Args:
        epochs: x values shared by both curves.
        train: y values of the training curve.
        test: y values of the validation curve.
        train_name: legend label of the training curve.
        val_name: legend label of the validation curve.
        name_long: plot title.
        name_short: y-axis label.
    """
    # The colour is given once, by keyword. The old fmt strings ('g' / 'b')
    # named a second, conflicting colour that the keyword overrode anyway.
    plt.plot(epochs, train, label=train_name, c="mediumvioletred")
    plt.plot(epochs, test, label=val_name, c="darkturquoise")
    plt.title(name_long)
    plt.xlabel('Epochs')
    plt.ylabel(name_short)
    plt.legend()
    plt.show()
    

In [465]:
def run_epochs(model, optimizer, loader_t, loader_d, criterion, n_epochs):
    """Train and evaluate for ``n_epochs`` epochs, plotting and checkpointing after each one.

    Args:
        model: network to train.
        optimizer: optimizer stepping the model parameters.
        loader_t: loader used for training.
        loader_d: loader used for evaluation.
        criterion: loss function.
        n_epochs: number of epochs to run.

    Returns:
        ``(train_losses, train_accs, test_losses, test_accs)`` - four lists
        holding one value per epoch.

    After every epoch the accuracy and loss curves are drawn and the model
    weights are written to ``./saved_models/v4_<epoch>.pth``.
    """
    import os  # local import: only needed to create the checkpoint folder

    train_losses, train_accs = [], []
    test_losses, test_accs = [], []
    epochs = []

    # torch.save does not create missing directories
    os.makedirs("./saved_models", exist_ok=True)

    for e in range(n_epochs):
        print('----- EPOCH ------- \n', e)

        # Train (the helpers in this file are train_lstm / test_lstm; the
        # "*_Chris" names previously called here are not defined)
        train_loss, train_acc = train_lstm(loader_t, model, criterion, optimizer)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        # Test
        test_loss, test_acc = test_lstm(loader_d, model, criterion)
        test_losses.append(test_loss)
        test_accs.append(test_acc)
        # Epochs
        epochs.append(e)
        if e % 2 == 0 and e != 0:

            print('Training Loss: ', train_loss)
            print('Training Accuracy: ', train_acc)

        print("Train losses:\n{}\nTrain Accs:\n{}\nTest losses:\n{}\nTest accs:\n{}\n".format(train_losses,
                                                                                             train_accs,
                                                                                             test_losses,
                                                                                             test_accs))
        make_graph(epochs, train_accs, test_accs, 'Training Acc', 'Testing Acc',
                   'Training and Testing Accuracy', 'Accuracy')
        make_graph(epochs, train_losses, test_losses, 'Training loss', 'Testing loss',
                   'Training and Testing loss', 'Loss')

        # save model
        torch.save(model.state_dict(), "./saved_models/v4_{}.pth".format(e))

    return train_losses, train_accs, test_losses, test_accs

In [466]:
# Resume from a saved checkpoint; map_location remaps the stored tensors onto DEVICE.
path_to_load = './saved_models/v3_7.pth'
checkpoint_state = torch.load(path_to_load, map_location=DEVICE)
model.load_state_dict(checkpoint_state)

<All keys matched successfully>

In [None]:
# Launch training and keep the per-epoch loss/accuracy histories.
train_losses, train_accs, test_losses, test_accs = run_epochs(
    model=model,
    optimizer=optimizer,
    loader_t=train_loader,
    loader_d=dev_loader,
    criterion=criterion,
    n_epochs=nepochs,
)

----- EPOCH ------- 
 0
