In [41]:

import pickle
import importlib
import normalizerFunctions 
normalizerFunctions = importlib.reload(normalizerFunctions)
from normalizerFunctions import Training_Corpus, Classifier
import torch
import torch.nn as nn
import numpy as np
from torch.utils import data
from pytorch_pretrained_bert import BertTokenizer
from tqdm import tqdm_notebook as tqdm
from Normalization_Dataset import Normalization
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from Testing_Dataset import Testing

In [3]:
def pad_test_sentence(batch):
    '''Collate test samples into one batch, right-padding the id lists with 0.

    Samples are read positionally: ``sample[0]`` is passed through as the words,
    ``sample[1]`` is the id list that gets padded with 0 (``<pad>``) up to the
    largest sequence length in the batch, ``sample[2]`` is passed through as the
    head flags and ``sample[-1]`` is the sequence length.

    Returns:
        ``(words, ids, is_heads, seqlens)`` where ``ids`` is a
        ``torch.LongTensor`` and the other three are per-sample lists.
    '''
    words, is_heads, seqlens = [], [], []
    for sample in batch:
        words.append(sample[0])
        is_heads.append(sample[2])
        seqlens.append(sample[-1])

    # The padding target is the largest length reported by the samples.
    longest = np.array(seqlens).max()
    padded_ids = []
    for sample in batch:
        ids = sample[1]
        padded_ids.append(ids + [0] * (longest - len(ids)))  # 0: <pad>
    return words, torch.LongTensor(padded_ids), is_heads, seqlens

In [4]:

def get_normalizations(model, iterable, idx2label):
    '''Run ``model`` over ``iterable`` and map its predictions back to label strings.

    Args:
        model: called as ``model(x, y)``; must return a 3-tuple whose last element
            holds the predicted label ids. It is switched to eval mode first.
        iterable: yields ``(lines, x, is_heads, seqlens)`` batches, i.e. the tuple
            layout returned by ``pad_test_sentence``.
        idx2label: mapping from predicted id to label string. It is only read:
            ids missing from it are rendered as ``'<pad>'`` without being added
            to the mapping.

    Returns:
        A list with one entry per batch. Each entry is a list with one entry per
        line, holding the labels predicted at that line's head positions with the
        first and last label stripped (presumably the sentence boundary markers
        added by the dataset -- confirm against ``Testing``).

    Raises:
        AssertionError: if a line's number of head predictions differs from its
            number of words.
    '''
    normalized = []
    model.eval()
    # Inference only, so no autograd bookkeeping is needed for any batch.
    with torch.no_grad():
        for b_lines, x, b_is_heads, _seqlens in iterable:
            # The model signature wants targets; per the original note the
            # predictions do not depend on their values, so zeros are passed.
            y = torch.zeros_like(x, dtype=torch.long)
            _, _, b_predictions = model(x, y)
            b_predictions = b_predictions.detach().cpu().numpy()

            batch_norms = []
            for line, line_preds, is_heads in zip(b_lines, b_predictions, b_is_heads):
                # is_heads is unpadded, so zip also drops the padded tail of line_preds.
                preds = [
                    idx2label.get(pred, '<pad>')
                    for head, pred in zip(is_heads, line_preds)
                    if head == 1
                ]
                assert len(preds) == len(line)
                batch_norms.append(preds[1:-1])
            normalized.append(batch_norms)
    return normalized

def normalize_song_corpus(model, corpus_texts, idx2label, batch_size=512):
    '''Normalize every song of a corpus with the classifier, one song at a time.

    Args:
        model: classifier forwarded to ``get_normalizations``.
        corpus_texts: sequence of songs, each a sequence of lines. All songs are
            wrapped in a single ``Testing`` dataset and a song's lines are
            addressed through the running offset of the preceding songs' lengths
            (assumes ``Testing`` indexes lines in corpus order -- TODO confirm).
        idx2label: id-to-label mapping forwarded to ``get_normalizations``.
        batch_size: number of lines per inference batch.

    Returns:
        ``(normalized_songs, zipped)``. ``normalized_songs[i]`` is the unmodified
        output of ``get_normalizations`` for song ``i`` (a list of batches, each
        a list of per-line label lists). ``zipped[i]`` is a list of
        ``(line, normalized_line)`` pairs for song ``i``.
    '''
    corpus_dataset = Testing(corpus_texts)
    normalized_songs = []
    zipped = []
    offset = 0
    for song in corpus_texts:
        song_len = len(song)
        song_subset = data.Subset(corpus_dataset, range(offset, offset + song_len))
        song_iter = DataLoader(dataset=song_subset,
                               batch_size=batch_size,
                               shuffle=False,  # keep the lines in song order
                               num_workers=0,
                               collate_fn=pad_test_sentence)
        normalized_by_classifier = get_normalizations(model, song_iter, idx2label)
        normalized_songs.append(normalized_by_classifier)
        # get_normalizations groups its output by batch. Flatten it so every
        # song line is paired with its own normalization rather than with a
        # whole batch, and materialise the pairs so they can be read repeatedly.
        normalized_lines = [line for batch in normalized_by_classifier for line in batch]
        zipped.append(list(zip(song, normalized_lines)))
        offset += song_len
    return normalized_songs, zipped

In [5]:
# Pick the best available accelerator: CUDA first (a later cell checks
# torch.cuda.device_count() to spread the model over several GPUs), then
# Apple's MPS backend, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(torch.backends.mps.is_available())

True


In [51]:
# Load the two pickled corpus objects used for evaluation.
# NOTE: pickle.load executes arbitrary code from the file, so only open pickles
# you produced yourself. Unpickling also needs the classes of the stored
# objects to be importable (presumably those imported from normalizerFunctions).
with open('pickled_archimob.pkl', 'rb') as archimob_file:
    archimob_corpus = pickle.load(archimob_file)
with open('pickled_train_corpus.pkl', 'rb') as train_file:
    joined_corpus = pickle.load(train_file)

In [None]:
# Restore the trained token classifier; torch.load returns the whole module
# here (it is used directly as the model). Loading onto the CPU first keeps
# this working when the checkpoint was saved on an accelerator that is not
# present on this machine; the model is moved to `device` right afterwards.
# NOTE(review): torch.load unpickles arbitrary objects -- only load trusted
# files. Recent PyTorch releases default to weights_only=True, which rejects
# whole-model pickles; pass weights_only=False there if the file is trusted.
model = torch.load('token_classifier_archimob.pt', map_location='cpu')
model.to(device)
# nn.DataParallel distributes work across CUDA devices only, so wrap the model
# only when it actually lives on one. After wrapping, custom attributes such as
# idx2label are reachable through model.module, not on the wrapper itself.
if device.type == 'cuda' and torch.cuda.device_count() > 1:
    print(True)
    model = nn.DataParallel(model)
# ignore_index=0: targets equal to 0 do not contribute to the loss
# (presumably the <pad> label -- confirm against the training notebook).
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [13]:
# group utterances into lists of an arbitrary length so they are the same format as our songs
def group_utterances(word_norm_pairs, groupsize=10):
    '''Split ``word_norm_pairs`` into consecutive lists of ``groupsize`` items.

    Order is preserved. The last group holds the remainder and may be shorter;
    an empty input yields an empty list.
    '''
    groups = [[]]
    for pair in word_norm_pairs:
        groups[-1].append(pair)
        # Open a fresh group as soon as the current one is full.
        if len(groups[-1]) == groupsize:
            groups.append([])
    # Drop the trailing group if nothing was put into it.
    if not groups[-1]:
        groups.pop()
    return groups

In [None]:
# Chunk the ArchiMob utterances into fixed-size groups, then keep only the first
# element of every (word, norm) pair, giving group -> utterance -> word lists.
grouped_utterances = group_utterances(archimob_corpus.word_norm_pairs)
utterances_unnormed = [
    [[raw_word for raw_word, _norm in utterance] for utterance in group]
    for group in grouped_utterances
]

In [34]:
# Wrap the unnormalized utterances in the test dataset and batch them in their
# original order (shuffle=False) so predictions can be matched back by position.
archimob_dataset = Testing(utterances_unnormed)
archimob_iter = DataLoader(
    archimob_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=0,
    collate_fn=pad_test_sentence,
)


In [57]:
# idx2label is a custom attribute of the classifier. nn.DataParallel does not
# forward such attributes, so read it from the wrapped module when the model
# has been parallelised.
label_owner = model.module if isinstance(model, nn.DataParallel) else model
archimob_normalized_by_classifier = get_normalizations(model, archimob_iter, label_owner.idx2label)

In [59]:
# calculate ERR   
    
def get_model_norms_and_error_rate(list_of_utterances, model_predictions, counts):
    '''Combine model predictions with a lookup table and score them against the gold norms.

    For each word the model's prediction is used, unless ``counts`` lists exactly
    one normalization for that word, in which case that single normalization
    wins. Words that are absent from ``counts`` (or have no recorded
    normalization) fall back to the model's prediction.

    Args:
        list_of_utterances: utterances, each a sequence of ``(word, norm)`` pairs.
        model_predictions: one sequence of predicted normalizations per utterance,
            in the same order as ``list_of_utterances``.
        counts: mapping ``word -> mapping of candidate normalization to a score``
            (presumably a frequency count). It is only read.

    Returns:
        ``(normed_utterances, error_reduction_rate)``. The rate is
        ``(accuracy - baseline) / (100 - baseline)``, where the baseline is the
        percentage of words already identical to their norm. It is ``nan`` when
        undefined, i.e. when there are no words at all or the baseline is 100%.
        Accuracy and baseline (both in percent) are printed as a side effect.

    Raises:
        AssertionError: if the number of utterances and prediction lists differ.
    '''
    assert len(list_of_utterances) == len(model_predictions)

    normed_utterances = []
    total = 0
    hits = 0
    words_unnormed = 0
    for utterance, predictions in zip(list_of_utterances, model_predictions):
        normed_utterance = []
        for (word, norm), pred in zip(utterance, predictions):
            if word == norm:
                words_unnormed += 1
            # .get avoids a KeyError on unseen words and never inserts a key
            # (which plain indexing would do on a defaultdict).
            candidates = counts.get(word)
            if candidates is not None and len(candidates) == 1:
                # Only one known normalization: trust the lookup table.
                prediction = next(iter(candidates))
            else:
                prediction = pred
            normed_utterance.append(prediction)
            if prediction == norm:
                hits += 1
            total += 1
        normed_utterances.append(normed_utterance)

    accuracy = 100 * hits / total if total else 0.0
    unnormed = 100 * words_unnormed / total if total else 0.0
    print(accuracy)
    print(unnormed)
    # Share of words the baseline gets wrong; the rate is undefined when it is 0.
    headroom = 100 - unnormed
    if total and headroom:
        error_reduction_rate = (accuracy - unnormed) / headroom
    else:
        error_reduction_rate = float('nan')
    return normed_utterances, error_reduction_rate

# The classifier output is grouped by batch; flatten it into one prediction
# list per utterance before scoring it against the (word, norm) pairs.
archimob_predictions = [
    n_utterance
    for group in archimob_normalized_by_classifier
    for n_utterance in group
]
enhanced_norms, err = get_model_norms_and_error_rate(
    archimob_corpus.word_norm_pairs,
    archimob_predictions,
    joined_corpus.norm_dict,
)
print(err)

94.32864518303735
24.67590538397774
0.9247072952436618
