In [56]:

import os
import json
import pickle
import torch
import torch.nn as nn
import numpy as np
import importlib
import normalizerFunctions 
normalizerFunctions = importlib.reload(normalizerFunctions)
from normalizerFunctions import Training_Corpus, Token_Classifier
from tokenize_GSW import tokenize, PUNCT
from torch.utils import data
from torch.utils.data import DataLoader
from Testing_Dataset import Testing
from collections import Counter

In [4]:
def pad_test_sentence(batch):
    """Collate a list of test samples, padding every id list to the longest sample.

    Samples are read positionally: ``sample[0]`` (words), ``sample[1]`` (the
    integer id list that gets padded), ``sample[2]`` (is_heads markers) and
    ``sample[-1]`` (sequence length).

    Returns:
        tuple: ``(words, x, is_heads, seqlens)`` where ``x`` is a
        ``torch.LongTensor`` built from the padded id lists and the other
        three are plain per-sample lists.
    """
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    longest = np.array(seqlens).max()

    padded_ids = []
    for sample in batch:
        ids = sample[1]
        padded_ids.append(ids + [0] * (longest - len(ids)))  # 0: <pad>

    return words, torch.LongTensor(padded_ids), is_heads, seqlens

In [6]:

def get_normalizations(model, iterable, idx2label):
    """Run ``model`` over every batch and translate predicted ids into labels.

    Args:
        model: callable as ``model(x, y)`` returning a 3-tuple whose last
            element holds the predicted label ids per position; it is put
            into eval mode first.
        iterable: yields batches of ``(lines, x, is_heads, seqlens)``, the
            layout produced by ``pad_test_sentence``.
        idx2label: mapping from label id to label string. It is only read:
            ids missing from it are rendered as ``'<pad>'`` without being
            inserted into the mapping.

    Returns:
        list: one entry per batch; each entry is a list with one label list
        per line. The first and last label of every line are dropped
        (presumably the sentence-boundary tokens -- confirm against the
        dataset class).

    Raises:
        AssertionError: if the number of head predictions of a line differs
            from the number of items in that line.
    """
    normalized = []
    model.eval()
    with torch.no_grad():
        for b_lines, x, b_is_heads, _seqlens in iterable:
            # Dummy targets: the model signature requires y, but its value is not used.
            y = torch.zeros_like(x, dtype=torch.long)
            _, _, b_predictions = model(x, y)
            b_predictions = b_predictions.detach().cpu().numpy()

            batch_norms = []
            for line, line_preds, is_heads in zip(b_lines, b_predictions, b_is_heads):
                # Keep only the prediction made at positions flagged as heads.
                head_preds = [pred for head, pred in zip(is_heads, line_preds) if head == 1]
                # .get() keeps the caller's mapping untouched for unknown ids.
                preds = [idx2label.get(pred, '<pad>') for pred in head_preds]
                assert len(preds) == len(line)
                batch_norms.append(preds[1:-1])
            normalized.append(batch_norms)
    return normalized

In [7]:
# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
mps_available = torch.backends.mps.is_available()
device = torch.device("mps") if mps_available else torch.device("cpu")
print(mps_available)

True


In [39]:
# load corpus files
# NOTE: pickle.load executes code embedded in the file -- only unpickle files you produced yourself.
with open('pickled_archimob.pkl', 'rb') as archimob_handle:
    archimob_corpus = pickle.load(archimob_handle)

with open('pickled_train_corpus.pkl', 'rb') as train_handle:
    joined_corpus = pickle.load(train_handle)

In [8]:
# re-initialize model
# map_location remaps tensors that were saved on another device (e.g. a CUDA
# training machine) onto the device selected for this session, so the load
# does not fail when the saving device does not exist here.
# NOTE(review): this unpickles a full model object -- only load trusted checkpoints.
model = torch.load('token_classifier_archimob.pt', map_location=device)
model.to(device)
if torch.cuda.device_count() > 1:
    print(True)
    # NOTE(review): nn.DataParallel wraps the model, so attributes such as
    # `idx2label` would then live on `model.module` -- confirm before relying on this branch.
    model = nn.DataParallel(model)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [13]:
# group utterances into lists of an arbitrary length so they are the same format as our songs
def group_utterances(word_norm_pairs, groupsize=10):
    """Split ``word_norm_pairs`` into consecutive chunks of ``groupsize`` items.

    The final chunk holds whatever is left over and may be shorter. An empty
    input yields an empty list.
    """
    chunks = [[]]
    for pair in word_norm_pairs:
        chunks[-1].append(pair)
        if len(chunks[-1]) == groupsize:
            # Current chunk is full: open a fresh one for the next item.
            chunks.append([])
    if not chunks[-1]:
        # Drop the trailing chunk that was opened but never filled.
        chunks.pop()
    return chunks

In [None]:
grouped_utterances = group_utterances(archimob_corpus.word_norm_pairs)
# Keep only the first element of every (word, norm) pair; the
# group -> utterance -> token nesting is preserved.
utterances_unnormed = [
    [[raw for raw, _norm in utterance] for utterance in group]
    for group in grouped_utterances
]

In [34]:
# shuffle=False keeps the batches in dataset order.
archimob_dataset = Testing(utterances_unnormed)
archimob_iter = DataLoader(
    archimob_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=0,
    collate_fn=pad_test_sentence,
)


In [57]:
# Classifier output for the ArchiMob batches: one list per batch, each holding one label list per line.
archimob_normalized_by_classifier = get_normalizations(model, archimob_iter, model.idx2label)

In [None]:
# calculate Combined Model ERR for ArchiMob corpus
    
def get_model_norms_and_error_rate(list_of_utterances, model_predictions, counts):
    """Combine lookup and model predictions and score them against the gold norms.

    For every word the final prediction is:
      * the single known normalization, when ``counts[word]`` has exactly one;
      * the model's prediction otherwise (several known normalizations, or a
        word that is absent from ``counts``).

    Args:
        list_of_utterances: utterances, each a sequence of ``(word, norm)`` pairs.
        model_predictions: one prediction sequence per utterance, aligned
            position by position with the utterance.
        counts: mapping ``word -> {normalization: count}``.

    Returns:
        tuple: ``(normed_utterances, Err_Red_rate)``. The rate is
        ``(accuracy - unnormed) / (100 - unnormed)``, where ``unnormed`` is the
        percentage of words already identical to their norm. It is ``nan``
        when it is undefined (no tokens at all, or ``unnormed`` is 100).

    Raises:
        AssertionError: if the number of utterances and predictions differ.
    """
    assert len(list_of_utterances) == len(model_predictions)
    normed_utterances = []
    total = 0
    hits = 0
    words_unnormed = 0
    for utterance, predictions in zip(list_of_utterances, model_predictions):
        normed_utterance = []
        for (word, norm), pred in zip(utterance, predictions):
            if word == norm:
                words_unnormed += 1
            # Use the model's prediction unless there is only a single
            # normalization; unseen words also fall back to the model.
            if word in counts and len(counts[word]) <= 1:
                prediction = max(counts[word], key=counts[word].get)
            else:
                prediction = pred
            normed_utterance.append(prediction)
            if prediction == norm:
                hits += 1
            total += 1
        normed_utterances.append(normed_utterance)

    if total == 0:
        # Nothing was scored, so the rates are undefined.
        return normed_utterances, float('nan')

    accuracy = 100 * hits / total
    unnormed = 100 * words_unnormed / total
    print(accuracy)
    print(unnormed)
    if unnormed == 100:
        # The leave-as-is baseline is already perfect: no error left to reduce.
        return normed_utterances, float('nan')
    Err_Red_rate = (accuracy - unnormed) / (100 - unnormed)
    return normed_utterances, Err_Red_rate

# The classifier output is grouped per batch; flatten it to one prediction list per utterance.
archimob_predictions_flat = [
    n_utterance
    for group in archimob_normalized_by_classifier
    for n_utterance in group
]
enhanced_norms, err = get_model_norms_and_error_rate(
    archimob_corpus.word_norm_pairs,
    archimob_predictions_flat,
    joined_corpus.norm_dict,
)
print(err)

In [9]:
def normalize_song_corpus(model, corpus_texts, idx2label):
    """Normalize every song of ``corpus_texts`` with the classifier.

    A single ``Testing`` dataset is built over the whole corpus and each song
    is processed through a ``Subset`` covering that song's index range
    (assumes the dataset exposes one item per line, in corpus order --
    confirm against ``Testing``).

    Args:
        model: classifier passed through to ``get_normalizations``.
        corpus_texts: list of songs; each song is a list of tokenized lines.
        idx2label: label-id to label mapping passed to ``get_normalizations``.

    Returns:
        tuple: ``(normalized_songs, zipped)``.
        ``normalized_songs`` holds, per song, the raw ``get_normalizations``
        output (a list of batches, each a list of per-line label lists).
        ``zipped`` holds, per song, a list of ``(line, predicted_labels)``
        pairs, one per line.
    """
    corpus_dataset = Testing(corpus_texts)
    normalized_songs = []
    zipped = []
    index = 0
    for song in corpus_texts:
        song_len = len(song)
        song_subset = data.Subset(corpus_dataset, range(index, index + song_len))
        song_iter = DataLoader(dataset=song_subset,
                               batch_size=512,
                               shuffle=False,
                               num_workers=0,
                               collate_fn=pad_test_sentence)
        normalized_by_classifier = get_normalizations(model, song_iter, idx2label)
        normalized_songs.append(normalized_by_classifier)
        # The classifier output is grouped by batch; flatten it so each song
        # line is paired with its own predictions rather than with a whole
        # batch. A list (not a one-shot zip iterator) can be re-read safely.
        normalized_lines = [line for batch in normalized_by_classifier for line in batch]
        zipped.append(list(zip(song, normalized_lines)))
        index += song_len
    return normalized_songs, zipped

In [12]:
# load corpus 

corpus = "JSON_files/"
corpus_texts = []
for filename in sorted(os.listdir(corpus)):
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(corpus, filename), encoding="utf-8") as file:
        song_json = json.load(file)
    lines = song_json["raw_text"]
    # Ignore first two lines of songs with titles (songs with an empty second line).
    # A song with fewer than two lines has no second line to inspect, so it is kept whole.
    if len(lines) > 1 and not lines[1].strip():
        start = 2
    else:
        start = 0
    text = [tokenize(line, PUNCT) for line in lines[start:] if line]
    corpus_texts.append(text)

In [16]:
# normalize corpus
# normalize_song_corpus returns (normalized_songs, zipped); both lists have one entry per song.
normalized_songs, zipped = normalize_song_corpus(model, corpus_texts,model.idx2label)

In [22]:
# flatten sublists: every song is a list of batches; merge them into one list of lines per song.
# Works for any number of batches per song (including none), not only exactly one.
# NOTE: this rebinds `normalized_songs`, so run the cell once per normalization run.
normalized_songs = [
    [line for batch in song_batches for line in batch]
    for song_batches in normalized_songs
]

[[['es', 'ist', 'eine', 'mantel', 'gewesen', 'wo', 'sie', 'sich', 'dann', 'begegnet', 'sie'], ['sie', 'mit', 'rückblick', 'volle', 'vorher', 'an', 'seinem', 'haus', 'vorbei'], ['wo', 'er', 'das', 'lachen', 'sieht', 'ist', 'es', 'dann', 'plötzlich', 'um', 'ihn', 'gewesen'], ['er', 'lugen', 'nach', 'bis', 'er', 'sie', 'nicht', 'kann', 'sehen'], ['sie', 'ist', 'ein', 'blütenstecher', 'hat', 'wind', 'und', 'sonne', 'im', 'her', 'und', 'er', 'eine', 'jubel'], ['ein', 'mann', 'aus', 'der', 'bergen', 'nach', 'viel', 'jahre', 'sind', 'sie', 'iz', 'endlich', 'ein', 'paar'], ['weil', 'die', 'liebe', 'acker', 'ist', 'alles', 'mengen', 'schwer', 'willt', 'habe'], [], ['es', 'ist', 'es', 'schon', 'ein', 'wil', 'her', 'vergessen', 'hat', 'er', 'sie', 'nie', 'mehr'], ['was', 'wisch', 'dann', 'du', 'mit', 'so', 'vertrockneten', 'blütenstecher'], ['haben', 'die', 'leute', 'dann', 'hoch', 'genommen'], ['wenn', 'er', 'das', 'lachen', 'sieht', 'ist', 'gäng', 'noch', 'um', 'ihn', 'gewesen'], ['sie', 'herz'

In [36]:
# Use a context manager so the file handle is closed once the pickle is read.
# NOTE: pickle.load executes code embedded in the file -- only unpickle trusted files.
with open('normalized_songs.pickle', 'rb') as file:
    old = pickle.load(file)


In [59]:
def CombinedModel(songs, model_predictions, counts):
    """Normalize songs by combining the lookup table with the model's predictions.

    Per word:
      * absent from ``counts`` (OOV): use the model's prediction;
      * several known normalizations (ambiguous): use the model's prediction;
      * otherwise: use the single known normalization.

    Corpus statistics are printed. On an empty corpus the proportions are
    printed as 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        songs: list of songs, each a list of lines, each a list of words.
        model_predictions: same nesting as ``songs``, holding the model's
            prediction for every word.
        counts: mapping ``word -> {normalization: count}``.

    Returns:
        tuple: ``(normed_songs, oov, unnormed_corpus, ambiguous, total_corpus)``
        where ``unnormed_corpus`` counts predictions identical to the input word.

    Raises:
        AssertionError: if ``songs`` and ``model_predictions`` differ in length.
    """
    assert len(songs) == len(model_predictions)
    unnormed_corpus = 0
    oov = 0
    ambiguous = 0
    normed_songs = []
    total_words = []
    for lines, preds in zip(songs, model_predictions):
        normed_lines = []
        for line, pred_line in zip(lines, preds):
            normed_line = []
            for word, pred in zip(line, pred_line):
                if word not in counts:
                    prediction = pred
                    oov += 1
                elif len(counts[word]) > 1:
                    # use the model's prediction unless there is only a single normalization
                    prediction = pred
                    ambiguous += 1
                else:
                    prediction = max(counts[word], key=counts[word].get)
                if prediction == word:
                    unnormed_corpus += 1
                normed_line.append(prediction)
                total_words.append(prediction)
            normed_lines.append(normed_line)
        normed_songs.append(normed_lines)

    total_corpus = len(total_words)
    denominator = total_corpus or 1  # avoid dividing by zero on an empty corpus
    type_counts = Counter(total_words)
    print("total tokens: ", total_corpus)
    print("total unique tokens: ", len(type_counts))
    print("proportion unique: ", len(type_counts) / denominator)
    print("total OOV: ", oov)
    print("proportion", oov / denominator)
    print("proportion unnormed", unnormed_corpus / denominator)
    print("proportion normed by model: ", (ambiguous + oov) / denominator)
    hapax = sum(1 for v in type_counts.values() if v == 1)
    print('hapax legomena: ', hapax)
    return normed_songs, oov, unnormed_corpus, ambiguous, total_corpus

In [63]:
def MFR_Model(songs, counts):
    """Normalize songs with the most frequent known normalization of each word.

    Words absent from ``counts`` (OOV) are left unchanged. Corpus statistics
    are printed. On an empty corpus the proportions are printed as 0.0
    instead of raising ``ZeroDivisionError``.

    Args:
        songs: list of songs, each a list of lines, each a list of words.
        counts: mapping ``word -> {normalization: count}``.

    Returns:
        tuple: ``(normed_songs, oov, unnormed_corpus, total_corpus)`` where
        ``unnormed_corpus`` counts predictions identical to the input word.
    """
    unnormed_corpus = 0
    oov = 0
    normed_songs = []
    total_words = []
    for song in songs:
        normed_lines = []
        for line in song:
            normed_line = []
            for word in line:
                if word in counts:
                    prediction = max(counts[word], key=counts[word].get)
                else:
                    prediction = word
                    oov += 1
                if prediction == word:
                    unnormed_corpus += 1
                normed_line.append(prediction)
                total_words.append(prediction)
            normed_lines.append(normed_line)
        normed_songs.append(normed_lines)

    total_corpus = len(total_words)
    denominator = total_corpus or 1  # avoid dividing by zero on an empty corpus
    type_counts = Counter(total_words)
    print("total tokens: ", total_corpus)
    print("total unique tokens: ", len(type_counts))
    print("proportion unique: ", len(type_counts) / denominator)
    print("total OOV: ", oov)
    print("proportion", oov / denominator)
    print("proportion unnormed", unnormed_corpus / denominator)
    hapax = sum(1 for v in type_counts.values() if v == 1)
    print('hapax legomena: ', hapax)
    return normed_songs, oov, unnormed_corpus, total_corpus

In [64]:
# MFR_Model prints corpus statistics and returns (normed_songs, oov, unnormed_corpus, total_corpus);
# only the normalized songs are kept here.
normed_songs_MFR, _,_,_ = MFR_Model(corpus_texts,joined_corpus.norm_dict)

total tokens:  52874
total unique tokens:  8291
proportion unique:  0.15680674811816772
total OOV:  11031
proportion 0.2086280591595113
proportion unnormed 0.5908953360820063
hapax legomena:  4927


In [62]:
# CombinedModel prints corpus statistics and returns
# (normed_songs, oov, unnormed_corpus, ambiguous, total_corpus); only the normalized songs are kept here.
normed_songs_combi_model, _,_,_,_ = CombinedModel(corpus_texts, normalized_songs, joined_corpus.norm_dict)

total tokens:  52874
total unique tokens:  4047
proportion unique:  0.07654045466580928
total OOV:  11031
proportion 0.2086280591595113
proportion unnormed 0.36933086204940047
proportion normed by model:  0.8887165714718008
hapax legomena:  1406


In [66]:
# Write the tokenized songs and their combined-model normalizations to disk.
for pickle_path, payload in (
    ('corpus_texts.pkl', corpus_texts),
    ('corpus_normalized.pkl', normed_songs_combi_model),
):
    with open(pickle_path, "wb") as file:
        pickle.dump(payload, file)