In [1]:
import os
import time
import pickle
import numpy as np
from utils import preprocess, features, classifier, metrics, logging

In [2]:
# Global seed; the retrain_* helpers reseed NumPy with it before drawing initial weights.
seed = 42
np.random.seed(seed)

# Directories, relative to the notebook's working directory.
models_path = 'models'  # handed to classifier.Model as `models_path`
data_path = 'data'      # joined with the dataset file names in load_datasets()

# Pickle files written by retrain_model1/2 and read back by load_trained_models().
model1_file_name = 'final_model1.pkl'
model2_file_name = 'final_model2.pkl'

# Names of the competition prediction files (plain literals: no placeholders, so no f-prefix).
# save_wtag() builds the same 'comp_m{version}_321128258.wtag' pattern inline.
comp1_pred_file_name = 'comp_m1_321128258.wtag'
comp2_pred_file_name = 'comp_m2_321128258.wtag'

In [3]:
def load_datasets():
    """Parse the five dataset files found under ``data_path``.

    The unlabeled competition files are given the tag set collected from the
    matching training file.

    Returns:
        Tuple ``(train1_dataset, test1_dataset, comp1_dataset, train2_dataset, comp2_dataset)``.
    """
    def in_data_dir(file_name):
        return os.path.join(data_path, file_name)

    # Model 1 data: train / test / unlabeled competition words.
    train1 = preprocess.Dataset(in_data_dir('train1.wtag'))
    test1 = preprocess.Dataset(in_data_dir('test1.wtag'))
    comp1 = preprocess.Dataset(in_data_dir('comp1.words'), labeled=False, tags=train1.tags)

    # Model 2 data: train / unlabeled competition words (no test split is loaded).
    train2 = preprocess.Dataset(in_data_dir('train2.wtag'))
    comp2 = preprocess.Dataset(in_data_dir('comp2.words'), labeled=False, tags=train2.tags)

    return train1, test1, comp1, train2, comp2

def load_feature_vectors(train1_dataset, train2_dataset):
    """Build one feature vector per training set from a shared table of feature templates.

    Each key of ``group_thresholds`` is a lambda taking ``(t2, t1, w, i, t)`` and
    returning a tuple that identifies a feature; each value is an integer threshold.
    The table is passed unchanged to ``features.create_feature_vector`` for both
    datasets (with ``pruning=True``), and the size of every resulting feature group
    is printed.

    Args:
        train1_dataset: dataset used to build the first feature vector.
        train2_dataset: dataset used to build the second feature vector.

    Returns:
        Tuple ``(feature_vector1, feature_vector2)``.
    """
    # NOTE(review): the printed group names look like the lambda source text
    #   (e.g. "FeatureGroup(tuple([w[i].lower(), t]))"), so `features` presumably
    #   introspects the source of each lambda -- avoid reformatting the table below
    #   until that is confirmed.
    # NOTE(review): presumably (t2, t1, w, i, t) = two previous tags, sentence words,
    #   word index, current tag, and the threshold is a minimum count used for
    #   pruning -- TODO confirm against utils.features.
    # NOTE(review): on a plain list, w[i-1] at i == 0 reads the LAST word and w[i+1]
    #   at the last index raises IndexError; confirm how `features` pads or guards
    #   sentence boundaries.
    # NOTE(review): the template labelled "has_uppercase" actually tests w[i].islower().
    group_thresholds = {
        # -------------------------------- feature --------------------- | -- Threshold --
        lambda t2, t1, w, i, t: tuple([w[i].lower(), t]):                         0,     # mandatory feature f100
        lambda t2, t1, w, i, t: tuple([w[i][-4:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][-3:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][-2:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][-1:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][:4].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([w[i][:3].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([w[i][:2].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([w[i][:1].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([t2, t1, t]):                               1,     # mandatory feature f103
        lambda t2, t1, w, i, t: tuple([t1, t]):                                   1,     # mandatory feature f104
        lambda t2, t1, w, i, t: tuple([t]):                                       1,     # mandatory feature f105
        lambda t2, t1, w, i, t: tuple([w[i].islower(), t]):                       1,     # mandatory feature has_uppercase
        lambda t2, t1, w, i, t: tuple([any(char.isdigit() for char in w[i]), t]): 1,     # mandatory feature has_digits
        lambda t2, t1, w, i, t: tuple([w[i-1].lower(), t]):                       20,
        lambda t2, t1, w, i, t: tuple([w[i+1].lower(), t]):                       20,
        lambda t2, t1, w, i, t: tuple([w[i+1][:3].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i-1][:3].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i+1][:2].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i-1][:2].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i+1][-3:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i-1][-3:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i+1][-2:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i-1][-2:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i].isalnum(), t]):                       10,
        lambda t2, t1, w, i, t: tuple([w[i].isalpha(), t]):                       10,
        lambda t2, t1, w, i, t: tuple([w[i].isascii(), t]):                       10,
        lambda t2, t1, w, i, t: tuple([w[i].isdecimal(), t]):                     10,
        lambda t2, t1, w, i, t: tuple([w[i].isdigit(), t]):                       10,
        lambda t2, t1, w, i, t: tuple([w[i].isnumeric(), t]):                     10,
        lambda t2, t1, w, i, t: tuple([w[i].istitle(), t]):                       10,
        lambda t2, t1, w, i, t: tuple([w[i].isupper(), t]):                       10,
        lambda t2, t1, w, i, t: tuple([len(w[i]), t]):                            10,
    }

    # Feature vector for model 1, built from train1 with the shared template table.
    feature_vector1 = features.create_feature_vector(dataset=train1_dataset,
                                                     group_thresholds=group_thresholds,
                                                     pruning=True,
                                                     get_stats=False,
                                                     assertions=False,
                                                     calls_counter=False)

    # Report the size of each feature group, then the group count and overall size.
    for feat in feature_vector1.feats:
        print('feat_group:', feat, '| feats:', len(feat))
    print('feat_groups:', len(feature_vector1.feats), '| total_feats:', len(feature_vector1))
    print()
    
    # Same construction and report for model 2, built from train2.
    feature_vector2 = features.create_feature_vector(dataset=train2_dataset,
                                                     group_thresholds=group_thresholds,
                                                     pruning=True,
                                                     get_stats=False,
                                                     assertions=False,
                                                     calls_counter=False)

    for feat in feature_vector2.feats:
        print('feat_group:', feat, '| feats:', len(feat))
    print('feat_groups:', len(feature_vector2.feats), '| total_feats:', len(feature_vector2))
    
    return feature_vector1, feature_vector2

def retrain_model1(train1_dataset, feature_vector1):
    """Train model 1 from scratch on ``train1_dataset``, pickle it, and return it.

    Args:
        train1_dataset: training dataset; its ``tags`` attribute is given to the model.
        feature_vector1: feature vector for the model; its length sizes the random
            initial weight vector.

    Returns:
        The trained ``classifier.Model`` with ``feature_vector1`` attached.
    """
    # Reseed so the random initial weights are the same on every call.
    np.random.seed(seed)
    initial_weights = np.random.rand(len(feature_vector1))

    model_options = dict(
        version=1,
        w0=initial_weights,
        tags=train1_dataset.tags,
        inference=classifier.viterbi,
        feature_vector=feature_vector1,
        score_func=metrics.accuracy,
        models_path=models_path,
        max_weights_history=0,
        save=False,
    )
    model1 = classifier.Model(**model_options)

    training_options = dict(
        epochs=92,                     # training epochs
        train_dataset=train1_dataset,
        val_dataset=None,
        batch_size=256,                # initial batch_size for loader
        weight_decay=0.0,              # lambda regularization parameter
        save=False,                    # save model during training (requires dill module)
        tqdm_bar=False,                # display tqdm progress bars
        beam=1,                        # viterbi beam size for model evaluation during training
        train_aprox=0,                 # approximate train_score with first train_aprox train samples
        val_aprox=0,                   # approximate val_score with first val_aprox train samples
        batch_growth=4,                # double batch_size every batch_growth epochs
    )
    _, _, _ = model1.train(**training_options)

    # The model is pickled without its feature vector and gets it back afterwards
    # (presumably because the lambda-based feature vector cannot be pickled -- TODO confirm).
    model1.feature_vector = None
    with open(model1_file_name, 'wb') as out_file:
        pickle.dump(model1, out_file)
    model1.feature_vector = feature_vector1

    return model1

def retrain_model2(train2_dataset, feature_vector2):
    """Train model 2 from scratch on ``train2_dataset``, pickle it, and return it.

    The hyperparameters are declared once as locals and passed to the model and
    trainer from there, so the declared values and the values actually used
    cannot drift apart.

    Args:
        train2_dataset: training dataset; its ``tags`` attribute is given to the model.
        feature_vector2: feature vector for the model; its length sizes the random
            initial weight vector.

    Returns:
        The trained ``classifier.Model`` with ``feature_vector2`` attached.
    """
    # Reseed so the random initial weights are the same on every call.
    np.random.seed(seed)

    version = 2
    train_save = False  # save model during training (requires dill module); if False, save manually
    beam = 1  # viterbi beam size for model evaluation during training
    train_aprox = 0  # approximate train_score with first train_aprox train samples
    val_aprox = 0  # approximate val_score with first val_aprox train samples
    weight_decay = 0.0  # lambda regularization parameter
    init_batch_size = 250  # initial batch_size for loader
    batch_growth = 0  # double batch_size every batch_growth epochs (0 presumably disables growth -- TODO confirm)
    epochs = 43  # training epochs
    tqdm_bar = False  # display tqdm progress bars

    model2 = classifier.Model(version=version,
                              w0=np.random.rand(len(feature_vector2)),
                              tags=train2_dataset.tags,
                              inference=classifier.viterbi,
                              feature_vector=feature_vector2,
                              score_func=metrics.accuracy,
                              models_path=models_path,
                              max_weights_history=0,
                              save=False)

    _, _, _ = model2.train(epochs=epochs,
                           train_dataset=train2_dataset,
                           val_dataset=None,
                           batch_size=init_batch_size,
                           weight_decay=weight_decay,
                           save=train_save,
                           tqdm_bar=tqdm_bar,
                           beam=beam,
                           train_aprox=train_aprox,
                           val_aprox=val_aprox,
                           batch_growth=batch_growth)

    # The model is pickled without its feature vector and gets it back afterwards
    # (presumably because the lambda-based feature vector cannot be pickled -- TODO confirm).
    model2.feature_vector = None
    with open(model2_file_name, 'wb') as f:
        pickle.dump(model2, f)

    model2.feature_vector = feature_vector2

    return model2

def load_trained_models(train1_dataset, train2_dataset):
    """Load both pickled models and attach freshly built feature vectors to them.

    The pickles are read from ``model1_file_name`` and ``model2_file_name``; the
    feature vectors are rebuilt with ``load_feature_vectors`` from the two
    training datasets.

    Returns:
        Tuple ``(model1, model2)``.
    """
    # NOTE: pickle.load can execute arbitrary code -- only point these names at
    # model files you produced yourself.
    loaded_models = []
    for pickle_name in (model1_file_name, model2_file_name):
        with open(pickle_name, "rb") as pickle_file:
            loaded_models.append(pickle.load(pickle_file))
    model1, model2 = loaded_models

    rebuilt_vectors = load_feature_vectors(train1_dataset, train2_dataset)
    model1.feature_vector, model2.feature_vector = rebuilt_vectors

    return model1, model2

def save_wtag(dataset, comp_pred_tags, version):
    """Write predicted tags next to their words in ``word_tag`` format.

    Args:
        dataset: object whose ``sentences`` is a sequence of items with the word
            list at index 0.
        comp_pred_tags: per-sentence sequences of predicted tags, aligned with
            ``dataset.sentences``.
        version: value substituted into the output name
            ``comp_m{version}_321128258.wtag`` (written to the working directory).

    Raises:
        AssertionError: if a sentence and its predictions differ in length.
    """
    rows = []
    for i, sentence in enumerate(dataset.sentences):
        sentence_words = sentence[0]
        sentence_preds = comp_pred_tags[i]
        # Every word needs exactly one predicted tag.
        assert len(sentence_words) == len(sentence_preds), \
            f'i={i}, len(dataset.sentences[i][0])={len(dataset.sentences[i][0])}, len(comp_pred_tags[i])={len(comp_pred_tags[i])}'
        tagged_words = ['_'.join([word, pred]) for word, pred in zip(sentence_words, sentence_preds)]
        rows.append(' '.join(tagged_words))

    # One sentence per line, each line newline-terminated.
    with open(f'comp_m{version}_321128258.wtag', 'w') as out_file:
        out_file.writelines(row + '\n' for row in rows)

In [4]:
# Reseed NumPy's global RNG, then parse every dataset file.
np.random.seed(seed)
train1_dataset, test1_dataset, comp1_dataset, train2_dataset, comp2_dataset = load_datasets()

# # retrain models
# Uncomment the three lines below to train both models from scratch and rewrite the
# pickle files; in that case the "load models" call below is redundant.
# feature_vector1, feature_vector2 = load_feature_vectors(train1_dataset, train2_dataset)
# model1 = retrain_model1(train1_dataset, feature_vector1) 
# model2 = retrain_model2(train2_dataset, feature_vector2)

# load models
# Reads the pickled models and rebuilds their feature vectors (prints the group sizes).
model1, model2 = load_trained_models(train1_dataset, train2_dataset)

feat_group: FeatureGroup(tuple([w[i].lower(), t])) | feats: 14719
feat_group: FeatureGroup(tuple([w[i][-4:].lower(), t])) | feats: 2289
feat_group: FeatureGroup(tuple([w[i][-3:].lower(), t])) | feats: 1757
feat_group: FeatureGroup(tuple([w[i][-2:].lower(), t])) | feats: 955
feat_group: FeatureGroup(tuple([w[i][-1:].lower(), t])) | feats: 248
feat_group: FeatureGroup(tuple([w[i][:4].lower(), t])) | feats: 2540
feat_group: FeatureGroup(tuple([w[i][:3].lower(), t])) | feats: 2431
feat_group: FeatureGroup(tuple([w[i][:2].lower(), t])) | feats: 1432
feat_group: FeatureGroup(tuple([w[i][:1].lower(), t])) | feats: 392
feat_group: FeatureGroup(tuple([t2, t1, t])) | feats: 5192
feat_group: FeatureGroup(tuple([t1, t])) | feats: 908
feat_group: FeatureGroup(tuple([t])) | feats: 44
feat_group: FeatureGroup(tuple([w[i].islower(), t])) | feats: 75
feat_group: FeatureGroup(tuple([any(char.isdigit() for char in w[i]), t])) | feats: 49
feat_group: FeatureGroup(tuple([w[i-1].lower(), t])) | feats: 616
f

In [5]:
# train1_dataset, test1_dataset, comp1_dataset, train2_dataset, comp2_dataset = load_datasets()

# model1_test1 = classifier.load_model(from_file='models/V1.2/model_V1.2_E092_beam1_test1_acc0.9532.pth', prints=False)
# model1 = classifier.load_model(from_file='models/V1.2/model_V1.2_E092_beam10_comp_acc0.9275.pth', prints=False)
# model2 = classifier.load_model(from_file='models/V2.1/model_V2.1_E043_beam1_comp_acc0.9264.pth', prints=False)

In [6]:
# test1_pred_tags, test1_true_tags = model1_test1.comp_preds[1]['pred_tags'], model1_test1.comp_preds[1]['true_tags']
# test1_accuracy = model1.score_func(test1_pred_tags, test1_true_tags)
# test1_confusion_matrix, test1_tags_accuracy = metrics.confusion_matrix(train1_dataset.tags, test1_pred_tags, test1_true_tags)

# worst10_test1_confusion_matrix = test1_confusion_matrix.loc[list(test1_tags_accuracy.keys())[:10], list(test1_tags_accuracy.keys())[:10]]
# worst10_test1_tags_accuracy = list(test1_tags_accuracy)[:10]

# # comp1_pred_tags, _ = model1.predict(comp1_dataset.sentences, beam=5, tqdm_bar=False)
# # comp2_pred_tags, _ = model2.predict(comp2_dataset.sentences, beam=1, tqdm_bar=False)
# comp1_5_pred_tags, _ = model1.comp_preds[5]['pred_tags'], None
# comp1_10_pred_tags, _ = model1.comp_preds[10]['pred_tags'], None
# comp2_pred_tags, _ = model2.comp_preds[1]['pred_tags'], None

# # save_wtag(comp1_dataset, comp1_pred_tags, 1)
# # save_wtag(comp2_dataset, comp2_pred_tags, 2)

# # model1.plot(['train_loss'], 'loss', 'loss', scale='linear', basey=10)
# # model1.plot(['batch_size'], 'batch_size', 'batch_size', scale='log', basey=2)

# # model2.plot(['train_loss'], 'loss', 'loss', scale='linear', basey=10)
# # model2.plot(['batch_size'], 'batch_size', 'batch_size', scale='log', basey=2)

In [12]:
# Score prediction files against the gold-tagged competition data.
# NOTE(review): the prediction paths are machine-specific absolute paths; this cell
# only runs where those files exist.
comp1_pred_dataset = preprocess.Dataset('/mnt/d/Downloads/comp_m1_312146897.wtag')
comp2_pred_dataset = preprocess.Dataset('/mnt/d/Downloads/comp_m2_312146897.wtag')
comp1_true_dataset = preprocess.Dataset(os.path.join(data_path, 'comp1_tagged.wtag'))
comp2_true_dataset = preprocess.Dataset(os.path.join(data_path, 'comp2_tagged.wtag'))

# Index 1 of every sentence is taken as its tag list (predicted vs. gold).
comp1_accuracy = model1.score_func([sentence[1] for sentence in comp1_pred_dataset.sentences],
                                   [sentence[1] for sentence in comp1_true_dataset.sentences])
# comp2 belongs to model 2, so it is scored with model2's metric
# (this call previously went through model1.score_func).
comp2_accuracy = model2.score_func([sentence[1] for sentence in comp2_pred_dataset.sentences],
                                   [sentence[1] for sentence in comp2_true_dataset.sentences])

In [13]:
# Report the competition accuracies; test1_accuracy is only defined by a
# commented-out cell, so its print stays disabled.
# print('test1_accuracy:', test1_accuracy)
print('comp1_accuracy:', comp1_accuracy)
print('comp2_accuracy:', comp2_accuracy)

comp1_accuracy: 0.005428559311251537
comp2_accuracy: 0.0


In [7]:
# Show the kernel's working directory; relative paths such as `data_path` and the
# model pickle names resolve against it.
%pwd

'/mnt/c/Users/alexz/OneDrive/MainEnv/nlp/MEMM_Part_of_Speech_Tagging'

In [7]:
# comp2_true_dataset = preprocess.Dataset(os.path.join(data_path, 'comp2_tagged.wtag'))
# comp2_pred_dataset = preprocess.Dataset('test_t2_word3_beam6.txt')
# comp2_accuracy = model2.score_func([sentence[1] for sentence in comp2_pred_dataset.sentences],
#                                    [sentence[1] for sentence in comp2_true_dataset.sentences])
# comp2_accuracy

0.9133936759838909

In [18]:
# Sanity check: tags (index 1) of the first sentence, predicted file first, gold file second.
print(comp1_pred_dataset.sentences[0][1])
print(comp1_true_dataset.sentences[0][1])

['#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#']
['NNS', 'VBD', 'EX', 'VBD', 'DT', 'JJ', 'NNS', 'IN', 'NNP', 'NNP', 'CC', 'NNP', 'NNP', 'NNP', 'NNP', 'NNS', 'IN', 'NNS', 'IN', 'JJ', 'NN', 'POS', 'JJ', 'NN', 'IN', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NNS', 'VBD', 'TO', 'VB', 'NN', 'IN', 'JJ', 'NNS', '.']


In [19]:
import numpy as np


class Dataset():
    """Sentences read from a whitespace-tokenised text file, one sentence per line.

    In a labeled file every token is ``word_tag``; in an unlabeled file every token
    is a bare word and its tag is stored as ``None``.

    Attributes set by ``__init__``:
        sentences: list of ``(words, tags)`` tuples, one per input line.
        tags: set of tags seen in the file (labeled) or the tags passed in (unlabeled).
        words_counter: total number of tokens read; this is what ``len()`` returns.
        batch_loader: queue of sentences still to be served by ``load_batch``.
        shuffled: whether ``batch_loader`` was last (re)built with shuffling.
    """

    def __init__(self, file_name, labeled=True, tags=None):
        """
        args:
            * file_name - path of the text file to read
            * labeled - is the dataset labeled?
            * tags - if dataset is not labeled, what are the tags
              (ignored for labeled datasets)
        raises:
            * ValueError - if a token of a labeled file contains no '_'
        """
        self.words_counter = 0
        self.sentences = []
        self.labeled = labeled
        self.tags = set()
        if not labeled and tags:
            self.tags = set(tags)
        with open(file_name, 'r') as f:
            for line in f:
                words = []
                sentence_tags = []  # distinct name: must not shadow the `tags` argument
                for word_tag in line.split():
                    if self.labeled:
                        # Split on the LAST underscore only, so a word that itself
                        # contains '_' keeps its text and the tag is what follows.
                        word, tag = word_tag.rsplit('_', 1)
                        self.tags.add(tag)
                    else:
                        word = word_tag
                        tag = None
                    sentence_tags.append(tag)
                    words.append(word)
                    self.words_counter += 1

                self.sentences.append((words, sentence_tags))
        self.batch_loader = self.sentences.copy()
        self.shuffled = False

    @staticmethod
    def _history_tuples(sentences):
        """Yield (t2, t1, w, i, t) for every word position of every sentence.

        ``t`` is the tag at position ``i`` of word list ``w``; ``t1`` and ``t2`` are
        the one and two preceding tags, with '*' standing in before the sentence start.
        """
        for w, sentence_tags in sentences:
            t1, t = '*', '*'
            for i in range(len(w)):
                t2, t1, t = t1, t, sentence_tags[i]
                yield t2, t1, w, i, t

    def _init_loader(self, shuffle, seed, new=False):
        """Refill the batch queue with a (optionally shuffled) copy of all sentences.

        args:
            * shuffle - shuffle the copy; this reseeds NumPy's GLOBAL RNG with `seed`
            * seed - seed used when shuffling
            * new - replace the queue (True) or append to what is left of it (False)
        """
        new_batch_loader = self.sentences.copy()
        if shuffle:
            np.random.seed(seed)
            np.random.shuffle(new_batch_loader)
        self.shuffled = shuffle
        if new:
            self.batch_loader = new_batch_loader
        else:
            self.batch_loader.extend(new_batch_loader)

    def load_batch(self, batch_size=None, shuffle=False, seed=42):
        """
        args:
            * batch_size=None - batch_size to load, if batch_size=None -> batch_size=len(self.sentences)
            * shuffle - reshuffle loaded sentences
            * seed - seed for np.random, applied whenever the queue is reshuffled
        return:
            * generator that iterates batch_size of sentences and words yields tuples of t2, t1, w, i, t

        This is a generator function: the batch is taken off the queue only when the
        returned generator is first advanced.
        """
        if not batch_size:
            batch_size = len(self.sentences)
        # Changing the shuffle mode restarts the queue from scratch.
        if self.shuffled != shuffle:
            self._init_loader(shuffle, seed, new=True)
        # Top the queue up (once) when it cannot serve a full batch.
        if len(self.batch_loader) < batch_size:
            self._init_loader(shuffle, seed)

        sentences = self.batch_loader[:batch_size]
        del self.batch_loader[:batch_size]
        yield from self._history_tuples(sentences)

    def __len__(self):
        """Number of tokens (not sentences) read from the file."""
        return self.words_counter

    def __iter__(self):
        """Iterate (t2, t1, w, i, t) tuples over all sentences in file order."""
        yield from self._history_tuples(self.sentences)


def accuracy(pred_tags, true_tags):
    """Fraction of predictions that equal their gold value.

    Items of ``pred_tags`` and ``true_tags`` are paired with ``zip``. When the
    prediction item is a ``list`` it is compared with its partner element by
    element (again via ``zip``, so extra elements on either side are ignored);
    any other item counts as a single comparison.

    Returns:
        ``correct / total`` as a float, or 0.0 when nothing was compared.
    """
    hits = 0
    seen = 0
    for predicted, gold in zip(pred_tags, true_tags):
        if not isinstance(predicted, list):
            # Scalar prediction: one comparison for the whole item.
            seen += 1
            if predicted == gold:
                hits += 1
            continue
        for single_pred, single_gold in zip(predicted, gold):
            seen += 1
            if single_pred == single_gold:
                hits += 1

    return float(hits) / seen if seen > 0 else 0.0


# Re-score both prediction files with the Dataset/accuracy definitions from this cell.
# Each sentence is a (words, tags) pair; only the tags are compared.
comp1_pred_dataset = Dataset('/mnt/d/Downloads/comp_m1_312146897.wtag')
comp1_true_dataset = Dataset(os.path.join(data_path, 'comp1_tagged.wtag'))
comp1_accuracy = accuracy(
    [tags for _, tags in comp1_pred_dataset.sentences],
    [tags for _, tags in comp1_true_dataset.sentences],
)

comp2_pred_dataset = Dataset('/mnt/d/Downloads/comp_m2_312146897.wtag')
comp2_true_dataset = Dataset(os.path.join(data_path, 'comp2_tagged.wtag'))
comp2_accuracy = accuracy(
    [tags for _, tags in comp2_pred_dataset.sentences],
    [tags for _, tags in comp2_true_dataset.sentences],
)

print('comp1_accuracy:', comp1_accuracy)
print('comp2_accuracy:', comp2_accuracy)

comp1_accuracy: 0.005428559311251537
comp2_accuracy: 0.0
