In [41]:
import argparse
import codecs
import glob
import json
import os
import random
import re
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.utils import shuffle

# NOTE(review): the RNN language-model cells further down also reference `dy`
# and `util`, which are not imported anywhere in the visible notebook.
%matplotlib inline

In [8]:
# Working directory of the running kernel; the baseline() call further down
# builds the dataset and answers locations relative to this value.
cwd = os.getcwd()

In [26]:
def read_files(path: str, label: str):
    """
    Read every '*.txt' file found in the sub-folder 'label' of 'path'.

    Parameters
    ----------
    path : str
        Folder that contains one sub-folder per class.
    label : str
        Name of the sub-folder to read; it is also attached to every text
        as its class label.

    Returns
    -------
    list of (str, str)
        One (text, label) tuple per file, decoded as UTF-8, in the order
        in which glob lists the files. Empty if nothing matches.
    """
    files = glob.glob(path+os.sep+label+os.sep+'*.txt')
    texts = []
    for filename in files:
        # the context manager closes the handle even when read() raises
        # (e.g. on a file that is not valid UTF-8)
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            texts.append((f.read(), label))
    return texts

In [27]:
def regex(string: str, model: str):
    """
    Normalise 'string' with regular expressions before n-gram extraction.

    Steps, in order:
      1. every digit becomes '0';
      2. every run of ' \\n' / ' \\t' (a space followed by newline or tab) is removed;
      3. every hyperlink (http:// or https:// up to the next whitespace) becomes '@';
      4. model == 'word': every non-alphanumeric character is dropped
         (this also drops the '@' placeholder);
      5. model == 'char-dist': every run of ASCII letters becomes a single '*'.

    Parameters
    ----------
    string : str
        Text (or single word) to normalise.
    model : str
        'word', 'char-dist', or any other value (no model-specific step).

    Returns
    -------
    str
        The normalised text.
    """
    string = re.sub(r"[0-9]", "0", string) # each digit will be represented as a 0
    string = re.sub(r'( \n| \t)+', '', string)
    # The previous pattern ("https:\\\+...") required literal backslashes after
    # 'https:' and therefore never matched a real URL.
    string = re.sub(r"https?://\S+", "@", string)

    if model == 'word':
        # if model is a word n-gram model, remove all punctuation
        string = ''.join(char for char in string if char.isalnum())

    if model == 'char-dist':
        string = re.sub(r"[a-zA-Z]+", "*", string)

    return string

In [28]:
def frequency(tokens: list):
    """
    Tally how often each token occurs in 'tokens'.

    Returns a dict mapping every distinct token to its number of occurrences;
    keys appear in order of first occurrence.
    """
    counts = {}
    for item in tokens:
        counts[item] = counts.get(item, 0) + 1
    return counts

In [34]:
def represent_text(text, n: int, model: str):
    """
    Extracts all character or word 'n'-grams from a given 'text' and counts them.

    The text is first normalised by regex() (digits -> 0, hyperlinks -> @, plus
    the model-specific step).

    Parameters
    ----------
    text : str
        Raw document.
    n : int
        Length of the n-grams (characters, or words for the 'word' model).
    model : str
        'char-std'  : character n-grams; for n == 2 every non-alphanumeric
                      single character is additionally counted as a unigram.
        'word'      : n-grams over whitespace-separated, normalised words,
                      joined by a single space.
        anything else (e.g. 'char-dist') : plain character n-grams.

    Returns
    -------
    dict
        n-gram -> number of occurrences in 'text'. Empty when the text is
        shorter than 'n'.
    """
    if model == 'word':
        # normalise every word exactly once and drop words that end up empty
        normalised = (regex(word, model) for word in text.split())
        words = [word for word in normalised if word]
        tokens = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    else:
        text = regex(text, model)
        tokens = [text[i:i+n] for i in range(len(text)-n+1)]

        if model == 'char-std' and n == 2:
            # add every punctuation / whitespace character as a unigram
            tokens.extend(char for char in text if not char.isalnum())

    return frequency(tokens)

In [35]:
def extract_vocabulary(texts: list, n: int, ft: int, model: str):
    """
    Collects every 'n'-gram whose total count over all 'texts' is at least 'ft'.

    The n-grams of each text come from represent_text(text, n, model); their
    counts are summed across texts. The result is a list of n-grams in order
    of first appearance.
    """
    totals = {}
    for document in texts:
        for ngram, count in represent_text(document, n, model).items():
            totals[ngram] = totals.get(ngram, 0) + count

    return [ngram for ngram, total in totals.items() if total >= ft]

In [36]:
def extend_vocabulary(n_range: int, n_start: int, texts: list, model: str):
    """
    Builds one vocabulary covering every n-gram order from 'n_start' up to and
    including 'n_range'.

    For order n the frequency threshold handed to extract_vocabulary is
    (n_range - n) + 1, i.e. it shrinks by one for each longer order and is 1
    for n == n_range. The per-order vocabularies are concatenated in
    increasing order of n.
    """
    return [
        ngram
        for order in range(n_start, n_range + 1)
        for ngram in extract_vocabulary(texts, order, (n_range - order) + 1, model)
    ]

In [56]:
def _tfidf_features(train_texts, test_texts, n_best_factor, **vectorizer_kwargs):
    """
    Vectorise, select and scale the texts for one n-gram model.

    A TfidfVectorizer (l2 norm, smoothed idf, sublinear tf) is fitted on
    'train_texts'. The int(n_features * n_best_factor) features with the
    smallest idf are kept, and a MaxAbsScaler fitted on the training matrix is
    applied to both matrices.

    Returns the pair (scaled_train, scaled_test) as dense arrays.
    """
    vectorizer = TfidfVectorizer(use_idf = True, norm = 'l2', smooth_idf = True,
                                 sublinear_tf = True, **vectorizer_kwargs)
    train_data = vectorizer.fit_transform(train_texts).toarray()

    # indices of the n_best features with the lowest idf
    n_best = int(len(vectorizer.idf_) * n_best_factor)
    keep = np.argsort(vectorizer.idf_)[:n_best]

    train_data = train_data[:, keep]
    test_data = vectorizer.transform(test_texts).toarray()[:, keep]

    # one scaler per feature set, fitted on the training data only
    scaler = preprocessing.MaxAbsScaler()
    return scaler.fit_transform(train_data), scaler.transform(test_data)


def baseline(path, outpath, n_start = 2, n_range = 5, pt = 0.1, n_best_factor = 0.7,
             lower = False):
    """
    Runs the authorship-attribution baseline on every problem of a collection.

    For each problem listed in '<path>/collection-info.json' the known texts of
    all candidate authors are read, n-gram vocabularies are extracted, tf-idf
    features are built, a calibrated one-vs-rest SVM is trained on the standard
    character n-gram features and the unknown texts are attributed. Predictions
    are written to '<outpath>/answers-<problem>.json'.

    Parameters
    ----------
    path : str
        Root folder of the collection.
    outpath : str
        Folder for the answer files; created if it does not exist.
    n_start : int
        Smallest n-gram order used for the vocabularies.
    n_range : int
        Largest character n-gram order (word n-grams always stop at 3).
    pt : float
        Reject threshold: a text is labelled '<UNK>' when the two highest
        class probabilities differ by less than 'pt'.
    n_best_factor : float
        Share of the features (those with the lowest idf) that is kept.
    lower : bool
        Passed to the vectorizers as 'lowercase'.

    Returns
    -------
    None. Progress and the elapsed time are printed.
    """
    start_time = time.time()

    # Fail early / create the answers folder now instead of after training.
    if outpath:
        os.makedirs(outpath, exist_ok=True)

    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []

    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])

    for index, problem in enumerate(problems):
        print(problem)
        problem_dir = path+os.sep+problem

        # Reading information about the problem
        infoproblem = problem_dir+os.sep+'problem-info.json'
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
        unk_folder = fj['unknown-folder']
        candidates = [attrib['author-name'] for attrib in fj['candidate-authors']]

        # building training set
        train_docs = []
        for candidate in candidates:
            train_docs.extend(read_files(problem_dir, candidate))

        train_texts = [text for (text, label) in train_docs]
        train_labels = [label for (text, label) in train_docs]

        # character n-gram vocabulary (syntactical features)
        vocab_char_std = extend_vocabulary(n_range, n_start, train_texts, model = 'char-std')

        # character n-gram vocabulary (alphabetical symbols are distorted)
        vocab_char_dist = extend_vocabulary(n_range, n_start, train_texts, model = 'char-dist')

        # word n-gram vocabulary (content / semantical features)
        vocab_word = extend_vocabulary(3, n_start, train_texts, model = 'word')

        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(train_texts), 'known texts')

        print('\t', 'word-based vocabulary size:', len(vocab_word))
        print('\t', 'standard character vocabulary size:', len(vocab_char_std))
        print('\t', 'non-alphabetical character vocabulary size:', len(vocab_char_dist))

        # building test set
        test_docs = read_files(problem_dir, unk_folder)
        test_texts = [text for (text, label) in test_docs]

        # NOTE(review): the vectorizers below receive the raw texts, while the
        # vocabularies were extracted from regex()-normalised texts - confirm
        # that this mismatch is intended.
        # NOTE(review): only the 'char-std' features reach the classifier; the
        # 'word' and 'char-dist' matrices are still built but currently unused.

        ## word n-gram model (captures content) ##
        scaled_train_data_word, scaled_test_data_word = _tfidf_features(
            train_texts, test_texts, n_best_factor,
            analyzer = 'word', ngram_range = (2, 3), lowercase = lower,
            vocabulary = vocab_word)

        ## char n-gram model in which alphabetical symbols are distorted ##
        # min_df / max_df are kept from the original call; scikit-learn ignores
        # them when an explicit vocabulary is supplied.
        scaled_train_data_char_dist, scaled_test_data_char_dist = _tfidf_features(
            train_texts, test_texts, n_best_factor,
            analyzer = 'char', ngram_range = (2, n_range), lowercase = lower,
            vocabulary = vocab_char_dist, min_df = 0.2, max_df = 0.8)

        ## standard char n-gram model (captures syntactical features) ##
        scaled_train_data_char_std, scaled_test_data_char_std = _tfidf_features(
            train_texts, test_texts, n_best_factor,
            analyzer = 'char', ngram_range = (2, n_range), lowercase = lower,
            vocabulary = vocab_char_std, min_df = 0.2, max_df = 0.8)

        print('\t', len(test_texts), 'unknown texts')

        # model
        clf = CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1, gamma = 'auto')))
        clf.fit(scaled_train_data_char_std, train_labels)
        # A plain Python list is used on purpose: writing '<UNK>' into the
        # fixed-width string array returned by predict() would silently
        # truncate it when all author names are shorter than five characters.
        predictions = clf.predict(scaled_test_data_char_std).tolist()
        proba = clf.predict_proba(scaled_test_data_char_std)

        # Reject option (used in open-set cases)
        count = 0
        for i in range(len(predictions)):
            sproba = sorted(proba[i], reverse=True)
            if sproba[0]-sproba[1] < pt:
                predictions[i] = u'<UNK>'
                count = count+1
        print('\t', count, 'texts left unattributed')

        # Saving output data
        # NOTE(review): file names come from a second glob over the same
        # pattern read_files() used; this relies on glob listing the folder in
        # the same order both times.
        unk_filelist = glob.glob(problem_dir+os.sep+unk_folder+os.sep+'*.txt')
        out_data = []
        for i, v in enumerate(predictions):
            out_data.append({'unknown-text': os.path.basename(unk_filelist[i]),
                             'predicted-author': v})
        with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
            json.dump(out_data, f, indent=4)
        print('\t', 'answers saved to file', 'answers-'+problem+'.json')
    print('elapsed time:', time.time() - start_time)

In [None]:
# os.path.join keeps the call portable; the previous hard-coded "\\" separators
# only resolved to the intended folders on Windows.
baseline(os.path.join(cwd, "cross-domain-authorship-attribution-train"),
         os.path.join(cwd, "answers"))

problem00001
	 language:  en
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 51528
	 standard character vocabulary size: 85703
	 non-alphabetical character vocabulary size: 2762
	 561 unknown texts




In [None]:
class RNNLanguageModel:
    """
    Recurrent language model over sequences of integer token ids.

    The model embeds the current token, feeds it to a recurrent builder and
    projects the builder output to one score per vocabulary entry
    (bias + R * output).

    NOTE(review): 'dy' is not imported anywhere in the visible notebook; it is
    presumably the DyNet package ("import dynet as dy") - confirm and add the
    import to the import cell.
    """

    def __init__(self, model, LAYERS, INPUT_DIM, HIDDEN_DIM, VOCAB_SIZE, builder = dy.SimpleRNNBuilder):
        """
        Register all parameters of the language model in 'model'.

        model      : parameter collection the parameters are added to
        LAYERS     : number of recurrent layers handed to 'builder'
        INPUT_DIM  : size of the token embeddings
        HIDDEN_DIM : size of the recurrent output
        VOCAB_SIZE : number of distinct tokens
        builder    : recurrent builder class (default dy.SimpleRNNBuilder)
        """
        # Keep the collection so load_from_disk() does not have to rely on a
        # notebook-global variable that happens to be called 'model'.
        self.model = model

        self.builder = builder(LAYERS, INPUT_DIM, HIDDEN_DIM, model)
        self.lookup = model.add_lookup_parameters((VOCAB_SIZE, INPUT_DIM), name="lookup")
        self.R = model.add_parameters((VOCAB_SIZE, HIDDEN_DIM), name="hidden2out")
        self.bias = model.add_parameters((VOCAB_SIZE), name="bias")

    def save_to_disk(self, filename):
        """Save builder, embeddings, output matrix and bias to 'filename'."""
        dy.save(filename, [self.builder, self.lookup, self.R, self.bias])

    def load_from_disk(self, filename):
        """Load builder, embeddings, output matrix and bias from 'filename'."""
        (self.builder, self.lookup, self.R, self.bias) = dy.load(filename, self.model)

    def _scores(self, state):
        """Unnormalised score per vocabulary entry for the given RNN state."""
        return self.bias + (self.R * state.output())

    def build_lm_graph(self, sent):
        """
        Build the training loss for one sentence.

        'sent' is a sequence of token ids. For every adjacent pair
        (current, next) the negative log-softmax score of 'next' is computed;
        the sum over all pairs is returned as an expression.
        """
        dy.renew_cg()
        state = self.builder.initial_state()

        errs = [] # will hold expressions
        for (cw,nw) in zip(sent,sent[1:]):
            # assume word is already a word-id
            x_t = dy.lookup(self.lookup, int(cw))
            state = state.add_input(x_t)
            err = dy.pickneglogsoftmax(self._scores(state), int(nw))
            errs.append(err)
        return dy.esum(errs)

    def predict_next_word(self, sentence):
        """
        Feed all token ids of 'sentence' to the RNN and return the softmax
        over the vocabulary for the token that follows.
        """
        dy.renew_cg()
        state = self.builder.initial_state()
        for cw in sentence:
            # assume word is already a word-id
            x_t = self.lookup[int(cw)]
            state = state.add_input(x_t)
        return dy.softmax(self._scores(state))

    def sample(self, first=1, nchars=0, stop=-1):
        """
        Generate a sequence of token ids by sampling from the model.

        first  : id of the start token (also the first element of the result)
        nchars : if non-zero, stop once the result is longer than 'nchars'
        stop   : id of the token that ends the sequence (included in the result)

        Returns the list of sampled token ids.
        """
        res = [first]
        dy.renew_cg()
        state = self.builder.initial_state()

        cw = first
        while True:
            x_t = self.lookup[cw]
            state = state.add_input(x_t)
            dist = dy.softmax(self._scores(state)).vec_value()

            # draw an index with probability proportional to dist: walk
            # through the distribution until the random mass is used up
            rnd = random.random()
            for i,p in enumerate(dist):
                rnd -= p
                if rnd <= 0: break
            res.append(i)
            cw = i
            if cw == stop: break
            if nchars and len(res) > nchars: break
        return res

In [None]:
# Train the character-level RNNLanguageModel on the corpus file, print a
# sampled sequence at regular intervals, then save the model, reload it and
# sample once more.
# NOTE(review): 'util', 'dy' and 'random' are used below but are not imported
# anywhere in the visible notebook - confirm the missing imports.
corpus = "allnames.txt"

# model hyper-parameters
LAYERS = 2
INPUT_DIM = 32 #50  #256
HIDDEN_DIM = 128 # 50  #1024

train = util.CharsCorpusReader(corpus, begin="<s>")
vocab = util.Vocab.from_corpus(train)

VOCAB_SIZE = vocab.size()

model = dy.Model()

trainer = dy.SimpleSGDTrainer(model, learning_rate=0.2)

lm = RNNLanguageModel(model, LAYERS, INPUT_DIM, HIDDEN_DIM, VOCAB_SIZE, builder=dy.SimpleRNNBuilder)
#lm = RNNLanguageModel(model, LAYERS, INPUT_DIM, HIDDEN_DIM, VOCAB_SIZE, builder=dy.LSTMBuilder)


# 'train' is rebound from the corpus reader to a list so it can be shuffled
train = list(train)

# one entry is appended per pass over the data (see the note at the append)
losses = []

chars = loss = 0.0

for ITER in range(3):
    random.shuffle(train)
    
    for i,sent in enumerate(train):
        # restarted for every sentence, so after the loop it holds the start
        # time of the last sentence only
        _start = time.time()
        
        # every 2500 sentences: report status, print the running loss per
        # character, print one sample, then reset the running counters
        if i % 2500 == 0:
            trainer.status()
            print (i,len(train))
            
            if chars > 0: print(loss / chars,)
                
            for _ in range(1):
                samp = lm.sample(first=vocab.w2i["<s>"],stop=vocab.w2i["\n"])
                print("".join([vocab.i2w[c] for c in samp]).strip())
            loss = 0.0
            chars = 0.0

        # number of (current, next) pairs that contribute to the loss
        chars += len(sent)-1
        isent = [vocab.w2i[w] for w in sent]
        errs = lm.build_lm_graph(isent)
        loss += errs.scalar_value()
        errs.backward()
        trainer.update()
    # time spent on the last sentence of this pass divided by its length
    print ("TM:",(time.time() - _start)/len(sent))
    # NOTE(review): 'loss' was last reset at the most recent multiple of 2500,
    # so the value printed and stored here covers only the sentences since that
    # reset, not the whole pass - confirm this is intended.
    print("ITER {}, loss={}".format(ITER, loss))
    losses.append(loss)
    trainer.status()

lm.save_to_disk("RNNLanguageModel.model")

# round-trip check: reload the parameters that were just saved and sample again
print("loading the saved model...")
lm.load_from_disk("RNNLanguageModel.model")
samp = lm.sample(first=vocab.w2i["<s>"],stop=vocab.w2i["\n"])
print("".join([vocab.i2w[c] for c in samp]).strip())
