In [121]:
import os
import glob
import re
import json
import argparse
import time
import codecs

import numpy as np
import pandas as pd

from pan19_cdaa_evaluator import *
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Working directory of the running kernel; the dataset and answers paths
# handed to baseline() further down are built by appending to this value.
cwd = os.getcwd()

In [3]:
def read_files(path: str, label: str):
    """
    Read every '*.txt' file located in '<path>/<label>/' and tag it with 'label'.

    Parameters
    ----------
    path : str
        Directory that contains the 'label' sub-directory.
    label : str
        Name of the sub-directory to read; it is also attached to every text
        as its class label.

    Returns
    -------
    list of (str, str)
        One (file content, label) tuple per matching file, in the order
        reported by glob.glob. An empty list if nothing matches.
    """
    files = glob.glob(path+os.sep+label+os.sep+'*.txt')
    texts = []
    for file_name in files:
        # The context manager closes the handle even when decoding fails half-way.
        # codecs.open is kept on purpose: it reads the file in binary mode, so line
        # endings reach the caller untranslated, exactly as before.
        with codecs.open(file_name, 'r', encoding='utf-8') as f:
            texts.append((f.read(), label))
    return texts

In [134]:
def regex(string: str, model: str):
    """
    Normalise a text (or a single word) with regular expressions.

    Steps, applied in this order:
      1. every ASCII digit becomes '0';
      2. each run of "space + newline" / "space + tab" pairs is deleted;
      3. a hyperlink prefix ('http:' or 'https:' followed by slashes) together
         with the host part made of letters, digits and dots becomes '@'
         (anything after the host, such as a '/path', is left in place);
      4. model == 'word': every non-alphanumeric character is dropped;
      5. model == 'char-dist': every run of ASCII letters collapses to one '*'.

    Parameters
    ----------
    string : str
        Text to normalise.
    model : str
        'word', 'char-dist', or any other value (only steps 1-3 are applied).

    Returns
    -------
    str
        The normalised text.
    """
    string = re.sub("[0-9]", "0", string) # each digit will be represented as a 0
    string = re.sub(r'( \n| \t)+', '', string)
    # The previous pattern only matched 'https:' followed by BACKSLASHES (and relied
    # on an invalid escape in a non-raw string), so ordinary URLs were never replaced.
    # Forward slashes are the normal case; backslashes are still accepted so input
    # that matched before keeps matching.
    string = re.sub(r"https?:[/\\]+([a-zA-Z0-9.]+)?", "@", string)

    if model == 'word':
        # if model is a word n-gram model, remove all punctuation
        string = ''.join([char for char in string if char.isalnum()])

    if model == 'char-dist':
        string = re.sub("[a-zA-Z]+", "*", string)

    return string

In [135]:
def frequency(tokens: list):
    """
    Tally how often each token occurs.

    Returns a plain dict that maps every distinct token to its number of
    occurrences; keys appear in order of first occurrence.
    """
    counts = {}
    for item in tokens:
        # .get() supplies 0 the first time a token is seen
        counts[item] = counts.get(item, 0) + 1
    return counts

In [136]:
def represent_text(text, n: int, model: str):
    """
    Extract all character or word 'n'-grams from 'text' and count them.

    The text is normalised by regex() first (digits, hyperlinks, and the
    model-specific steps implemented there).

    Parameters
    ----------
    text : str
        Raw document.
    n : int
        Length of the n-grams (characters for the 'char-*' models, words for 'word').
    model : str
        'char-std', 'char-dist' or 'word'.

    Returns
    -------
    dict
        n-gram -> frequency, as produced by frequency(). Empty when the
        normalised text is shorter than 'n'.

    Raises
    ------
    ValueError
        If 'model' is not one of the three supported names (this used to
        surface as an UnboundLocalError on 'tokens').
    """
    if model not in ('char-std', 'char-dist', 'word'):
        raise ValueError("unknown model %r (expected 'char-std', 'char-dist' or 'word')" % (model,))

    if model == 'word':
        # Normalise each word exactly once and drop words that end up empty
        # (e.g. pure punctuation).
        words = [cleaned for cleaned in (regex(word, model) for word in text.split()) if cleaned]
        tokens = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]
    else:
        text = regex(text, model)
        tokens = [text[i:i+n] for i in range(len(text)-n+1)]

        if model == 'char-std' and n == 2:
            # the bigram pass of the standard character model additionally
            # contributes every non-alphanumeric character as a unigram
            tokens.extend(char for char in text if not char.isalnum())

    return frequency(tokens)

In [137]:
def extract_vocabulary(texts: list, n: int, ft: int, model: str):
    """
    Collect the 'n'-grams (character or word, depending on 'model') whose total
    count over all 'texts' is at least 'ft'.

    Counts come from represent_text() and are summed across documents. The
    returned list keeps the order in which the n-grams were first encountered.
    """
    totals = {}
    for document in texts:
        for ngram, count in represent_text(document, n, model).items():
            totals[ngram] = totals.get(ngram, 0) + count

    # frequency threshold is inclusive
    return [ngram for ngram, total in totals.items() if total >= ft]

In [138]:
def extend_vocabulary(n_range: tuple, texts: list, model: str):
    """
    Build one vocabulary covering every n-gram length in 'n_range' (inclusive).

    For each length n the frequency threshold passed to extract_vocabulary()
    is (n_end - n) + 1, i.e. the longest n-grams need a single occurrence and
    every step towards shorter n-grams raises the threshold by one. The
    per-length vocabularies are concatenated from shortest to longest.
    """
    lowest, highest = n_range
    combined = []
    for size in range(lowest, highest + 1):
        min_count = (highest - size) + 1
        combined += extract_vocabulary(texts, size, min_count, model)
    return combined

In [139]:
def _read_unknown_texts(folder):
    """Return (file_paths, texts) for every '*.txt' file in 'folder', read as UTF-8 from ONE directory listing."""
    file_paths = glob.glob(folder+os.sep+'*.txt')
    texts = []
    for file_path in file_paths:
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
    return file_paths, texts


def _tfidf_features(train_texts, test_texts, analyzer, ngram_range, vocabulary, lower, n_best_factor,
                    **vectorizer_options):
    """Fit a fixed-vocabulary tf-idf model on the training texts and keep the lowest-idf share of its columns."""
    vectorizer = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range, use_idf = True,
                                 norm = 'l2', lowercase = lower, vocabulary = vocabulary,
                                 smooth_idf = True, sublinear_tf = True, **vectorizer_options)

    train_data = vectorizer.fit_transform(train_texts).toarray()

    # argsort is ascending, so the retained columns are the terms with the
    # smallest idf weights
    n_best = int(len(vectorizer.idf_) * n_best_factor)
    keep = np.argsort(vectorizer.idf_)[:n_best]

    test_data = vectorizer.transform(test_texts).toarray()
    return train_data[:, keep], test_data[:, keep]


def _class_probabilities(train_data, test_data, train_labels, use_LSA):
    """Scale (and optionally SVD-reduce) one feature view, fit a calibrated linear SVM, return (classes_, test probabilities)."""
    scaler = preprocessing.MaxAbsScaler()
    train_scaled = scaler.fit_transform(train_data)
    test_scaled = scaler.transform(test_data)

    if use_LSA:
        # NOTE(review): 63 is hard-coded; it equals the number of known texts per
        # problem in the recorded run - confirm whether that is the intent.
        svd = TruncatedSVD(n_components = 63, algorithm = 'randomized', random_state = 42)
        train_scaled = svd.fit_transform(train_scaled)
        test_scaled = svd.transform(test_scaled)

    clf = CalibratedClassifierCV(OneVsRestClassifier(SVC(C = 1, kernel = 'linear',
                                                         gamma = 'auto')))
    clf.fit(train_scaled, train_labels)
    return clf.classes_, clf.predict_proba(test_scaled)


def baseline(path, outpath, word_range: tuple, dist_range: tuple, char_range: tuple, pt = 0.1, n_best_factor = 0.5, 
             lower = False, use_LSA = False):
    """
    Attribute the unknown texts of every problem in a collection and write one
    'answers-<problem>.json' file per problem into 'outpath'.

    For each problem three feature views are built (word n-grams, "distorted"
    character n-grams, standard character n-grams). Each view is tf-idf
    vectorised with a vocabulary from extend_vocabulary(), reduced to its
    lowest-idf columns, scaled, optionally SVD-reduced and classified with a
    calibrated one-vs-rest linear SVM. The three probability matrices are
    averaged (soft voting) and a reject option replaces uncertain predictions
    by '<UNK>'.

    Parameters
    ----------
    path : str
        Collection directory containing 'collection-info.json' and one
        sub-directory per problem (each with a 'problem-info.json').
    outpath : str
        Directory receiving the answer files; created if it does not exist.
    word_range, dist_range, char_range : tuple
        Inclusive (n_min, n_max) n-gram ranges of the three views.
    pt : float
        A text is left unattributed when the gap between its two highest
        averaged probabilities is below this value.
    n_best_factor : float
        Share of each vocabulary that is retained after idf ranking.
    lower : bool
        Forwarded to TfidfVectorizer(lowercase=...).
    use_LSA : bool
        Apply TruncatedSVD to every view before classification.

    Returns
    -------
    None
    """
    start_time = time.time()

    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []

    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])

    # Avoid failing only at the very end of the first problem because the
    # output directory is missing.
    os.makedirs(outpath, exist_ok = True)

    for index, problem in enumerate(problems):
        print(problem)
        # Reading information about the problem
        infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])

        # building training set
        train_docs = []
        for candidate in candidates:
            train_docs.extend(read_files(path+os.sep+problem,candidate))

        train_texts = [text for (text,label) in train_docs]
        train_labels = [label for (text,label) in train_docs]

        # NOTE(review): the vocabularies below are built from texts normalised by
        # regex() (through extend_vocabulary), whereas the tf-idf vectorisers
        # analyse the RAW texts with scikit-learn's built-in analysers. Vocabulary
        # entries that only exist after normalisation (e.g. containing '*' or '@')
        # may therefore never match - confirm this is intended.

        # word n-gram vocabulary (content / semantical features)
        vocab_word = extend_vocabulary(word_range, train_texts, model = 'word')

        # character n-gram vocabulary (alphabetical symbols are distorted)
        vocab_char_dist = extend_vocabulary(dist_range, train_texts, model = 'char-dist')

        # character n-gram vocabulary (syntactical features)
        vocab_char_std = extend_vocabulary(char_range, train_texts, model = 'char-std')

        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(train_texts), 'known texts')

        print('\t', 'word-based vocabulary size:', len(vocab_word))
        print('\t', 'standard character vocabulary size:', len(vocab_char_std))
        print('\t', 'non-alphabetical character vocabulary size:', len(vocab_char_dist))

        # building test set: file names and contents come from the SAME listing,
        # so answer i is guaranteed to belong to file i (previously the folder was
        # globbed a second time and the two listings were assumed to align)
        unk_filelist, test_texts = _read_unknown_texts(path+os.sep+problem+os.sep+unk_folder)

        # NOTE(review): per the scikit-learn documentation min_df / max_df are
        # ignored when a fixed vocabulary is supplied; they are forwarded only to
        # mirror the original configuration.
        char_options = {'min_df': 0.1, 'max_df': 0.8}
        view_configs = [
            ('word', word_range, vocab_word, {}),
            ('char', dist_range, vocab_char_dist, char_options),
            ('char', char_range, vocab_char_std, char_options),
        ]
        views = [_tfidf_features(train_texts, test_texts, analyzer, ngram_range, vocabulary,
                                 lower, n_best_factor, **options)
                 for analyzer, ngram_range, vocabulary, options in view_configs]

        print('\t', len(test_texts), 'unknown texts')

        classes = None
        probas = []
        for train_data, test_data in views:
            # every view is trained on the same labels, so 'classes' is identical
            # for all three classifiers
            classes, view_probas = _class_probabilities(train_data, test_data, train_labels, use_LSA)
            probas.append(view_probas)

        # Soft Voting procedure (combines the votes of the three individual classifier)
        avg_probas = np.average(probas, axis = 0)

        # Prediction + reject option (used in open-set cases)
        avg_predictions = []
        count = 0
        for text_probs in avg_probas:
            sproba = sorted(text_probs, reverse=True)
            if sproba[0]-sproba[1] < pt or sproba[0] < 0.25:
                avg_predictions.append(u'<UNK>')
                count += 1
            else:
                # predict_proba columns follow the classifier's classes_, which is
                # not necessarily the order of 'candidates' in problem-info.json
                avg_predictions.append(str(classes[np.argmax(text_probs)]))
        print('\t',count,'texts left unattributed')

        # Saving output data
        out_data = [{'unknown-text': os.path.basename(file_path), 'predicted-author': author}
                    for file_path, author in zip(unk_filelist, avg_predictions)]

        with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
            json.dump(out_data, f, indent=4)
        print('\t', 'answers saved to file','answers-'+problem+'.json')

    print('elapsed time:', time.time() - start_time)

In [140]:
# Run the attribution pipeline on the training collection. Paths are assembled
# with os.path.join so the cell also works where the separator is not '\\'.
baseline(os.path.join(cwd, "cross-domain-authorship-attribution-train"), os.path.join(cwd, "answers"),
         word_range = (1,3), dist_range = (1,3), char_range = (2,5), use_LSA = True)

problem00001
	 language:  en
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 53667
	 standard character vocabulary size: 85703
	 non-alphabetical character vocabulary size: 659
	 561 unknown texts




	 104 texts left unattributed
	 answers saved to file answers-problem00001.json
problem00002
	 language:  en
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 54699
	 standard character vocabulary size: 84546
	 non-alphabetical character vocabulary size: 637
	 137 unknown texts




	 60 texts left unattributed
	 answers saved to file answers-problem00002.json
problem00003
	 language:  en
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 52153
	 standard character vocabulary size: 83414
	 non-alphabetical character vocabulary size: 712
	 211 unknown texts




	 105 texts left unattributed
	 answers saved to file answers-problem00003.json
problem00004
	 language:  en
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 52832
	 standard character vocabulary size: 87809
	 non-alphabetical character vocabulary size: 721
	 273 unknown texts




	 158 texts left unattributed
	 answers saved to file answers-problem00004.json
problem00005
	 language:  en
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 53950
	 standard character vocabulary size: 82383
	 non-alphabetical character vocabulary size: 623
	 264 unknown texts




	 109 texts left unattributed
	 answers saved to file answers-problem00005.json
problem00006
	 language:  fr
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 51022
	 standard character vocabulary size: 81084
	 non-alphabetical character vocabulary size: 969
	 121 unknown texts




	 32 texts left unattributed
	 answers saved to file answers-problem00006.json
problem00007
	 language:  fr
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 51210
	 standard character vocabulary size: 81740
	 non-alphabetical character vocabulary size: 1107
	 92 unknown texts




	 33 texts left unattributed
	 answers saved to file answers-problem00007.json
problem00008
	 language:  fr
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 50152
	 standard character vocabulary size: 85185
	 non-alphabetical character vocabulary size: 1162
	 430 unknown texts




	 180 texts left unattributed
	 answers saved to file answers-problem00008.json
problem00009
	 language:  fr
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 50307
	 standard character vocabulary size: 81966
	 non-alphabetical character vocabulary size: 960
	 239 unknown texts




	 143 texts left unattributed
	 answers saved to file answers-problem00009.json
problem00010
	 language:  fr
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 50154
	 standard character vocabulary size: 83877
	 non-alphabetical character vocabulary size: 1032
	 38 unknown texts




	 22 texts left unattributed
	 answers saved to file answers-problem00010.json
problem00011
	 language:  it
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 55050
	 standard character vocabulary size: 86177
	 non-alphabetical character vocabulary size: 1116
	 139 unknown texts




	 63 texts left unattributed
	 answers saved to file answers-problem00011.json
problem00012
	 language:  it
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 54715
	 standard character vocabulary size: 83815
	 non-alphabetical character vocabulary size: 937
	 116 unknown texts




	 39 texts left unattributed
	 answers saved to file answers-problem00012.json
problem00013
	 language:  it
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 54103
	 standard character vocabulary size: 78772
	 non-alphabetical character vocabulary size: 967
	 196 unknown texts




	 70 texts left unattributed
	 answers saved to file answers-problem00013.json
problem00014
	 language:  it
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 54987
	 standard character vocabulary size: 83066
	 non-alphabetical character vocabulary size: 1073
	 46 unknown texts




	 15 texts left unattributed
	 answers saved to file answers-problem00014.json
problem00015
	 language:  it
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 55003
	 standard character vocabulary size: 85154
	 non-alphabetical character vocabulary size: 1058
	 54 unknown texts




	 25 texts left unattributed
	 answers saved to file answers-problem00015.json
problem00016
	 language:  sp
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 57325
	 standard character vocabulary size: 84536
	 non-alphabetical character vocabulary size: 1076
	 164 unknown texts




	 38 texts left unattributed
	 answers saved to file answers-problem00016.json
problem00017
	 language:  sp
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 56881
	 standard character vocabulary size: 88031
	 non-alphabetical character vocabulary size: 1100
	 112 unknown texts




	 50 texts left unattributed
	 answers saved to file answers-problem00017.json
problem00018
	 language:  sp
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 56419
	 standard character vocabulary size: 82861
	 non-alphabetical character vocabulary size: 1144
	 238 unknown texts




	 77 texts left unattributed
	 answers saved to file answers-problem00018.json
problem00019
	 language:  sp
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 55713
	 standard character vocabulary size: 82986
	 non-alphabetical character vocabulary size: 996
	 450 unknown texts




	 203 texts left unattributed
	 answers saved to file answers-problem00019.json
problem00020
	 language:  sp
	 9 candidate authors
	 63 known texts
	 word-based vocabulary size: 55811
	 standard character vocabulary size: 81896
	 non-alphabetical character vocabulary size: 1105
	 170 unknown texts




	 113 texts left unattributed
	 answers saved to file answers-problem00020.json
elapsed time: 203.40237283706665




In [141]:
# Score the answer files. evaluate_all is not defined in this notebook; it
# presumably comes from the wildcard import of pan19_cdaa_evaluator, and the
# arguments look like (collection dir, answers dir, output dir) - verify
# against that module. All three paths are relative to the kernel's working
# directory.
evaluate_all("cross-domain-authorship-attribution-train", "answers", "evaluation")

problem00001 Macro-F1: 0.857
problem00002 Macro-F1: 0.56
problem00003 Macro-F1: 0.719
problem00004 Macro-F1: 0.559
problem00005 Macro-F1: 0.582
problem00006 Macro-F1: 0.793
problem00007 Macro-F1: 0.622
problem00008 Macro-F1: 0.643
problem00009 Macro-F1: 0.732
problem00010 Macro-F1: 0.621
problem00011 Macro-F1: 0.757
problem00012 Macro-F1: 0.683
problem00013 Macro-F1: 0.784
problem00014 Macro-F1: 0.873
problem00015 Macro-F1: 0.775
problem00016 Macro-F1: 0.83
problem00017 Macro-F1: 0.723
problem00018 Macro-F1: 0.842
problem00019 Macro-F1: 0.683
problem00020 Macro-F1: 0.453
Overall score: 0.705
