In [2]:
import os
import glob
import re
import json
import argparse
import time
import codecs

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from collections import defaultdict
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords

In [7]:
# get current directory
cwd = os.getcwd()

In [8]:
def read_files(path: str, label: str):
    # Reads all text files located in the 'path' and assigns them to 'label' class
    files = glob.glob(path+os.sep+label+os.sep+'*.txt')
    texts=[]
    for i,v in enumerate(files):
        f=codecs.open(v,'r',encoding='utf-8')
        texts.append((f.read(),label))
        f.close()
    return texts

In [9]:
def represent_text(text, n: int):
    """
    Extracts all character 'n'-grams from a given 'text'.
    Each digit is represented as a hashtag symbol (#) which in general denotes any number.
    Each hyperlink is replaced by an @ sign.
    The latter steps are computed through regular expressions.
    """    
    if n > 0:
        text = re.sub("[0-9]", "0", text) # each digit will be represented as a 0
        text = re.sub(r'( \n| \t)+', '', text)
        #text = re.sub("[0-9]+(([.,^])[0-9]+)?", "#", text)
        text = re.sub("https:\\\+([a-zA-Z0-9.]+)?", "@", text)
        tokens = [text[i:i+n] for i in range(len(text)-n+1)]
        
        if n == 2:
            # create list of unigrams that only consists of punctuation marks
            punct_unigrams = [token for token in text if token.isalnum() == False]
            tokens.extend(punct_unigrams)
            
    # create frequency text representation (keys are tokens, values are their corresponding frequencies)
    frequency = {token: tokens.count(token) for token in list(set(tokens))}
        
    return frequency

In [10]:
def extract_vocabulary(texts, n: int, ft = 3):
    
    # Extracts all characer 'n'-grams occurring at least 'ft' times in a set of 'texts'
    occurrences=defaultdict(int)
    
    for text in texts:
        
        text_occurrences=represent_text(text,n)
        
        for ngram in text_occurrences.keys():
            
            if ngram in occurrences:
                occurrences[ngram] += text_occurrences[ngram]
            else:
                occurrences[ngram] = text_occurrences[ngram]
    
    vocabulary=[]
    for i in occurrences.keys():
        if occurrences[i] >= ft:
            vocabulary.append(i)
            
    return vocabulary

In [18]:
def baseline(path, outpath, n_range = 4, pt = 0.1, lower = False, words = False):
    
    start_time = time.time()
    
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []
    
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])
            
    stopWords = list(set(stopwords.words('english')))
    
    for index, problem in enumerate(problems):
        print(problem)
        # Reading information about the problem
        infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])
                
        # Building training set
        train_docs = []
        for candidate in candidates:
            train_docs.extend(read_files(path+os.sep+problem,candidate))
            
        train_texts = [text for i,(text,label) in enumerate(train_docs)]
        train_labels = [label for i,(text,label) in enumerate(train_docs)]
        
        vocab = []
        for n in range(2, n_range + 1):
            n_vocab = extract_vocabulary(train_texts, n, (n_range - n) + 1)
            vocab.extend(n_vocab)
        
        # initialize tf-idf-vectorizer
        vectorizer = TfidfVectorizer(analyzer = 'char', ngram_range = (2, n_range), use_idf = True, norm = 'l2', lowercase = lower, vocabulary = vocab, min_df = 0.2, max_df = 0.8)
        train_data_char = vectorizer.fit_transform(train_texts).toarray()
        
        print(train_data_char.shape)
    
        indexes = np.argsort(vectorizer.idf_)[:12000]
        train_data_char = train_data_char[:, indexes]
        
        print(train_data_char.shape)
        
        # initialize truncated singular value decomposition
        # svd = TruncatedSVD(n_components = 50, algorithm = 'randomized', random_state = 42)
        # train_data_lsa = svd.fit_transform(train_data)
        
        # print(train_data_lsa.shape)
        
        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(train_texts), 'known texts')
        print('\t', 'vocabulary size:', len(vocab))
        
        # Building test set
        test_docs = read_files(path+os.sep+problem,unk_folder)
        test_texts = [text for i,(text,label) in enumerate(test_docs)]
        
        test_data_char = vectorizer.transform(test_texts).toarray()
        test_data_char = test_data_char[:, indexes]
        
        print(test_data_char.shape)
        
        if words == True:
            vect = CountVectorizer(analyzer = 'word', ngram_range = (1, 1), lowercase = lower, vocabulary = stopWords)
            train_data_word = vect.fit_transform(train_texts).toarray()
            test_data_word = vect.transform(test_texts).toarray()
            train_data, test_data = np.hstack((train_data_char, train_data_word)), np.hstack((test_data_char, test_data_word))
        
        else:
            train_data, test_data = train_data_char, test_data_char
            
        print(train_data.shape)
        print()
        print(test_data.shape)
        
        #test_data_lsa = svd.transform(test_data)
        
        # Applying SVM
        max_abs_scaler = preprocessing.MaxAbsScaler()
        scaled_train_data = max_abs_scaler.fit_transform(train_data) #train_data_lsa
        scaled_test_data = max_abs_scaler.transform(test_data) #train_data_lsa
        
        # model 
        clf=CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1, gamma = 'auto')))
        clf.fit(scaled_train_data, train_labels)
        predictions=clf.predict(scaled_test_data)
        proba=clf.predict_proba(scaled_test_data)
        
        # Reject option (used in open-set cases)
        count=0
        for i,p in enumerate(predictions):
            sproba=sorted(proba[i],reverse=True)
            if sproba[0]-sproba[1]<pt:
                predictions[i]=u'<UNK>'
                count=count+1
        print('\t',count,'texts left unattributed')
        
        # Saving output data
        out_data=[]
        unk_filelist = glob.glob(path+os.sep+problem+os.sep+unk_folder+os.sep+'*.txt')
        pathlen=len(path+os.sep+problem+os.sep+unk_folder+os.sep)
        for i,v in enumerate(predictions):
            out_data.append({'unknown-text': unk_filelist[i][pathlen:], 'predicted-author': v})
        with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
            json.dump(out_data, f, indent=4)
        print('\t', 'answers saved to file','answers-'+problem+'.json')
    print('elapsed time:', time.time() - start_time)

In [19]:
baseline(cwd + "\\cross-domain-authorship-attribution-train", cwd + '\\answers', words = True)

problem00001
(63, 35317)
(63, 12000)
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 35317
(561, 12000)
(63, 12179)

(561, 12179)




	 105 texts left unattributed
	 answers saved to file answers-problem00001.json
problem00002
(63, 34454)
(63, 12000)
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 34454
(137, 12000)
(63, 12179)

(137, 12179)




	 57 texts left unattributed
	 answers saved to file answers-problem00002.json
problem00003
(63, 34203)
(63, 12000)
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 34203
(211, 12000)
(63, 12179)

(211, 12179)




	 105 texts left unattributed
	 answers saved to file answers-problem00003.json
problem00004
(63, 36655)
(63, 12000)
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 36655
(273, 12000)
(63, 12179)

(273, 12179)




	 144 texts left unattributed
	 answers saved to file answers-problem00004.json
problem00005
(63, 33178)
(63, 12000)
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 33178
(264, 12000)
(63, 12179)

(264, 12179)




	 88 texts left unattributed
	 answers saved to file answers-problem00005.json
elapsed time: 204.1756820678711


In [30]:
class RNNLanguageModel:
    def __init__(self, model, LAYERS, INPUT_DIM, HIDDEN_DIM, VOCAB_SIZE, builder = dy.SimpleRNNBuilder):
        
        self.builder = builder(LAYERS, INPUT_DIM, HIDDEN_DIM, model)
        self.lookup = model.add_lookup_parameters((VOCAB_SIZE, INPUT_DIM), name="lookup")
        self.R = model.add_parameters((VOCAB_SIZE, HIDDEN_DIM), name="hidden2out")
        self.bias = model.add_parameters((VOCAB_SIZE), name="bias")

    def save_to_disk(self, filename):
        dy.save(filename, [self.builder, self.lookup, self.R, self.bias])

    def load_from_disk(self, filename):
        (self.builder, self.lookup, self.R, self.bias) = dy.load(filename, model)
        
    def build_lm_graph(self, sent):
        dy.renew_cg()
        init_state = self.builder.initial_state()

        errs = [] # will hold expressions
        es=[]
        state = init_state
        for (cw,nw) in zip(sent,sent[1:]):
            # assume word is already a word-id
            x_t = dy.lookup(self.lookup, int(cw))
            state = state.add_input(x_t)
            y_t = state.output()
            r_t = self.bias + (self.R * y_t)
            err = dy.pickneglogsoftmax(r_t, int(nw))
            errs.append(err)
        nerr = dy.esum(errs)
        return nerr
    
    def predict_next_word(self, sentence):
        dy.renew_cg()
        init_state = self.builder.initial_state()
        state = init_state
        for cw in sentence:
            # assume word is already a word-id
            x_t = self.lookup[int(cw)]
            state = state.add_input(x_t)
        y_t = state.output()
        r_t = self.bias + (self.R * y_t)
        prob = dy.softmax(r_t)
        return prob
        
    def sample(self, first=1, nchars=0, stop=-1):
        res = [first]
        dy.renew_cg()
        state = self.builder.initial_state()

        cw = first
        while True:
            x_t = self.lookup[cw]
            state = state.add_input(x_t)
            y_t = state.output()
            r_t = self.bias + (self.R * y_t)
            ydist = dy.softmax(r_t)
            dist = ydist.vec_value()
            rnd = random.random()
            for i,p in enumerate(dist):
                rnd -= p
                if rnd <= 0: break
            res.append(i)
            cw = i
            if cw == stop: break
            if nchars and len(res) > nchars: break
        return res

In [31]:
corpus = "allnames.txt"

LAYERS = 2
INPUT_DIM = 32 #50  #256
HIDDEN_DIM = 128 # 50  #1024

train = util.CharsCorpusReader(corpus, begin="<s>")
vocab = util.Vocab.from_corpus(train)

VOCAB_SIZE = vocab.size()

model = dy.Model()

trainer = dy.SimpleSGDTrainer(model, learning_rate=0.2)

lm = RNNLanguageModel(model, LAYERS, INPUT_DIM, HIDDEN_DIM, VOCAB_SIZE, builder=dy.SimpleRNNBuilder)
#lm = RNNLanguageModel(model, LAYERS, INPUT_DIM, HIDDEN_DIM, VOCAB_SIZE, builder=dy.LSTMBuilder)


train = list(train)

losses = []

chars = loss = 0.0

for ITER in range(3):
    random.shuffle(train)
    
    for i,sent in enumerate(train):
        _start = time.time()
        
        if i % 2500 == 0:
            trainer.status()
            print (i,len(train))
            
            if chars > 0: print(loss / chars,)
                
            for _ in range(1):
                samp = lm.sample(first=vocab.w2i["<s>"],stop=vocab.w2i["\n"])
                print("".join([vocab.i2w[c] for c in samp]).strip())
            loss = 0.0
            chars = 0.0

        chars += len(sent)-1
        isent = [vocab.w2i[w] for w in sent]
        errs = lm.build_lm_graph(isent)
        loss += errs.scalar_value()
        errs.backward()
        trainer.update()
    print ("TM:",(time.time() - _start)/len(sent))
    print("ITER {}, loss={}".format(ITER, loss))
    losses.append(loss)
    trainer.status()

lm.save_to_disk("RNNLanguageModel.model")

print("loading the saved model...")
lm.load_from_disk("RNNLanguageModel.model")
samp = lm.sample(first=vocab.w2i["<s>"],stop=vocab.w2i["\n"])
print("".join([vocab.i2w[c] for c in samp]).strip())


0 24158
<s>żzpVpúsãúIvżf ó
2500 24158
3.312249622638672
<s>Agüaer
5000 24158
3.3096209765548243
<s>Collin
7500 24158
3.315030470705376
<s>Sode
10000 24158
3.3302472654334774
<s>Araikkeskayy
12500 24158
3.341012694952305
<s>Charo
15000 24158
3.3434368001072747
<s>Elusi
17500 24158
3.341647548855079
<s>Cooper
20000 24158
3.3617703706721214
<s>At
22500 24158
3.3823963349407498
<s>Dejchenef
TM: 6.840626398722331e-05
ITER 0, loss=44670.92741680145
0 24158
3.3324078639911563
<s>Balanov
2500 24158
3.363216893229123
<s>Adniani
5000 24158
3.371980296670738
<s>Sasgol
7500 24158
3.3632297635702564
<s>Vanara
10000 24158
3.3505029714091425
<s>Jilyakov
12500 24158
3.361541686939542
<s>Gurin
15000 24158
3.3866671085333886
<s>Chena
17500 24158
3.362624272731542
<s>Eosion
20000 24158
3.3636006569254717
<s>Mis
22500 24158
3.3638998851674833
<s>Jonsi
TM: 6.514787673950195e-05
ITER 1, loss=45200.18455505371
0 24158
3.371889933237875
<s>Sapinh
2500 24158
3.365280220255356
<s>Roriechietu
5000 24158
3.359764