In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import codecs
import glob
import json
import os
from collections import defaultdict
from string import punctuation

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from tqdm import tqdm
eng_stopwords = set(stopwords.words("english"))

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [4]:
def represent_text(text,n):
    """Count every character n-gram of a text.

    :param text: String to slide the window over
    :param n: Length of the n-grams; a non-positive value yields no n-grams
    :return: defaultdict(int) mapping each n-gram to its number of occurrences
        (empty when n <= 0 or the text is shorter than n)
    """
    frequency = defaultdict(int)
    # Guard the window loop so a non-positive n returns an empty count table
    # instead of failing on an undefined token list.
    if n > 0:
        for start in range(len(text) - n + 1):
            frequency[text[start:start + n]] += 1
    return frequency

In [5]:
def read_files(path,label):
    """Read all '*.txt' files of the folder `path/label` and tag them with `label`.

    :param path: Directory that contains one sub-folder per class
    :param label: Name of the sub-folder; also used as the class of every text
    :return: List of (text, label) tuples, ordered by file name so that the
        result does not depend on the file system's listing order
    """
    pattern = os.path.join(path, label, '*.txt')
    texts = []
    for filename in sorted(glob.glob(pattern)):
        # The context manager closes the file even if decoding fails
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            texts.append((f.read(), label))
    return texts

In [6]:
def extract_vocabulary(texts,n,ft):
    """Collect the character n-grams that occur at least `ft` times in `texts`.

    :param texts: Iterable of (text, label) pairs; the label is ignored
    :param n: Length of the character n-grams
    :param ft: Minimum total number of occurrences over all texts
    :return: List of the n-grams that reach the threshold, in first-seen order
    """
    totals = defaultdict(int)
    for document, _label in texts:
        # Add the per-document counts to the running totals
        for gram, count in represent_text(document, n).items():
            totals[gram] += count
    return [gram for gram, total in totals.items() if total >= ft]

In [7]:
# Root folder of the PAN18 cross-domain authorship attribution training dataset;
# collection-info.json and the per-problem folders are read relative to it.
path = '../input/pan18crossdomainauthorshipattribution/pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02/pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02'

In [8]:
# Load every problem of the collection: for each one gather the known texts,
# their author labels and the texts of unknown authorship.
infocollection = path+os.sep+'collection-info.json'
all_train_texts = []
all_labels = []
all_test_texts = []
with open(infocollection, 'r') as f:
    problems = [attrib['problem-name'] for attrib in json.load(f)]

for problem in problems:
    problem_dir = path+os.sep+problem
    # Reading information about the problem
    infoproblem = problem_dir+os.sep+'problem-info.json'
    with open(infoproblem, 'r') as f:
        fj = json.load(f)
    unk_folder = fj['unknown-folder']
    candidates = [attrib['author-name'] for attrib in fj['candidate-authors']]

    # Building training set: one folder of known texts per candidate author
    train_docs = []
    for candidate in candidates:
        train_docs.extend(read_files(problem_dir, candidate))
    train_texts = [text for text, _ in train_docs]
    train_labels = [label for _, label in train_docs]

    # Texts whose author has to be predicted
    test_docs = read_files(problem_dir, unk_folder)
    test_texts = [text for text, _ in test_docs]

    all_train_texts.append(train_texts)
    all_labels.append(train_labels)
    all_test_texts.append(test_texts)

In [9]:
# Work on the first problem of the collection only: its known and unknown texts
train1 = all_train_texts[0]
test1 = all_test_texts[0]

In [10]:
# Encode the author names of the first problem as integer class ids
lab_en = preprocessing.LabelEncoder()
labels1 = lab_en.fit_transform(all_labels[0])

In [11]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array-like with the actual target classes: either a 1-D
        sequence of integer class indices or an already one-hot encoded
        (n_samples, n_classes) matrix
    :param predicted: Array-like (n_samples, n_classes) with class
        predictions, one probability per class
    :param eps: Probabilities are clipped to [eps, 1 - eps] before the
        logarithm is taken, so a predicted 0 never produces -inf
    :return: Negative log-likelihood averaged over the samples
    """
    # Accept plain lists as well as numpy arrays
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Convert 'actual' to a binary (one-hot) array if it's not already
    if actual.ndim == 1:
        onehot = np.zeros((actual.shape[0], predicted.shape[1]))
        onehot[np.arange(actual.shape[0]), actual.astype(int)] = 1
        actual = onehot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -1.0 / rows * np.sum(actual * np.log(clipped))

Meta Features:

In [12]:
# Quick check on the first known text: mean number of whitespace-separated
# tokens per sentence (the same statistic the extractor below computes per document)
sent_len = []
sent_len.append(np.mean(list(map(
            lambda x: len(x.split()), sent_tokenize(train1[0])))))

In [14]:
def _mean_or_zero(values):
    """Return the mean of `values` as a float, or 0.0 when there are none."""
    values = list(values)
    return float(np.mean(values)) if values else 0.0


def meta_features_extractort(corpus):
    """Compute eight stylometric meta features for every document of `corpus`.

    Columns of the result, in order:
      0. average sentence length in whitespace-separated tokens
      1. average token length in characters
      2. number of tokens
      3. number of distinct tokens that are not punctuation
      4. average number of punctuation characters per sentence
      5. average number of title-cased words per sentence
      6. average number of English stop words per sentence
      7. average number of fully upper-case words per sentence

    Features 5-7 are counted over the words of each sentence; iterating the
    sentence string directly would count single characters instead.

    :param corpus: Iterable of document strings
    :return: Float array of shape (n_documents, 8); an empty corpus gives a
        (0, 8) array and an empty document gives 0.0 for the averages
    """
    rows = []
    for paragraph in corpus:
        # Tokenise once per document and reuse the result for all features
        sentences = sent_tokenize(paragraph)
        words = word_tokenize(paragraph)
        sentence_words = [word_tokenize(sentence) for sentence in sentences]

        rows.append([
            ## average length of sentences
            _mean_or_zero(len(sentence.split()) for sentence in sentences),
            ## average length of words
            _mean_or_zero(len(str(w)) for w in words),
            ## number of words
            len(words),
            ## number of distinct words (punctuation tokens excluded)
            len([w for w in set(words) if w not in punctuation]),
            ## average number of punctuation characters in a sentence
            _mean_or_zero(
                sum(ch in punctuation for ch in sentence) for sentence in sentences),
            ## average number of title-cased words in a sentence
            _mean_or_zero(
                sum(w.istitle() for w in tokens) for tokens in sentence_words),
            ## average number of stop words in a sentence (case-insensitive)
            _mean_or_zero(
                sum(w.lower() in eng_stopwords for w in tokens) for tokens in sentence_words),
            ## average number of upper-case words in a sentence
            _mean_or_zero(
                sum(w.isupper() for w in tokens) for tokens in sentence_words),
        ])

    # Built once after the loop; reshape keeps the (n, 8) layout for an empty corpus
    return np.array(rows, dtype=float).reshape(len(rows), 8)

In [28]:
# Meta-feature matrices (one row per document) for the known and the unknown texts
meta_train = meta_features_extractort(train1)
meta_testX = meta_features_extractort(test1)

In [16]:
# Stratified 70/30 train/validation split of the meta features with a fixed seed;
# also defines the label vectors ytrain / yvalid
meta_trainX, meta_validX, ytrain, yvalid = train_test_split(meta_train, labels1, 
                                                  stratify=labels1, 
                                                  random_state=42, 
                                                  test_size=0.3, shuffle=True)

Text Based Features :

In [17]:
# Split the raw texts with the same parameters (stratified, seed 42, 30% validation);
# ytrain / yvalid are re-assigned here
xtrain, xvalid, ytrain, yvalid = train_test_split(train1, labels1, 
                                                  stratify=labels1, 
                                                  random_state=42, 
                                                  test_size=0.3, shuffle=True)

In [27]:
# Word-level TF-IDF over uni- to tri-grams; a term must occur in at least 3 documents.
# The on/off options are passed as real booleans, which is the type scikit-learn
# documents for them (integers are rejected by its parameter validation in recent versions).
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')
# Vocabulary and IDF weights are learned on the known texts and reused for the unknown texts
all_tfidf = tfv.fit_transform(train1)
tfv_testX = tfv.transform(test1)

In [26]:
# Raw term counts over word uni- to tri-grams with English stop words removed;
# fitted on the known texts and applied to the unknown texts
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

all_ctv = ctv.fit_transform(train1)
ctv_testX = ctv.transform(test1)

Word embeddings: this section needs the pre-trained GloVe vectors (840B tokens, 300 dimensions). Download them from https://nlp.stanford.edu/projects/glove/ or search for "GloVe 840B 300d" on Kaggle.

In [23]:
# Load the pre-trained GloVe vectors into a {token: float32 vector} lookup
EMBEDDING_DIM = 300
embeddings_index = {}
# The context manager closes the file even if parsing a line fails
with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        # Split from the right: the last EMBEDDING_DIM fields are the coefficients and
        # everything before them is the token, which may itself contain spaces.
        # Keeping the token intact avoids merging it with a different token.
        parts = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(parts) != EMBEDDING_DIM + 1:
            continue  # blank or truncated line
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

2196018it [02:50, 12846.41it/s]

Found 2195893 word vectors.





In [25]:
# Each article is represented by a single vector
def sent2vec(corpus, dim=300):
    """Turn a whole text into one L2-normalised embedding vector.

    The text is lower-cased and tokenised; stop words, non-alphabetic tokens
    and tokens without an entry in `embeddings_index` are dropped. The vectors
    of the remaining tokens are summed and scaled to unit length.

    :param corpus: The text to embed (converted with str())
    :param dim: Dimensionality of the embeddings, used for the all-zero
        fallback vector
    :return: float32 vector; all zeros when no token has an embedding, so that
        every document yields a vector of the same shape
    """
    words = word_tokenize(str(corpus).lower())
    vectors = [
        embeddings_index[w]
        for w in words
        # isalpha() must be called; the bare method object is always truthy
        if w.isalpha() and w not in eng_stopwords and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(dim, dtype='float32')
    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    # Vectors can cancel out exactly; avoid dividing by zero in that case
    if norm == 0:
        return np.zeros(dim, dtype='float32')
    return v / norm

# One embedding vector per known text and per unknown text
words_vector = np.array([sent2vec(x) for x in train1])
embedding_testX = np.array([sent2vec(x) for x in test1])

All the features are extracted. Now prepare the training and validation data; I decided to use a train/validation split (`train_test_split`) for every feature set.

In [34]:
# Meta features: stratified 70/30 train/validation split with a fixed seed
meta_trainX, meta_validX, meta_trainY, meta_validY = train_test_split(meta_train, labels1, 
                                                  stratify=labels1, 
                                                  random_state=42, 
                                                  test_size=0.3, shuffle=True)

In [None]:
# TF-IDF features: split with the same parameters as the meta features
tfv_trainX, tfv_validX, tfv_trainY, tfv_validY = train_test_split(all_tfidf, labels1, 
                                                  stratify=labels1, 
                                                  random_state=42, 
                                                  test_size=0.3, shuffle=True)

In [None]:
# Count features: split with the same parameters as the meta features
ctv_trainX, ctv_validX, ctv_trainY, ctv_validY = train_test_split(all_ctv, labels1, 
                                                  stratify=labels1, 
                                                  random_state=42, 
                                                  test_size=0.3, shuffle=True)

In [None]:
# Embedding features: split with the same parameters as the meta features
embedding_trainX, embedding_validX, embedding_trainY, embedding_validY = train_test_split(words_vector, labels1, 
                                                  stratify=labels1, 
                                                  random_state=42, 
                                                  test_size=0.3, shuffle=True)

The test-set feature matrices are named: meta_testX, tfv_testX, ctv_testX, embedding_testX

now feed them into model

In [35]:
def bulid_LR(xtrain,xvalid,ytrain,yvalid,xtest):
    """Tune a logistic regression by grid search and apply it to the test set.

    :param xtrain: Training features
    :param xvalid: Validation features
    :param ytrain: Training labels
    :param yvalid: Validation labels (integer class ids)
    :param xtest: Features of the texts to classify
    :return: (predicted labels for xtest, multi-class log loss on the validation set)
    """
    param_grid = {
        'C': np.arange(1, 5, 2),
        'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    }
    # Search for the best parameters with 5-fold cross-validation. This is slow;
    # to skip it, fit a plain LogisticRegression() on (xtrain, ytrain) instead.
    search = GridSearchCV(LogisticRegression(), param_grid, cv=5, n_jobs=-1)
    search.fit(xtrain, ytrain)
    print(search.best_params_)

    best_model = search.best_estimator_
    # Loss of the selected model on the held-out validation data
    valid_loss = multiclass_logloss(yvalid, best_model.predict_proba(xvalid))
    predictions = best_model.predict(xtest)
    return predictions, valid_loss

In [36]:
def bulid_SVM(xtrain,xvalid,ytrain,yvalid,xtest):
    """Tune a support vector classifier by grid search and apply it to the test set.

    :param xtrain: Training features
    :param xvalid: Validation features
    :param ytrain: Training labels
    :param yvalid: Validation labels (integer class ids)
    :param xtest: Features of the texts to classify
    :return: (predicted labels for xtest, multi-class log loss on the validation set)
    """
    param_grid = {
        'kernel': ('linear', 'rbf'),
        'C': np.arange(1, 10, 2),
        'gamma': np.arange(0.125, 4, 0.5),
    }
    # Probability estimates are enabled because the validation loss needs predict_proba
    search = GridSearchCV(SVC(probability=True), param_grid, cv=5, n_jobs=-1)
    search.fit(xtrain, ytrain)
    print(search.best_params_)

    best_model = search.best_estimator_
    valid_loss = multiclass_logloss(yvalid, best_model.predict_proba(xvalid))
    predictions = best_model.predict(xtest)
    return predictions, valid_loss

In [37]:
def bulid_RF(xtrain,xvalid,ytrain,yvalid,xtest):
    """Tune a random forest by grid search and apply it to the test set.

    :param xtrain: Training features
    :param xvalid: Validation features
    :param ytrain: Training labels
    :param yvalid: Validation labels (integer class ids)
    :param xtest: Features of the texts to classify
    :return: (predicted labels for xtest, multi-class log loss on the validation set)
    """
    param_grid = {
        'n_estimators': np.arange(35, 50, 3),
        'max_depth': np.arange(4, 9, 2),
        'min_samples_split': np.arange(30, 50, 5),
        'min_samples_leaf': np.arange(1, 15, 3),
        'max_features': np.arange(0.2, 1, 0.2),
    }
    # Exhaustive search over the grid with 5-fold cross-validation
    search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1)
    search.fit(xtrain, ytrain)
    print(search.best_params_)

    best_model = search.best_estimator_
    valid_loss = multiclass_logloss(yvalid, best_model.predict_proba(xvalid))
    predictions = best_model.predict(xtest)
    return predictions, valid_loss

In [39]:
def bulid_xgb(xtrain,xvalid,ytrain,yvalid,xtest):
    """Tune an XGBoost classifier by grid search and apply it to the test set.

    :param xtrain: Training features
    :param xvalid: Validation features
    :param ytrain: Training labels
    :param yvalid: Validation labels (integer class ids)
    :param xtest: Features of the texts to classify
    :return: (predicted labels for xtest, multi-class log loss on the validation set)
    """
    xgb_clf = xgb.XGBClassifier(nthread=10, learning_rate=0.1)

    xgb_parameters = {'max_depth':np.arange(1,9,2), 'n_estimators':np.arange(1,301,100),
                      'colsample_bytree' : np.arange(0.3,1,0.3),}
    xgb_Gclf = GridSearchCV(xgb_clf, xgb_parameters, cv=5, n_jobs=-1)

    xgb_Gclf.fit(xtrain, ytrain)
    print(xgb_Gclf.best_params_)
    xgb_model = xgb_Gclf.best_estimator_
    loss = multiclass_logloss(yvalid, xgb_model.predict_proba(xvalid))
    # Predict with the model tuned in this function; `rf_model` is not defined in this scope
    result = xgb_model.predict(xtest)
    return result, loss

try one feature

In [40]:
# Logistic regression trained on the meta features only
LR_meta_testY, LR_meta_loss = bulid_LR(meta_trainX, meta_validX, meta_trainY, meta_validY, meta_testX)



{'C': 3, 'solver': 'newton-cg'}


In [41]:
# Support vector classifier trained on the meta features only
SVM_meta_testY, SVM_meta_loss = bulid_SVM(meta_trainX, meta_validX, meta_trainY, meta_validY, meta_testX)



KeyboardInterrupt: 

In [None]:
# Random forest trained on the meta features only
RF_meta_testY, RF_meta_loss = bulid_RF(meta_trainX, meta_validX, meta_trainY, meta_validY, meta_testX)

In [None]:
# XGBoost trained on the meta features only
xgb_meta_testY, xgb_meta_loss = bulid_xgb(meta_trainX, meta_validX, meta_trainY, meta_validY, meta_testX)

deep learning

In [None]:
from tensorflow.keras import backend
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
# Use the public Keras namespace; `tensorflow.python.keras` is a private module path
from tensorflow.keras import utils
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

deep learning

In [None]:
# Standardise the GloVe document vectors. The scaler is fitted on the training
# split only and then applied to the validation split.
# The inputs are the embedding splits created above; the names `xtrain_glove` and
# `xvalid_glove` are not defined anywhere in this notebook.
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(embedding_trainX)
xvalid_glove_scl = scl.transform(embedding_validX)

In [None]:
# One-hot encode the integer class ids for the Keras models
ytrain_enc = utils.to_categorical(ytrain)  # transform to one-hot vectors
yvalid_enc = utils.to_categorical(yvalid)

In [None]:
# Feed-forward network on the scaled GloVe document vectors.
# The model is built in this cell so that it does not rely on a `model`
# object left over from a cell that runs later in the notebook.
model = Sequential()
model.add(Dense(300, input_shape=(xtrain_glove_scl.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# One output unit per author class
model.add(Dense(ytrain_enc.shape[1]))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64,
          epochs=5, verbose=1,
          validation_data=(xvalid_glove_scl, yvalid_enc))

In [None]:
# Turn the texts into integer sequences with the Keras tokenizer
token = text.Tokenizer(num_words=None)
# NOTE(review): 895 is hard-coded; presumably chosen from the text lengths - confirm
max_len = 895

# The vocabulary is built from the training and validation texts together
token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad the sequences so that every row has exactly max_len entries
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

# Mapping word -> integer index, used below to build the embedding matrix
word_index = token.word_index

In [None]:
# Shape of the first padded training sequence
xtrain_pad[0].shape

In [None]:
# Embedding matrix for the tokenizer vocabulary: row i holds the GloVe vector of
# the word with index i; words without a GloVe vector keep an all-zero row.
# The matrix has one row more than there are words (presumably index 0 is reserved for padding).
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# (vocabulary size + 1, embedding dimension)
embedding_matrix.shape

In [None]:
# LSTM on frozen GloVe embeddings, followed by two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# One output unit per author class, taken from the one-hot targets rather than
# a hard-coded 9, so the output layer always matches the labels being fitted
model.add(Dense(ytrain_enc.shape[1]))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Fit the model with early stopping callback: training stops once val_loss
# has not improved for 3 consecutive epochs (at most 100 epochs)
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Too much dropout; the model starts to overfit (val_loss is much larger than the training loss). Next, try a bidirectional LSTM.

In [None]:
# Bidirectional LSTM on frozen GloVe embeddings, followed by two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# One output unit per author class, taken from the one-hot targets rather than
# a hard-coded 9, so the output layer always matches the labels being fitted
model.add(Dense(ytrain_enc.shape[1]))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Fit the bidirectional model; training stops once val_loss has not improved
# for 3 consecutive epochs (at most 100 epochs)
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

GRU

In [None]:
# Two stacked GRU layers on frozen GloVe embeddings, followed by two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
# The first GRU returns the full sequence so that the second GRU can consume it
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# One output unit per author class, taken from the one-hot targets rather than
# a hard-coded 9, so the output layer always matches the labels being fitted
model.add(Dense(ytrain_enc.shape[1]))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Fit the model with early stopping callback: training stops once val_loss
# has not improved for 3 consecutive epochs (at most 100 epochs)
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

In [None]:
# Class probabilities of the last fitted model for the padded validation sequences
# (Keras models expose `predict`; there is no `predicate` attribute)
model.predict(xvalid_pad)