# Who wrote this : a framework for French novelist identification

In [15]:
from google.colab import drive
drive.mount('/content/drive')
%cd 'drive/My Drive/who-wrote-this/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/My Drive/who-wrote-this/'
/content/drive/My Drive/who-wrote-this


In [0]:
# !pip install --upgrade gensim
# !pip install unidecode
# !pip install transformers

In [0]:
import os
import re
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score
import gensim
from gensim.models import Doc2Vec, FastText
from gensim.models.doc2vec import TaggedDocument
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from transformers import AutoModel, AutoTokenizer

In [0]:
# Number of available cores for parallel computing
N_CORES = cpu_count()

## Data loading

In [0]:
# Import train data
train_df = pd.read_csv('data/corpus_train.csv', sep='|')
train_df = train_df.sample(frac=1).reset_index(drop=True) # Shuffle
X_train = train_df['paragraph'].values
y_labels_train = train_df['author'].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_labels_train)
N_CLASSES = len(np.unique(y_train))

In [0]:
# Import test data and build validation dataset
test_df = pd.read_csv('data/corpus_test.csv', sep='|')
test_df = test_df.sample(frac=1).reset_index(drop=True) # Shuffle
X_val, X_test, y_val, y_test = train_test_split(test_df['paragraph'].values,
                                                test_df['author'].values,
                                                test_size=0.5, random_state=42)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

## Baseline : TF-IDF + Logistic Regression

In [0]:
# ML pipeline : TF-IDF + SVM classifier

tfidf_vecto = TfidfVectorizer()
clf = LogisticRegression(max_iter=10000)

tfidf_pipeline = Pipeline([
                           ('tf-idf', tfidf_vecto),
                           ('SVC', clf)
])

In [0]:
# Keep sklearn preprocessing pipeline for later
preprocessor = tfidf_vecto.build_analyzer()

In [0]:
# Preprocessing + training
tfidf_pipeline = tfidf_pipeline.fit(X_train, y_train)

In [12]:
# Compute predictions and validation score
y_val_pred_tfidf = tfidf_pipeline.predict(X_val)
tfidf_val_score = f1_score(y_val, y_val_pred_tfidf, average='micro')
print('F1 score on validation set with TF-IDF :', 
      tfidf_val_score.round(2))

F1 score on validation set with TF-IDF : 0.52


## Averaging of FastText pre-trained word vectors + Logistic Regression

In [13]:
# Import Fasttext French word vectors
fasttext = FastText.load_fasttext_format('models/fasttext.fr.300.bin')

  """Entry point for launching an IPython kernel.


In [0]:
def text_to_wv_fasttext(text):
    """Compute average of FastText's word vectors for a given text."""
    if text:
        tokens = preprocessor(text)
        wv_mat = np.zeros((len(tokens), fasttext.vector_size))
        for i, tok in enumerate(tokens):
            try:
                wv_mat[i] = fasttext.wv[tok]
            except KeyError:
                pass
        text_vec = wv_mat.mean(axis=0)
    else:
        text_vec = np.zeros(fasttext.vector_size)
    return text_vec

In [0]:
def preprocess_corpus_fasttext(corpus):
    """Parallelize preprocessing and document vectors computation."""
    with Pool(N_CORES) as p:
        corpus_prepro = p.map(text_to_wv_fasttext, list(corpus))
    return np.array(corpus_prepro)

In [0]:
class TextToWV(BaseEstimator, TransformerMixin):
    """Enable to use preprocessing function in a sklearn pipeline."""
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        return(self)

    def transform(self, X):
        return self.preprocessor(X)

In [0]:
# Prediction pipeline
clf = LogisticRegression(max_iter=10000)
fasttext_pipeline = Pipeline([
                              ('fasttext_average', TextToWV(preprocess_corpus_fasttext)),
                              ('SVC', clf)
])

In [0]:
# Preprocessing + training
fasttext_pipeline = fasttext_pipeline.fit(X_train, y_train)

In [19]:
# Compute predictions and test score
y_val_pred_fasttext = fasttext_pipeline.predict(X_val)
val_score_fasttext = f1_score(y_val, y_val_pred_fasttext, average='micro')
print('F1 score on test set with pre-trained FastText + averaging :',
      val_score_fasttext.round(2))

F1 score on test set with pre-trained FastText + averaging : 0.35


## Doc2Vec + Logistic Regression

In [0]:
def build_d2v_corpus(corpus, tokens_only=False):
    """Tokenize and build corpus as expected by Gensim Doc2Vec class."""
    corpus_tokenized = []
    for i, text in enumerate(corpus):
        tokens = preprocessor(text)
        if tokens_only:
            corpus_tokenized.append(tokens)
        else:
            corpus_tokenized.append(TaggedDocument(tokens, [i]))
    return corpus_tokenized

In [0]:
# Format train and validation corpus as required by Doc2Vec
corpus_train_d2v = build_d2v_corpus(X_train, tokens_only=False)
corpus_val_d2v = build_d2v_corpus(X_val, tokens_only=True)

In [23]:
# Train Doc2Vec model
model = Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=N_CORES)
model.build_vocab(corpus_train_d2v)
model.train(corpus_train_d2v, total_examples=model.corpus_count, 
            epochs=model.epochs)

KeyboardInterrupt: ignored

In [0]:
# Compute document vectors on train and validation sets
X_train_d2v = np.array([model.infer_vector(doc.words) for doc in corpus_train_d2v])
X_val_d2v = np.array([model.infer_vector(doc) for doc in corpus_val_d2v])

In [0]:
# Compute predictions and validation score
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train_d2v, y_train)
y_val_pred_d2v = clf.predict(X_val_d2v)
val_score_d2v = f1_score(y_val, y_val_pred_d2v, average='micro')
print('F1 score on validation set with Doc2Vec :', 
      val_score_d2v.round(2))

## CamemBERT

In [0]:
BERT_MODEL_NAME = 'camembert-base'

In [0]:
class CorpusToTorchDataset(Dataset):
    """Convert corpus to tensors of token indices in CamemBERT vocabulary."""
    def __init__(self, corpus, labels, model_name=BERT_MODEL_NAME, maxlen=100):
        self.corpus = corpus
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, index):

        # Select instance
        sentence = self.corpus[index]
        label = self.labels[index]

        # Preprocess data as required by BERT models
        tokens = self.tokenizer.tokenize(sentence)
        bos_token = self.tokenizer.cls_token
        eos_token = self.tokenizer.sep_token
        pad_token = self.tokenizer.pad_token
        # Insert CLS and SEP tokens at beginning and end of sentence
        tokens = [bos_token] + tokens + [eos_token]
        if len(tokens) < self.maxlen:
            # If sentence is shorter than maxlen, pad sentence using special 
            # padding token
            tokens = tokens + [pad_token for _ in range(self.maxlen - len(tokens))]
        else:
            # Cut the sentence if it is longer than maxlen
            tokens = tokens[:self.maxlen-1] + [eos_token]

        # Convert tokens to tensor of indices in CamemBERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)
        # Get attention mask to distinguish padding tokens from actual tokens
        pad_token_id = self.tokenizer.convert_tokens_to_ids(pad_token)
        attn_mask = (tokens_ids_tensor != pad_token_id).long()

        return tokens_ids_tensor, attn_mask, label

In [0]:
# Create instances of training and validation dataloaders
train_set = CorpusToTorchDataset(X_train, y_train)
train_loader = DataLoader(train_set, batch_size = 12, num_workers = N_CORES)

val_set = CorpusToTorchDataset(X_val, y_val)
val_loader = DataLoader(val_set, batch_size = 12, num_workers = N_CORES)

In [0]:
class CamemBERTClassifier(nn.Module):
    """Perform fine-tuning and classification using CamemBERT."""
    def __init__(self, pretrained_model_name=BERT_MODEL_NAME):
        super(CamemBERTClassifier, self).__init__()
        # Load CamemBERT
        self.encoder = AutoModel.from_pretrained(pretrained_model_name)
        # Add an extra dense layer to perform classification
        self.cls_layer = nn.Linear(self.encoder.pooler.dense.out_features, N_CLASSES)

    def forward(self, seq, attn_masks):
        # Feed input to BERT model to obtain contextualized representations
        cont_reps, _ = self.encoder(seq, attention_mask = attn_masks)
        # Get representation of [CLS] head
        cls_rep = cont_reps[:, 0]
        # Feed document representation to the classifying layer
        logits = self.cls_layer(cls_rep)

        return logits

In [0]:
# Instantiate CamemBERT classifier model
camembert_clf = CamemBERTClassifier()

In [0]:
# Define loss and optimizer
criterion = CrossEntropyLoss()
opti = Adam(camembert_clf.parameters(), lr = 3e-4)

In [0]:
def train(model, criterion, opti, train_loader, val_loader, max_eps=10, 
          gpu=True, print_every=100, validate_every=1, break_training_after=None):
    if gpu:
        model = model.to("cuda")
    batch_accuracies = []
    for ep in range(max_eps):
        
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            # clear gradients
            opti.zero_grad()  
            # transfer tensors to GPU
            if gpu:
                seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
            # compute logits
            logits = model(seq, attn_masks)
            # compute loss
            loss = criterion(logits, labels)
            # backpropagation
            loss.backward()
            # optimization step
            opti.step()

            batch_acc = torch.sum(torch.argmax(logits, 1) == labels) / float(labels.size(0))
            batch_accuracies.append(batch_acc.item())

            if (it + 1) % print_every == 0:
                mean_batch_acc = np.array(batch_accuracies).mean()
                print("Iteration {} of epoch {} complete. Mean batch accuracy : {}".format(it+1, ep+1, mean_batch_acc))

        if ep % validate_every == 0:
            # evaluation on the validation set
            n_batch_validation = 0
            loss_validation = 0
            accuracy_validation = 0
            for it, (seq, attn_masks, labels) in enumerate(val_loader):
                if gpu:
                    seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
                # compute logits
                logits_val = model(seq, attn_masks)
                n_batch_validation+=1
                # compute loss and accuracy
                _loss = float(criterion(logits_val, labels))
                loss_validation += _loss
                _accu = torch.sum(torch.argmax(logits, 1) == labels) / float(labels.size(0))
                accuracy_validation += _accu

            print("EVALUATION Validation set : mean loss {} n mean accuracy {}".format(loss_validation/n_batch_validation, accuracy_validation/n_batch_validation))



In [57]:
train(camembert_clf, criterion, opti, train_loader, val_loader,
      max_eps=5, gpu=True, print_every=10, validate_every=1)

Iteration 10 of epoch 1 complete. Mean batch accuracy : 0.2750000074505806
Iteration 20 of epoch 1 complete. Mean batch accuracy : 0.2500000063329935
Iteration 30 of epoch 1 complete. Mean batch accuracy : 0.2750000057121118
Iteration 40 of epoch 1 complete. Mean batch accuracy : 0.27291667331010105
Iteration 50 of epoch 1 complete. Mean batch accuracy : 0.27500000640749933
Iteration 60 of epoch 1 complete. Mean batch accuracy : 0.2875000076989333
Iteration 70 of epoch 1 complete. Mean batch accuracy : 0.2845238168324743
Iteration 80 of epoch 1 complete. Mean batch accuracy : 0.2802083405666053
Iteration 90 of epoch 1 complete. Mean batch accuracy : 0.27962963672147856
Iteration 100 of epoch 1 complete. Mean batch accuracy : 0.27583334036171436
Iteration 110 of epoch 1 complete. Mean batch accuracy : 0.2856060678985986
Iteration 120 of epoch 1 complete. Mean batch accuracy : 0.2854166740551591
Iteration 130 of epoch 1 complete. Mean batch accuracy : 0.2871794948211083
Iteration 140 of 

KeyboardInterrupt: ignored