# Who wrote this : a framework for French novelist identification

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd 'drive/My Drive/who-wrote-this/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/who-wrote-this


In [2]:
# !pip install --upgrade gensim
!pip install unidecode
!pip install transformers



In [2]:
import os
import re
import random
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score
import gensim
from gensim.models import Doc2Vec, FastText
from gensim.models.doc2vec import TaggedDocument
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from transformers import AutoModel, AutoTokenizer

In [0]:
# Number of available cores for parallel computing
N_CORES = cpu_count()

## Data loading

In [0]:
# Import train data
train_df = pd.read_csv('data/corpus_train.csv', sep='|')
train_df = train_df.sample(frac=1).reset_index(drop=True) # Shuffle
X_train = train_df['paragraph'].values
y_labels_train = train_df['author'].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_labels_train)
N_CLASSES = len(np.unique(y_train))

In [0]:
# Import test data and build validation dataset
test_df = pd.read_csv('data/corpus_test.csv', sep='|')
test_df = test_df.sample(frac=1).reset_index(drop=True) # Shuffle
X_val, X_test, y_val, y_test = train_test_split(test_df['paragraph'].values,
                                                test_df['author'].values,
                                                test_size=0.5, random_state=42)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

In [6]:
train_df.head(5)

Unnamed: 0,paragraph,author
0,"Alors, dans cette première rafale, Gervaise, r...",Zola
1,"--Il est possible, d'ailleurs, que nous nous r...",Verne
2,"Fumée âcre et étouffante où se traînaient, ave...",Hugo
3,"-- Nous allons chez l'abbé Scarron, pour leque...",Dumas
4,Le lecteur trouve cette conversation longue; p...,Stendhal


# Training

### Baseline : TF-IDF

In [0]:
# ML pipeline : TF-IDF + SVM classifier

tfidf_vecto = TfidfVectorizer()
clf = LogisticRegression(max_iter=10000)

tfidf_pipeline = Pipeline([
                           ('tf-idf', tfidf_vecto),
                           ('clf', clf)
])

In [0]:
# Keep sklearn preprocessing pipeline for later
preprocessor = tfidf_vecto.build_analyzer()

In [0]:
# Preprocessing + training
tfidf_pipeline = tfidf_pipeline.fit(X_train, y_train)

In [34]:
# Compute predictions and validation score
y_val_pred_tfidf = tfidf_pipeline.predict(X_val)
tfidf_val_score = f1_score(y_val, y_val_pred_tfidf, average='micro')
print('F1 score on validation set with TF-IDF :', 
      tfidf_val_score.round(2))

F1 score on validation set with TF-IDF : 0.51


### FastText (averaging of pre-trained word vectors)

In [0]:
# Import Fasttext French word vectors
fasttext = FastText.load_fasttext_format('models/fasttext.fr.300.bin')

In [0]:
def text_to_wv_fasttext(text):
    """Compute average of FastText's word vectors for a given text."""
    if text:
        tokens = preprocessor(text)
        wv_mat = np.zeros((len(tokens), fasttext.vector_size))
        for i, tok in enumerate(tokens):
            try:
                wv_mat[i] = fasttext.wv[tok]
            except KeyError:
                pass
        text_vec = wv_mat.mean(axis=0)
    else:
        text_vec = np.zeros(fasttext.vector_size)
    return text_vec

In [0]:
def preprocess_corpus_fasttext(corpus):
    """Parallelize preprocessing and document vectors computation."""
    with Pool(N_CORES) as p:
        corpus_prepro = p.map(text_to_wv_fasttext, list(corpus))
    return np.array(corpus_prepro)

In [0]:
class TextToWV(BaseEstimator, TransformerMixin):
    """Enable to use preprocessing function in a sklearn pipeline."""
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        return(self)

    def transform(self, X):
        return self.preprocessor(X)

In [0]:
# Prediction pipeline
clf = LogisticRegression(max_iter=10000)
fasttext_pipeline = Pipeline([
                              ('fasttext_average', TextToWV(preprocess_corpus_fasttext)),
                              ('clf', clf)
])

In [0]:
# Preprocessing + training
fasttext_pipeline = fasttext_pipeline.fit(X_train, y_train)

In [41]:
# Compute predictions and test score
y_val_pred_fasttext = fasttext_pipeline.predict(X_val)
val_score_fasttext = f1_score(y_val, y_val_pred_fasttext, average='micro')
print('F1 score on validation set :',
      val_score_fasttext.round(2))

F1 score on validation set : 0.35


### Doc2Vec

In [0]:
def build_d2v_corpus(corpus, tokens_only=False):
    """Tokenize and build corpus as expected by Gensim Doc2Vec class."""
    corpus_tokenized = []
    for i, text in enumerate(corpus):
        tokens = preprocessor(text)
        if tokens_only:
            corpus_tokenized.append(tokens)
        else:
            corpus_tokenized.append(TaggedDocument(tokens, [i]))
    return corpus_tokenized

In [0]:
# Format train and validation corpus as required by Doc2Vec
corpus_train_d2v = build_d2v_corpus(X_train, tokens_only=False)
corpus_val_d2v = build_d2v_corpus(X_val, tokens_only=True)

In [0]:
# Train Doc2Vec model
model = Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=N_CORES)
model.build_vocab(corpus_train_d2v)
model.train(corpus_train_d2v, total_examples=model.corpus_count, 
            epochs=model.epochs)

In [0]:
# Compute document vectors on train and validation sets
X_train_d2v = np.array([model.infer_vector(doc.words) for doc in corpus_train_d2v])
X_val_d2v = np.array([model.infer_vector(doc) for doc in corpus_val_d2v])

In [46]:
# Compute predictions and validation score
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train_d2v, y_train)
y_val_pred_d2v = clf.predict(X_val_d2v)
val_score_d2v = f1_score(y_val, y_val_pred_d2v, average='micro')
print('F1 score on validation set :', 
      val_score_d2v.round(2))

F1 score on validation set : 0.39


### CamemBERT

In [0]:
class CorpusToTorchDataset(Dataset):
    """Convert corpus to tensors of token indices in CamemBERT vocabulary."""
    def __init__(self, corpus, labels, model_name, maxlen=100):
        self.corpus = corpus
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, index):

        # Select instance
        sentence = self.corpus[index]
        label = self.labels[index]

        # Preprocess data as required by BERT models
        tokens = self.tokenizer.tokenize(sentence)
        bos_token = self.tokenizer.cls_token
        eos_token = self.tokenizer.sep_token
        pad_token = self.tokenizer.pad_token
        # Insert CLS and SEP tokens at beginning and end of sentence
        tokens = [bos_token] + tokens + [eos_token]
        if len(tokens) < self.maxlen:
            # If sentence is shorter than maxlen, pad sentence using special 
            # padding token
            tokens = tokens + [pad_token for _ in range(self.maxlen - len(tokens))]
        else:
            # Cut the sentence if it is longer than maxlen
            tokens = tokens[:self.maxlen-1] + [eos_token]

        # Convert tokens to tensor of indices in CamemBERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)
        # Get attention mask to distinguish padding tokens from actual tokens
        pad_token_id = self.tokenizer.convert_tokens_to_ids(pad_token)
        attn_mask = (tokens_ids_tensor != pad_token_id).long()

        return tokens_ids_tensor, attn_mask, label

In [0]:
# Create instances of training and validation dataloaders

BERT_MODEL_NAME = 'camembert-base'
MAXLEN = 100
BATCH_SIZE = 12

train_set = CorpusToTorchDataset(X_train, y_train, model_name=BERT_MODEL_NAME, maxlen=100)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=N_CORES)

val_set = CorpusToTorchDataset(X_val, y_val, model_name=BERT_MODEL_NAME, maxlen=100)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=N_CORES)

In [0]:
class CamemBERTClassifier(nn.Module):
    """Perform fine-tuning and classification using CamemBERT."""
    def __init__(self, pretrained_model_name=BERT_MODEL_NAME):
        super(CamemBERTClassifier, self).__init__()
        # Load CamemBERT
        self.encoder = AutoModel.from_pretrained(pretrained_model_name)
        # Add an extra dense layer to perform classification
        self.cls_layer = nn.Linear(self.encoder.pooler.dense.out_features, N_CLASSES)

    def forward(self, seq, attn_masks):
        # Feed input to BERT model to obtain contextualized representations
        cont_reps, _ = self.encoder(seq, attention_mask = attn_masks)
        # Get representation of [CLS] head
        cls_rep = cont_reps[:, 0]
        # Feed document representation to the classifying layer
        logits = self.cls_layer(cls_rep)

        return logits

In [0]:
# Instantiate CamemBERT classifier model
camembert_clf = CamemBERTClassifier()

In [0]:
# Define loss and optimizer
criterion = CrossEntropyLoss()
opti = Adam(camembert_clf.parameters(), lr = 3e-5)

In [0]:
def train(model, criterion, opti, train_loader, val_loader, max_eps=3, 
          gpu=True, print_every=100, validate_every=1):
    """Train a transformer model and compute loss on validation data."""
    if gpu:
        model = model.to("cuda")
    # Unfreeze weights to allow fine tuning
    model.train() 

    train_loss_total = 0
    n_batch_train = 0
    for ep in range(max_eps):
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            # Clear gradients (avoid accumulation)
            opti.zero_grad()  
            # Transfer tensors to GPU
            if gpu:
                seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
            # Compute logits
            logits = model(seq, attn_masks)
            # Compute batch loss
            loss = criterion(logits, labels)
            # Backpropagation
            loss.backward()
            # Optimization step
            opti.step()
            # Accumulate train loss over batches
            train_loss_total += loss.item()
            n_batch_train += 1

            # Compute average loss over the last `print_every` training batches
            if print_every is not None and (it + 1) % print_every == 0:
                print(f'Epoch {ep+1}, batch {it+1}. Average loss over last {print_every} training batches : {train_loss_total/n_batch_train}')
                # Reinitialize accumulators
                train_loss_total = 0
                n_batch_train = 0

        if validate_every is not None and ep % validate_every == 0:
            # Evaluation on the validation set

            predictions_val = []
            true_labels_val = []
            for it, (seq, attn_masks, labels) in enumerate(val_loader):
                if gpu:
                    seq, attn_masks = seq.cuda(), attn_masks.cuda()
                # Compute logits without constructing the computing graph
                # (only needed for backprop)
                with torch.no_grad():
                    logits_val = model(seq, attn_masks)
                preds_batch = torch.argmax(logits_val, 1).cpu().numpy()
                predictions_val.extend(preds_batch)
                true_labels_val.extend(labels.numpy())

            val_f1 = f1_score(true_labels_val, predictions_val, average='micro')
            print('------------------------------------------------------------')
            print("Epoch {} complete. F1 score on validation data : {}".format(ep+1, val_f1))
            print('------------------------------------------------------------')

In [29]:
# Enforce deterministic behavior to ensure reproducibility of the results
SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

train(camembert_clf, criterion, opti, train_loader, val_loader,
      max_eps=1, gpu=True, print_every=500, validate_every=1)

# Grid search (only 1 epoch each time, systematic overtraining above that) :
# lr = 5e-5 => val_f1 = 0.49
# lr = 3e-5 => val_f1 = 0.52

Epoch 1, batch 500. Average loss over last 500 training batches : 1.6164042003154755
Epoch 1, batch 1000. Average loss over last 500 training batches : 1.0134558594226837
Epoch 1, batch 1500. Average loss over last 500 training batches : 0.7353367038667202
Epoch 1, batch 2000. Average loss over last 500 training batches : 0.611015151232481
Epoch 1, batch 2500. Average loss over last 500 training batches : 0.5248016767948865
Epoch 1, batch 3000. Average loss over last 500 training batches : 0.4910539143830538
Epoch 1, batch 3500. Average loss over last 500 training batches : 0.4564877462014556
------------------------------------------------------------
Epoch 1 complete. F1 score on validation data : 0.5199483689274819
------------------------------------------------------------


# Final evaluation

In [0]:
# TFIDF

y_test_pred_tfidf = tfidf_pipeline.predict(X_test)
test_score_tfidf = f1_score(y_test, y_test_pred_tfidf, average='micro')

In [0]:
# FastText

y_test_pred_fasttext = fasttext_pipeline.predict(X_test)
test_score_fasttext = f1_score(y_test, y_test_pred_fasttext, average='micro')

In [0]:
# doc2vec

corpus_test_d2v = build_d2v_corpus(X_test, tokens_only=True)
X_test_d2v = np.array([model.infer_vector(doc) for doc in corpus_test_d2v])
y_test_pred_d2v = clf.predict(X_test_d2v)
test_score_d2v = f1_score(y_test, y_test_pred_d2v, average='micro')

In [0]:
# CamemBERT

# Create instance of test dataloader
test_set = CorpusToTorchDataset(X_test, y_test, model_name=BERT_MODEL_NAME, maxlen=100)
test_loader = DataLoader(test_set, batch_size = 12, num_workers = N_CORES)

def eval(model, test_loader, gpu=True):
    """Compute predictions of a trained transforme model on test data."""
    # Put model in evaluation mode
    model.eval()

    predictions = []
    for it, (seq, attn_masks, labels) in enumerate(test_loader):
        if gpu:
            seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
        # Compute logits without constructing the computing graph
        # (only needed for backprop)
        with torch.no_grad():
            logits = model(seq, attn_masks)
        # Compute predictions from logits and store them as arrays
        preds_batch = torch.argmax(logits, 1).cpu().numpy()
        predictions.extend(preds_batch)

    return predictions

y_test_pred_camembert = eval(camembert_clf, test_loader)
test_score_camembert = f1_score(y_test, y_test_pred_camembert, average='micro')

In [53]:
# Compare performances
models = ['TF-IDF', 'FastText', 'Doc2Vec', 'CamemBERT']
scores = [test_score_tfidf, test_score_fasttext, test_score_d2v, test_score_camembert]
df_scores = pd.DataFrame(zip(models, scores), columns=['Model', 'F1 score'])
df_scores.sort_values('F1 score', ascending=False)

Unnamed: 0,Model,F1 score
0,TF-IDF,0.515898
3,CamemBERT,0.515312
2,Doc2Vec,0.388948
1,FastText,0.350933
