# Who wrote this : a framework for French novelist identification

In [0]:
# Download necessary additional libraries
!pip install unidecode
!pip install transformers
!python -m spacy download fr_core_news_md

In [0]:
import os
import re
import random
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
import urllib.request
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, confusion_matrix
import spacy
import fr_core_news_md
import gensim
from gensim.models import Doc2Vec, FastText
from gensim.models.doc2vec import TaggedDocument
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from transformers import AutoModel, AutoTokenizer

In [0]:
# Number of available cores for parallel computing
N_CORES = cpu_count()

## Data loading

In [0]:
# Download data from the GitHub repository
!wget https://raw.githubusercontent.com/meteve/NLP_project/master/data/corpus_train.csv
!wget https://raw.githubusercontent.com/meteve/NLP_project/master/data/corpus_test.csv

In [0]:
# Import train data
train_df = pd.read_csv('corpus_train.csv', sep='|')
train_df = train_df.sample(frac=1).reset_index(drop=True) # Shuffle
X_train = train_df['paragraph'].values
y_labels_train = train_df['author'].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_labels_train)
N_CLASSES = len(np.unique(y_train))

In [0]:
# Import test data and build validation dataset
test_df = pd.read_csv('corpus_test.csv', sep='|')
test_df = test_df.sample(frac=1).reset_index(drop=True) # Shuffle
X_val, X_test, y_val, y_test = train_test_split(test_df['paragraph'].values,
                                                test_df['author'].values,
                                                test_size=0.5, random_state=42)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

In [7]:
train_df.head(5)

Unnamed: 0,paragraph,author
0,"--Ah parbleu! parce que vos ambassadeurs, vos ...",Dumas
1,"Ce qu'il y avait de plus pressé, c'était d'enl...",Dumas
2,Puis il se mit à jouer. Il eut un moment ravis...,Hugo
3,"--Est-il vrai, ma tante, dit la jeune Martine ...",Vigny
4,Le fiacre roulait le long des arbres des Champ...,Hugo


# Training

### Baseline : TF-IDF

In [0]:
URL = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt'
response = urllib.request.urlopen(URL)
stopwords = response.read().decode('utf-8').splitlines()
stopwords = [unidecode.unidecode(x) for x in stopwords]
stopwords.append('quelqu') # Make stopwords consistent with scikit tokenizer

In [0]:
# ML pipeline : TF-IDF + SVM classifier

tfidf_vecto = TfidfVectorizer(stop_words=stopwords)
clf = LogisticRegression(max_iter=10000)

tfidf_pipeline = Pipeline([
                           ('tf-idf', tfidf_vecto),
                           ('clf', clf)
])

In [0]:
# Keep sklearn preprocessing pipeline for later
preprocessor = tfidf_vecto.build_analyzer()

In [0]:
# Preprocessing + training
tfidf_pipeline = tfidf_pipeline.fit(X_train, y_train)

In [12]:
# Compute predictions and validation score
y_val_pred_tfidf = tfidf_pipeline.predict(X_val)
tfidf_val_score = f1_score(y_val, y_val_pred_tfidf, average='micro')
print('F1 score on validation set with TF-IDF :', 
      tfidf_val_score.round(2))

F1 score on validation set with TF-IDF : 0.47


In [13]:
# Compute predictions and score on train set
y_train_pred_tfidf = tfidf_pipeline.predict(X_train)
tfidf_train_score = f1_score(y_train, y_train_pred_tfidf, average='micro')
print('F1 score on train set with TF-IDF :', 
      tfidf_train_score.round(2))

F1 score on train set with TF-IDF : 0.92


The very high gap between train and validation scores indicate that the baseline overfits the training data. It may be due to the fact that there are different books in the train and test sets. Some locations and characters which are present in the books of the training set may have a strong influence on the training of the model. To verify this, we compute tf-idf weights for the well classified paragraphs of the train set, and look at the most important words for each author in terms of tf-idf weights.  

In [0]:
def top_tfidf_feats(row, features, top_n):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [0]:
def get_tfidf_weights_by_author(author, NER=False, top_n=20):
    '''Return words with high tfidf weights, for well-classified paragraphs of an author with or without NER'''
    y_true = y_train
    
    if NER == True:
        y_pred = y_train_pred_tfidf_ner
        pipe = tfidf_pipeline_ner
        X = X_train_ner
    else:
        y_pred = y_train_pred_tfidf
        pipe = tfidf_pipeline
        X = X_train
        
    
    if type(author) == str:
        author_index = list(le.classes_).index(author)
    elif type(author) == int:
        author_index = author
   
    # keep well classified paragraphs of the auhtor
    well_classif_indexes = ((y_true == author_index) & (y_pred == author_index))
    X_train_well_classif = X[well_classif_indexes]
    
    # get tf-idf vectors
    vec = tfidf_pipeline.named_steps['tf-idf']
    Xtr = vec.transform(X_train_well_classif)
    features = vec.get_feature_names()
    
    # get mean tf-idf scores
    D = Xtr.toarray()
    tfidf_means = np.mean(D, axis=0)
    
    return top_tfidf_feats(tfidf_means, features, top_n)

In [0]:
def get_tfidf_weights_by_author(author, y_true, y_pred, X, top_n=20):
    '''Return words with high tfidf weights, for well-classified paragraphs of an author with or without NER'''        
    
    if type(author) == str:
        author_index = list(le.classes_).index(author)
    elif type(author) == int:
        author_index = author
   
    # keep well classified paragraphs of the auhtor
    well_classif_indexes = ((y_true == author_index) & (y_pred == author_index))
    X_train_well_classif = X[well_classif_indexes]
    
    # get tf-idf vectors
    vec = tfidf_pipeline.named_steps['tf-idf']
    Xtr = vec.transform(X_train_well_classif)
    features = vec.get_feature_names()
    
    # get mean tf-idf scores
    D = Xtr.toarray()
    tfidf_means = np.mean(D, axis=0)
    
    return top_tfidf_feats(tfidf_means, features, top_n)

In [17]:
get_tfidf_weights_by_author(author='Maupassant', y_true=y_train, y_pred=y_train_pred_tfidf, X=X_train)

Unnamed: 0,feature,tfidf
0,jeanne,0.02032
1,duroy,0.016129
2,était,0.013896
3,femme,0.01118
4,forestier,0.010246
5,walter,0.01004
6,ça,0.009915
7,point,0.009565
8,là,0.008472
9,où,0.008278


A relatively high number of proper nouns have a high tf-idf weight in the previous model. Next, we try to remove them using Named Entity Recognition (NER) procedure in order to evaluate whether it reduces overfitting. 

### NER

In [0]:
nlp = fr_core_news_md.load()

In [0]:
def remove_named_entities(paragraph): 
  """remove the named entities from a paragraph"""
  doc = nlp(paragraph)
  names = []
  for ent in doc.ents:
      # list of all the named entities recognized in the paragraph
      names.append(ent)
  # keep unique elements and convert to string    
  names = set([str(x) for x in names])
    
  # remove named entities from the paragraph
  paragraph_no_names = paragraph
  for name in names:
      paragraph_no_names = paragraph_no_names.replace(name, '')
        
  return paragraph_no_names

In [0]:
X_train_ner = []

for i, par in enumerate(X_train):
  par_ner = remove_named_entities(par)
  X_train_ner.append(par_ner)

X_train_ner = np.array(X_train_ner)

In [21]:
# Verify that proper nouns removal works
_ = tfidf_pipeline.fit(X_train_ner, y_train)
y_train_pred_tfidf_ner = tfidf_pipeline.predict(X_train_ner)
get_tfidf_weights_by_author(author='Maupassant', y_true=y_train, y_pred=y_train_pred_tfidf_ner, X=X_train_ner)

Unnamed: 0,feature,tfidf
0,était,0.012927
1,femme,0.010957
2,point,0.010555
3,là,0.008805
4,baron,0.008727
5,mère,0.008721
6,bras,0.008718
7,où,0.008603
8,semblait,0.008376
9,petite,0.007993


In [22]:
# Compute score on train set AFTER proper nouns removal procedure
tfidf_train_score_ner = f1_score(y_train, y_train_pred_tfidf_ner, average='micro')
print('F1 score on train set with TF-IDF after proper nouns removal:', 
      tfidf_train_score_ner.round(2))

F1 score on train set with TF-IDF after proper nouns removal: 0.82


The train score is much lower after removing the proper nouns in the paragraphs. The overfitting was partially due to the presence of such proper nouns. 

In [0]:
X_val_ner = []

for i, par in enumerate(X_val):
  par_ner = remove_named_entities(par)
  X_val_ner.append(par_ner)

X_val_ner = np.array(X_val_ner)

In [24]:
# Compute predictions and validation score AFTER proper nouns removal procedure
y_val_pred_tfidf_ner = tfidf_pipeline.predict(X_val_ner)
tfidf_val_score_ner = f1_score(y_val, y_val_pred_tfidf_ner, average='micro')
print('F1 score on validation set with TF-IDF after proper nouns removal:', 
      tfidf_val_score_ner.round(2))

F1 score on validation set with TF-IDF after proper nouns removal: 0.42


The validation score is also reduced when we remove the proper nouns of the paragraphs. This suggests that some names may appear in different books of the same author, and can actually be helpful for the predictions. Thus, we decide to keep proper nouns to train the other models.

### FastText (averaging of pre-trained word vectors)

In [0]:
# Download and extract FastText French word vectors
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz
!gunzip cc.fr.300.bin.gz

In [0]:
# Import Fasttext French word vectors
fasttext = FastText.load_fasttext_format('cc.fr.300.bin')

In [0]:
def text_to_wv_fasttext(text):
    """Compute average of FastText's word vectors for a given text."""
    if text:
        tokens = preprocessor(text)
        wv_mat = np.zeros((len(tokens), fasttext.vector_size))
        for i, tok in enumerate(tokens):
            try:
                wv_mat[i] = fasttext.wv[tok]
            except KeyError:
                pass
        text_vec = wv_mat.mean(axis=0)
    else:
        text_vec = np.zeros(fasttext.vector_size)
    return text_vec

In [0]:
def preprocess_corpus_fasttext(corpus):
    """Parallelize preprocessing and document vectors computation."""
    with Pool(N_CORES) as p:
        corpus_prepro = p.map(text_to_wv_fasttext, list(corpus))
    return np.array(corpus_prepro)

In [0]:
class TextToWV(BaseEstimator, TransformerMixin):
    """Enable to use preprocessing function in a sklearn pipeline."""
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        return(self)

    def transform(self, X):
        return self.preprocessor(X)

In [0]:
# Prediction pipeline
clf = LogisticRegression(max_iter=10000)
fasttext_pipeline = Pipeline([
                              ('fasttext_average', TextToWV(preprocess_corpus_fasttext)),
                              ('clf', clf)
])

In [0]:
# Preprocessing + training
fasttext_pipeline = fasttext_pipeline.fit(X_train, y_train)

In [32]:
# Compute predictions and test score
y_val_pred_fasttext = fasttext_pipeline.predict(X_val)
val_score_fasttext = f1_score(y_val, y_val_pred_fasttext, average='micro')
print('F1 score on validation set :',
      val_score_fasttext.round(2))

F1 score on validation set : 0.33


### Doc2Vec

In [0]:
def build_d2v_corpus(corpus, tokens_only=False):
    """Tokenize and build corpus as expected by Gensim Doc2Vec class."""
    corpus_tokenized = []
    for i, text in enumerate(corpus):
        tokens = preprocessor(text)
        if tokens_only:
            corpus_tokenized.append(tokens)
        else:
            corpus_tokenized.append(TaggedDocument(tokens, [i]))
    return corpus_tokenized

In [0]:
# Format train and validation corpus as required by Doc2Vec
corpus_train_d2v = build_d2v_corpus(X_train, tokens_only=False)
corpus_val_d2v = build_d2v_corpus(X_val, tokens_only=True)

In [0]:
# Train Doc2Vec model
model = Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=N_CORES)
model.build_vocab(corpus_train_d2v)
model.train(corpus_train_d2v, total_examples=model.corpus_count, 
            epochs=model.epochs)

In [0]:
# Compute document vectors on train and validation sets
X_train_d2v = np.array([model.infer_vector(doc.words) for doc in corpus_train_d2v])
X_val_d2v = np.array([model.infer_vector(doc) for doc in corpus_val_d2v])

In [37]:
# Compute predictions and validation score
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train_d2v, y_train)
y_val_pred_d2v = clf.predict(X_val_d2v)
val_score_d2v = f1_score(y_val, y_val_pred_d2v, average='micro')
print('F1 score on validation set :', 
      val_score_d2v.round(2))

F1 score on validation set : 0.37


### CamemBERT

In [0]:
class CorpusToTorchDataset(Dataset):
    """Convert corpus to tensors of token indices in CamemBERT vocabulary."""
    def __init__(self, corpus, labels, model_name, maxlen=100):
        self.corpus = corpus
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, index):

        # Select instance
        sentence = self.corpus[index]
        label = self.labels[index]

        # Preprocess data as required by BERT models
        tokens = self.tokenizer.tokenize(sentence)
        bos_token = self.tokenizer.cls_token
        eos_token = self.tokenizer.sep_token
        pad_token = self.tokenizer.pad_token
        # Insert CLS and SEP tokens at beginning and end of sentence
        tokens = [bos_token] + tokens + [eos_token]
        if len(tokens) < self.maxlen:
            # If sentence is shorter than maxlen, pad sentence using special 
            # padding token
            tokens = tokens + [pad_token for _ in range(self.maxlen - len(tokens))]
        else:
            # Cut the sentence if it is longer than maxlen
            tokens = tokens[:self.maxlen-1] + [eos_token]

        # Convert tokens to tensor of indices in CamemBERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)
        # Get attention mask to distinguish padding tokens from actual tokens
        pad_token_id = self.tokenizer.convert_tokens_to_ids(pad_token)
        attn_mask = (tokens_ids_tensor != pad_token_id).long()

        return tokens_ids_tensor, attn_mask, label

In [0]:
# Create instances of training and validation dataloaders

BERT_MODEL_NAME = 'camembert-base'
MAXLEN = 100
BATCH_SIZE = 12

train_set = CorpusToTorchDataset(X_train, y_train, model_name=BERT_MODEL_NAME, maxlen=MAXLEN)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=N_CORES)

val_set = CorpusToTorchDataset(X_val, y_val, model_name=BERT_MODEL_NAME, maxlen=MAXLEN)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=N_CORES)

In [0]:
class CamemBERTClassifier(nn.Module):
    """Perform fine-tuning and classification using CamemBERT."""
    def __init__(self, pretrained_model_name=BERT_MODEL_NAME):
        super(CamemBERTClassifier, self).__init__()
        # Load CamemBERT
        self.encoder = AutoModel.from_pretrained(pretrained_model_name)
        # Add an extra dense layer to perform classification
        self.cls_layer = nn.Linear(self.encoder.pooler.dense.out_features, N_CLASSES)

    def forward(self, seq, attn_masks):
        # Feed input to BERT model to obtain contextualized representations
        cont_reps, _ = self.encoder(seq, attention_mask=attn_masks)
        # Get representation of [CLS] head
        cls_rep = cont_reps[:, 0]
        # Feed document representation to the classifying layer
        logits = self.cls_layer(cls_rep)

        return logits

In [0]:
# Instantiate CamemBERT classifier model
camembert_clf = CamemBERTClassifier()

In [0]:
# Define loss and optimizer
criterion = CrossEntropyLoss()
opti = Adam(camembert_clf.parameters(), lr=3e-5)

In [0]:
def train(model, criterion, opti, train_loader, val_loader, max_eps=3, 
          gpu=True, print_every=100, validate_every=1):
    """Train a transformer model and compute loss on validation data."""
    if gpu:
        model = model.to("cuda")
    # Unfreeze weights to allow fine tuning
    model.train() 

    train_loss_total = 0
    n_batch_train = 0
    for ep in range(max_eps):
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            # Clear gradients (avoid accumulation)
            opti.zero_grad()  
            # Transfer tensors to GPU
            if gpu:
                seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
            # Compute logits
            logits = model(seq, attn_masks)
            # Compute batch loss
            loss = criterion(logits, labels)
            # Backpropagation
            loss.backward()
            # Optimization step
            opti.step()
            # Accumulate train loss over batches
            train_loss_total += loss.item()
            n_batch_train += 1

            # Compute average loss over the last `print_every` training batches
            if print_every is not None and (it + 1) % print_every == 0:
                print(f'Epoch {ep+1}, batch {it+1}. Average loss over last {print_every} training batches : {train_loss_total/n_batch_train}')
                # Reinitialize accumulators
                train_loss_total = 0
                n_batch_train = 0

        if validate_every is not None and ep % validate_every == 0:
            # Evaluation on the validation set

            predictions_val = []
            true_labels_val = []
            for it, (seq, attn_masks, labels) in enumerate(val_loader):
                if gpu:
                    seq, attn_masks = seq.cuda(), attn_masks.cuda()
                # Compute logits without constructing the computing graph
                # (only needed for backprop)
                with torch.no_grad():
                    logits_val = model(seq, attn_masks)
                preds_batch = torch.argmax(logits_val, 1).cpu().numpy()
                predictions_val.extend(preds_batch)
                true_labels_val.extend(labels.numpy())

            val_f1 = f1_score(true_labels_val, predictions_val, average='micro')
            print('------------------------------------------------------------')
            print("Epoch {} complete. F1 score on validation data : {}".format(ep+1, val_f1))
            print('------------------------------------------------------------')

In [44]:
# Train CamemBERT
train(camembert_clf, criterion, opti, train_loader, val_loader,
      max_eps=1, # Systematic overtraining after 1 epoch which degrades results
      gpu=True, 
      print_every=500, validate_every=1)

Epoch 1, batch 500. Average loss over last 500 training batches : 1.6024817779064178
Epoch 1, batch 1000. Average loss over last 500 training batches : 0.979894129216671
Epoch 1, batch 1500. Average loss over last 500 training batches : 0.7141468065679073
Epoch 1, batch 2000. Average loss over last 500 training batches : 0.5915717537403107
Epoch 1, batch 2500. Average loss over last 500 training batches : 0.517938755877316
Epoch 1, batch 3000. Average loss over last 500 training batches : 0.468482445307076
Epoch 1, batch 3500. Average loss over last 500 training batches : 0.4357746082171798
------------------------------------------------------------
Epoch 1 complete. F1 score on validation data : 0.4962450129077681
------------------------------------------------------------


# Final evaluation

### Quantitative evaluation: F1 score

In [0]:
# TFIDF

y_test_pred_tfidf = tfidf_pipeline.predict(X_test)
test_score_tfidf = f1_score(y_test, y_test_pred_tfidf, average='micro')

In [0]:
# FastText

y_test_pred_fasttext = fasttext_pipeline.predict(X_test)
test_score_fasttext = f1_score(y_test, y_test_pred_fasttext, average='micro')

In [0]:
# doc2vec

corpus_test_d2v = build_d2v_corpus(X_test, tokens_only=True)
X_test_d2v = np.array([model.infer_vector(doc) for doc in corpus_test_d2v])
y_test_pred_d2v = clf.predict(X_test_d2v)
test_score_d2v = f1_score(y_test, y_test_pred_d2v, average='micro')

In [0]:
# CamemBERT

# Create instance of test dataloader
test_set = CorpusToTorchDataset(X_test, y_test, model_name=BERT_MODEL_NAME, maxlen=MAXLEN)
test_loader = DataLoader(test_set, batch_size=12, num_workers=N_CORES)

def eval(model, test_loader, gpu=True):
    """Compute predictions of a trained transforme model on test data."""
    # Put model in evaluation mode
    model.eval()

    predictions = []
    for it, (seq, attn_masks, labels) in enumerate(test_loader):
        if gpu:
            seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
        # Compute logits without constructing the computing graph
        # (only needed for backprop)
        with torch.no_grad():
            logits = model(seq, attn_masks)
        # Compute predictions from logits and store them as arrays
        preds_batch = torch.argmax(logits, 1).cpu().numpy()
        predictions.extend(preds_batch)

    return predictions

y_test_pred_camembert = eval(camembert_clf, test_loader)
test_score_camembert = f1_score(y_test, y_test_pred_camembert, average='micro')

In [49]:
# Compare performances
models = ['TF-IDF', 'FastText', 'Doc2Vec', 'CamemBERT']
scores = [test_score_tfidf, test_score_fasttext, test_score_d2v, test_score_camembert]
df_scores = pd.DataFrame(zip(models, scores), columns=['Model', 'F1 score'])
df_scores.sort_values('F1 score', ascending=False)

Unnamed: 0,Model,F1 score
3,CamemBERT,0.503813
0,TF-IDF,0.444092
2,Doc2Vec,0.387657
1,FastText,0.348117


We find that CamemBERT has the highest F1 score. In order to make a qualitative evaluation of this model, we build the confusion matrix based on the predictions of CamemBERT on the test set.

### Qualitative evaluation: confusion matrix

In [0]:
def get_conf_matrix_df(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    conf_matrix = pd.DataFrame(conf_matrix, columns=list(le.classes_), index=list(le.classes_))
    
    row_percentages = []
    for i in range(0,10):
        perc = round(conf_matrix.iloc[i,i]/sum(conf_matrix.iloc[0:10,i]), 3)
        row_percentages.append(perc)
    
    row_percentages = pd.DataFrame([row_percentages], columns=list(le.classes_))
    
    col_percentages = []
    for i in range(0,10):
        perc = round(conf_matrix.iloc[i,i]/sum(conf_matrix.iloc[i,0:10]), 3)
        col_percentages.append(perc)
    col_percentages.append(None)
    
    conf_matrix = conf_matrix.append(row_percentages)
    conf_matrix = conf_matrix.rename(index={0: 'PRECISION'})
    conf_matrix['RECALL'] = col_percentages

    return(conf_matrix)

In [0]:
conf_matrix_BERT = get_conf_matrix_df(y_test, y_test_pred_camembert)

In [52]:
conf_matrix_BERT

Unnamed: 0,Balzac,Daudet,Dumas,Flaubert,Hugo,Maupassant,Stendhal,Verne,Vigny,Zola,RECALL
Balzac,74.0,20.0,52.0,17.0,30.0,5.0,352.0,489.0,9.0,35.0,0.068
Daudet,5.0,97.0,52.0,64.0,22.0,45.0,9.0,22.0,2.0,24.0,0.284
Dumas,34.0,6.0,2091.0,19.0,49.0,53.0,33.0,21.0,4.0,10.0,0.901
Flaubert,1.0,4.0,20.0,464.0,14.0,48.0,11.0,7.0,1.0,38.0,0.763
Hugo,10.0,31.0,79.0,48.0,202.0,23.0,1072.0,9.0,62.0,5.0,0.131
Maupassant,16.0,42.0,34.0,20.0,22.0,138.0,5.0,22.0,1.0,18.0,0.434
Stendhal,1.0,18.0,62.0,6.0,13.0,6.0,530.0,2.0,1.0,16.0,0.809
Verne,2.0,4.0,148.0,12.0,29.0,6.0,2.0,197.0,1.0,10.0,0.479
Vigny,16.0,95.0,56.0,6.0,27.0,13.0,23.0,15.0,16.0,0.0,0.06
Zola,61.0,24.0,145.0,79.0,8.0,89.0,49.0,38.0,0.0,485.0,0.496
