<a href="https://colab.research.google.com/github/meteve/NLP_project/blob/master/scripts/who_wrote_this.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Who wrote this : a framework for French novelist identification

In [0]:
from google.colab import drive
drive.mount('/content/drive')
%cd 'drive/My Drive/who-wrote-this/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/My Drive/who-wrote-this/'
/content/drive/My Drive/who-wrote-this


In [None]:
# !pip install --upgrade gensim
!pip install unidecode
!pip install fairseq
!pip install sentencepiece



In [32]:
import os
import re
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
import urllib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from gensim.models import FastText
import torch
from fairseq.models.roberta import CamembertModel

ModuleNotFoundError: No module named 'gensim'

In [0]:
# Number of available cores for parallel computing
N_CORES = cpu_count()

## Data preprocessing

In [16]:
# Import train data
train_df = pd.read_csv('../data/corpus_train_complete.csv', index_col=0)
X_train_ner = train_df['paragraph_no_names'].values
X_train = train_df['paragraph'].values
y_labels_train = train_df['author'].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_labels_train)

In [17]:
# Import test data
test_df = pd.read_csv('../data/corpus_test.csv', sep='|')
X_test = test_df['paragraph'].values
y_labels_test = test_df['author'].values
y_test = le.transform(y_labels_test)

In [44]:
import nbimporter
from exploratory_analysis import remove_named_entities

import spacy
import fr_core_news_md

In [47]:
paragraph_no_names = []
for i, par in enumerate(test_df['paragraph']):
    par_no_names = remove_named_entities(par)
    paragraph_no_names.append(par_no_names)
    if i%1000 == 0:
        print(i)

test_df['paragraph_no_names'] = paragraph_no_names

NameError: name 'nlp' is not defined

In [None]:
X_test_ner = test_df['paragraph_no_names'].values

In [18]:
def import_stopwords():
    """Download and import French stopwords."""
    URL = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt'
    response = urllib.request.urlopen(URL)
    stopwords = response.read().decode('utf-8').splitlines()
    stopwords = [unidecode.unidecode(x) for x in stopwords]
    stopwords.append('quelqu') # Make stopwords consistent with tokenization
    return stopwords

# Import french stopwords
#with open('models/stopwords-fr.txt') as f:
 #   stopwords = f.read().splitlines()
stopwords = import_stopwords()
stopwords = [unidecode.unidecode(x) for x in stopwords]

## Baseline TF-IDF model with and without NER

In [19]:
# ML pipeline : TF-IDF + SVM classifier

tfidf_vecto = TfidfVectorizer()
clf = LinearSVC()

tfidf_pipeline = Pipeline([
                           ('tf-idf', tfidf_vecto),
                           ('SVC', clf)
])

In [20]:
# Preprocessing + training
tfidf_pipeline = tfidf_pipeline.fit(X_train, y_train)

Compare train and test scores with and without NER procedure.

In [23]:
# Compute predictions on train and train score
y_train_pred = tfidf_pipeline.predict(X_train)
tfidf_test_score = f1_score(y_train, y_train_pred, average='micro')

y_train_pred_ner = tfidf_pipeline.predict(X_train_ner)
tfidf_test_score_ner = f1_score(y_train, y_train_pred_ner, average='micro')

print('F1 score on train set with TF-IDF :', 
      tfidf_test_score.round(3))
print('F1 score with NER procedure on train set with TF-IDF :', 
      tfidf_test_score_ner.round(3))

F1 score on train set with TF-IDF : 0.995


In [24]:
# Compute predictions and test score
y_pred = tfidf_pipeline.predict(X_test)
tfidf_test_score = f1_score(y_test, y_pred, average='micro')

y_pred_ner = tfidf_pipeline.predict(X_test_ner)
tfidf_test_score = f1_score(y_test, y_pred_ner, average='micro')

print('F1 score on test set with TF-IDF :', 
      tfidf_test_score.round(3))
print('F1 score on test set with TF-IDF :', 
      tfidf_test_score.round(3))

F1 score on test set with TF-IDF : 0.532


## FastText pre-trained embeddings + averaging

In [0]:
# Use sklearn preprocessing pipeline to ensure results comparability
preprocessor = tfidf_vecto.build_analyzer()

In [0]:
# Import Fasttext French word vectors
fasttext = FastText.load_fasttext_format('models/fasttext.fr.300.bin')

In [0]:
def text_to_wv_fasttext(text):
    """Compute average of FastText's word vectors for a given text."""
    if text:
        tokens = preprocessor(text)
        wv_mat = np.zeros((len(tokens), fasttext.vector_size))
        for i, tok in enumerate(tokens):
            try:
                wv_mat[i] = fasttext.wv[tok]
            except KeyError:
                pass
        text_vec = wv_mat.mean(axis=0)
    else:
        text_vec = np.zeros(fasttext.vector_size)
    return text_vec

In [0]:
def preprocess_corpus_fasttext(corpus):
    """Parallelize preprocessing and document vectors computation."""
    with Pool(N_CORES) as p:
        corpus_prepro = p.map(text_to_wv_fasttext, list(corpus))
    return np.array(corpus_prepro)

In [0]:
class TextToWV(BaseEstimator, TransformerMixin):
    """Enable to use preprocessing function in a sklearn pipeline."""
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        return(self)

    def transform(self, X):
        return self.preprocessor(X)

In [0]:
fasttext_pipeline = Pipeline([
                              ('fasttext_average', TextToWV(preprocess_corpus_fasttext)),
                              ('SVC', clf)
])

In [0]:
# Preprocessing + training
fasttext_pipeline = fasttext_pipeline.fit(X_train, y_train)

In [0]:
# Compute predictions and test score
y_pred = fasttext_pipeline.predict(X_test)
fasttext_test_score = f1_score(y_test, y_pred, average='micro')
print('F1 score on test set with pre-trained FastText + averaging :',
      fasttext_test_score.round(2))

F1 score on test set with pre-trained FastText + averaging : 0.37


## CamemBERT

In [0]:
# Import Camembert model
camembert = CamembertModel.from_pretrained('models/camembert.v0')
camembert.eval()
CAMEMBERT_WV_SIZE = camembert.extract_features(camembert.encode('test string')).shape[2]

loading archive file models/camembert.v0
| dictionary: 32004 types


In [0]:
test = [camembert.encode(x) for x in X_train[:100]]
max([len(x) for x in test])

2334

In [0]:
def text_to_wv_camembert(text):
    """Compute average of CamemBERT's word vectors for a given text."""
    if text:
        sentences = text.split('.')
        sentences = [s for s in sentences if s]
        tokens_ind = camembert.encode(*sentences)
        text_features = camembert.extract_features(tokens_ind)
        text_vec = text_features.squeeze(0).detach().numpy().mean(axis=0)
    else:
        text_vec = np.zeros(CAMEMBERT_WV_SIZE)
    return text_vec

In [0]:
def preprocess_corpus_camembert(corpus):
    """"""
    corpus_prepro = []
    for i, text in enumerate(corpus):
        corpus_prepro.append(text_to_wv_camembert(text))
        print(i)
    return np.array(corpus_prepro)

In [0]:
camembert_pipeline = Pipeline([
                              ('camembert_average', TextToWV(preprocess_corpus_camembert)),
                              ('SVC', clf)
])

In [0]:
# Preprocessing + training
camembert_pipeline = camembert_pipeline.fit(X_train, y_train)

0
1
2


ValueError: ignored