<a href="https://colab.research.google.com/github/meteve/NLP_project/blob/master/scripts/who_wrote_this.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Who wrote this : a framework for French novelist identification

In [1]:
import os
import re
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
import urllib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, confusion_matrix

## Data preprocessing

In [2]:
# Import train data
train_df = pd.read_csv('../data/corpus_train_features_NER.csv', index_col=0)
X_train_ner = train_df['paragraph_ner'].values
X_train = train_df['paragraph'].values
y_labels_train = train_df['author'].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_labels_train)

In [3]:
# Import test data
test_df = pd.read_csv('../data/corpus_test_NER.csv', index_col=0)
X_test_ner = test_df['paragraph_ner'].values
X_test = test_df['paragraph'].values
y_labels_test = test_df['author'].values
y_test = le.transform(y_labels_test)

In [4]:
def import_stopwords():
    """Download and import French stopwords."""
    URL = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt'
    response = urllib.request.urlopen(URL)
    stopwords = response.read().decode('utf-8').splitlines()
    stopwords = [unidecode.unidecode(x) for x in stopwords]
    stopwords.append('quelqu') # Make stopwords consistent with tokenization
    return stopwords

# Import french stopwords
#with open('models/stopwords-fr.txt') as f:
 #   stopwords = f.read().splitlines()
stopwords = import_stopwords()
stopwords = [unidecode.unidecode(x) for x in stopwords]

## Baseline TF-IDF model with and without NER

In [5]:
# ML pipeline : TF-IDF + SVM classifier

tfidf_vecto = TfidfVectorizer()
clf = LinearSVC()

tfidf_pipeline = Pipeline([
                           ('tf-idf', tfidf_vecto),
                           ('SVC', clf)
])

In [6]:
tfidf_pipeline = tfidf_pipeline.fit(X_train, y_train)
tfidf_pipeline_ner = tfidf_pipeline.fit(X_train_ner, y_train)

Compare train and test scores with and without NER procedure.

In [7]:
# Compute predictions on train and train score
y_train_pred = tfidf_pipeline.predict(X_train)
tfidf_test_score = f1_score(y_train, y_train_pred, average='micro')

y_train_pred_ner = tfidf_pipeline_ner.predict(X_train_ner)
tfidf_test_score_ner = f1_score(y_train, y_train_pred_ner, average='micro')

print('F1 score on train set with TF-IDF :', 
      tfidf_test_score.round(3))
print('F1 score with NER procedure on train set with TF-IDF :', 
      tfidf_test_score_ner.round(3))

F1 score on train set with TF-IDF : 0.972
F1 score with NER procedure on train set with TF-IDF : 0.973


In [8]:
# Compute predictions and test score
y_pred = tfidf_pipeline.predict(X_test)
tfidf_test_score = f1_score(y_test, y_pred, average='micro')

y_pred_ner = tfidf_pipeline_ner.predict(X_test_ner)
tfidf_test_score_ner = f1_score(y_test, y_pred_ner, average='micro')

print('F1 score on test set with TF-IDF :', 
      tfidf_test_score.round(3))
print('F1 score with NER procedure on test set with TF-IDF :', 
      tfidf_test_score_ner.round(3))

F1 score on test set with TF-IDF : 0.501
F1 score with NER procedure on test set with TF-IDF : 0.487


## FastText pre-trained embeddings + averaging

In [0]:
# Use sklearn preprocessing pipeline to ensure results comparability
preprocessor = tfidf_vecto.build_analyzer()

In [0]:
# Import Fasttext French word vectors
fasttext = FastText.load_fasttext_format('models/fasttext.fr.300.bin')

In [0]:
def text_to_wv_fasttext(text):
    """Compute average of FastText's word vectors for a given text."""
    if text:
        tokens = preprocessor(text)
        wv_mat = np.zeros((len(tokens), fasttext.vector_size))
        for i, tok in enumerate(tokens):
            try:
                wv_mat[i] = fasttext.wv[tok]
            except KeyError:
                pass
        text_vec = wv_mat.mean(axis=0)
    else:
        text_vec = np.zeros(fasttext.vector_size)
    return text_vec

In [0]:
def preprocess_corpus_fasttext(corpus):
    """Parallelize preprocessing and document vectors computation."""
    with Pool(N_CORES) as p:
        corpus_prepro = p.map(text_to_wv_fasttext, list(corpus))
    return np.array(corpus_prepro)

In [0]:
class TextToWV(BaseEstimator, TransformerMixin):
    """Enable to use preprocessing function in a sklearn pipeline."""
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        return(self)

    def transform(self, X):
        return self.preprocessor(X)

In [0]:
fasttext_pipeline = Pipeline([
                              ('fasttext_average', TextToWV(preprocess_corpus_fasttext)),
                              ('SVC', clf)
])

In [0]:
# Preprocessing + training
fasttext_pipeline = fasttext_pipeline.fit(X_train, y_train)

In [0]:
# Compute predictions and test score
y_pred = fasttext_pipeline.predict(X_test)
fasttext_test_score = f1_score(y_test, y_pred, average='micro')
print('F1 score on test set with pre-trained FastText + averaging :',
      fasttext_test_score.round(2))

F1 score on test set with pre-trained FastText + averaging : 0.37
