<a href="https://colab.research.google.com/github/meteve/NLP_project/blob/master/scripts/who_wrote_this.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Who wrote this : a framework for French novelist identification

In [1]:
import os
import re
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
import urllib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, confusion_matrix

## Data preprocessing

In [2]:
# Import train data
train_df = pd.read_csv('../data/corpus_train_features_NER.csv', index_col=0)
X_train_ner = train_df['paragraph_ner'].values
X_train = train_df['paragraph'].values
y_labels_train = train_df['author'].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_labels_train)

In [3]:
# Import test data
test_df = pd.read_csv('../data/corpus_test_NER.csv', index_col=0)
X_test_ner = test_df['paragraph_ner'].values
X_test = test_df['paragraph'].values
y_labels_test = test_df['author'].values
y_test = le.transform(y_labels_test)

In [4]:
def import_stopwords():
    """Download and import French stopwords."""
    URL = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt'
    response = urllib.request.urlopen(URL)
    stopwords = response.read().decode('utf-8').splitlines()
    stopwords = [unidecode.unidecode(x) for x in stopwords]
    stopwords.append('quelqu') # Make stopwords consistent with tokenization
    return stopwords

# Import french stopwords
#with open('models/stopwords-fr.txt') as f:
 #   stopwords = f.read().splitlines()
stopwords = import_stopwords()
stopwords = [unidecode.unidecode(x) for x in stopwords]

## Baseline TF-IDF model with and without NER

In [5]:
# ML pipeline : TF-IDF + SVM classifier

tfidf_vecto = TfidfVectorizer()
clf = LinearSVC()

tfidf_pipeline = Pipeline([
                           ('tf-idf', tfidf_vecto),
                           ('SVC', clf)
])

Compare train and test scores with and without NER procedure.

In [6]:
# Compute predictions on train and train score
y_train_pred = tfidf_pipeline.predict(X_train)
tfidf_test_score = f1_score(y_train, y_train_pred, average='micro')

y_train_pred_ner = tfidf_pipeline_ner.predict(X_train_ner)
tfidf_test_score_ner = f1_score(y_train, y_train_pred_ner, average='micro')

print('F1 score on train set with TF-IDF :', 
      tfidf_test_score.round(3))
print('F1 score with NER procedure on train set with TF-IDF :', 
      tfidf_test_score_ner.round(3))

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [None]:
# Compute predictions and test score
y_pred = tfidf_pipeline.predict(X_test)
tfidf_test_score = f1_score(y_test, y_pred, average='micro')

y_pred_ner = tfidf_pipeline_ner.predict(X_test_ner)
tfidf_test_score_ner = f1_score(y_test, y_pred_ner, average='micro')

print('F1 score on test set with TF-IDF :', 
      tfidf_test_score.round(3))
print('F1 score with NER procedure on test set with TF-IDF :', 
      tfidf_test_score_ner.round(3))

Confusion matrix

In [8]:
def get_conf_matrix_df(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    conf_matrix = pd.DataFrame(conf_matrix, columns=list(le.classes_), index=list(le.classes_))
    
    row_percentages = []
    for i in range(0,10):
        perc = round(conf_matrix.iloc[i,i]/sum(conf_matrix.iloc[0:10,i]), 3)
        row_percentages.append(perc)
    
    row_percentages = pd.DataFrame([row_percentages], columns=list(le.classes_))
    
    col_percentages = []
    for i in range(0,10):
        perc = round(conf_matrix.iloc[i,i]/sum(conf_matrix.iloc[i,0:10]), 3)
        col_percentages.append(perc)
        
    
    conf_matrix['recall'] = col_percentages
    conf_matrix = conf_matrix.append(row_percentages)
    
    return(conf_matrix)

In [9]:
conf_matrix_train = get_conf_matrix_df(y_train, y_train_pred)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [10]:
conf_matrix_test = get_conf_matrix_df(y_test, y_pred)

In [11]:
conf_matrix_test

Unnamed: 0,Balzac,Daudet,Dumas,Flaubert,Hugo,Maupassant,Stendhal,Verne,Vigny,Zola,col_percentages
Balzac,730.0,29.0,409.0,116.0,129.0,101.0,546.0,39.0,33.0,53.0,0.334
Daudet,21.0,79.0,170.0,91.0,88.0,72.0,55.0,70.0,17.0,52.0,0.11
Dumas,99.0,43.0,3458.0,108.0,313.0,152.0,230.0,84.0,38.0,48.0,0.756
Flaubert,26.0,8.0,155.0,624.0,125.0,87.0,96.0,21.0,11.0,58.0,0.515
Hugo,40.0,47.0,688.0,120.0,1521.0,107.0,309.0,146.0,42.0,32.0,0.498
Maupassant,18.0,18.0,126.0,50.0,75.0,211.0,36.0,33.0,9.0,42.0,0.341
Stendhal,27.0,9.0,208.0,33.0,39.0,29.0,897.0,22.0,21.0,13.0,0.691
Verne,12.0,2.0,181.0,39.0,56.0,24.0,52.0,421.0,16.0,12.0,0.517
Vigny,18.0,75.0,159.0,17.0,73.0,28.0,63.0,35.0,85.0,14.0,0.15
Zola,54.0,56.0,346.0,118.0,386.0,173.0,147.0,166.0,57.0,508.0,0.253


### TF-IDF weights

In [5]:
#vec_pipe = Pipeline([('vec', tfidf_vecto)])

In [7]:
vec = tfidf_pipeline.named_steps['tf-idf']
Xtr = vec.fit_transform(X_train)
features = vec.get_feature_names()

The functions below show the n highest scoring words of a paragraph.

In [7]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [8]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [9]:
top_feats_in_doc(Xtr, features, 7, top_n=10)

Unnamed: 0,feature,tfidf
0,commentaire,0.286708
1,compère,0.273435
2,rasoir,0.267134
3,oblique,0.243613
4,feignit,0.242411
5,abaissa,0.239085
6,grisette,0.237067
7,profil,0.220577
8,suzanne,0.2002
9,posa,0.191618


The function below computes the average tf-idf score of all words across a number of documents (in this case all documents), i.e. the average per column of a tf-idf matrix.

In [15]:
def top_mean_feats(author, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    Xtr = train_df['paragraph'][train_df['author']==author].values
    Xtr = vec_pipe.fit_transform(Xtr)
    vec = vec_pipe.named_steps['vec']
    features = vec.get_feature_names()
    
    D = Xtr.toarray()
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [25]:
top_mean_feats(author='Zola')

Unnamed: 0,feature,tfidf
0,de,0.07131
1,elle,0.047834
2,la,0.043453
3,il,0.034398
4,les,0.030058
5,le,0.027007
6,vous,0.024999
7,je,0.024703
8,est,0.020507
9,des,0.020106


## FastText pre-trained embeddings + averaging

In [0]:
# Use sklearn preprocessing pipeline to ensure results comparability
preprocessor = tfidf_vecto.build_analyzer()

In [0]:
# Import Fasttext French word vectors
fasttext = FastText.load_fasttext_format('models/fasttext.fr.300.bin')

In [0]:
def text_to_wv_fasttext(text):
    """Compute average of FastText's word vectors for a given text."""
    if text:
        tokens = preprocessor(text)
        wv_mat = np.zeros((len(tokens), fasttext.vector_size))
        for i, tok in enumerate(tokens):
            try:
                wv_mat[i] = fasttext.wv[tok]
            except KeyError:
                pass
        text_vec = wv_mat.mean(axis=0)
    else:
        text_vec = np.zeros(fasttext.vector_size)
    return text_vec

In [0]:
def preprocess_corpus_fasttext(corpus):
    """Parallelize preprocessing and document vectors computation."""
    with Pool(N_CORES) as p:
        corpus_prepro = p.map(text_to_wv_fasttext, list(corpus))
    return np.array(corpus_prepro)

In [0]:
class TextToWV(BaseEstimator, TransformerMixin):
    """Enable to use preprocessing function in a sklearn pipeline."""
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        return(self)

    def transform(self, X):
        return self.preprocessor(X)

In [0]:
fasttext_pipeline = Pipeline([
                              ('fasttext_average', TextToWV(preprocess_corpus_fasttext)),
                              ('SVC', clf)
])

In [0]:
# Preprocessing + training
fasttext_pipeline = fasttext_pipeline.fit(X_train, y_train)

In [0]:
# Compute predictions and test score
y_pred = fasttext_pipeline.predict(X_test)
fasttext_test_score = f1_score(y_test, y_pred, average='micro')
print('F1 score on test set with pre-trained FastText + averaging :',
      fasttext_test_score.round(2))

F1 score on test set with pre-trained FastText + averaging : 0.37


## CamemBERT

In [0]:
# Import Camembert model
camembert = CamembertModel.from_pretrained('models/camembert.v0')
camembert.eval()
CAMEMBERT_WV_SIZE = camembert.extract_features(camembert.encode('test string')).shape[2]

loading archive file models/camembert.v0
| dictionary: 32004 types


In [0]:
test = [camembert.encode(x) for x in X_train[:100]]
max([len(x) for x in test])

2334

In [0]:
def text_to_wv_camembert(text):
    """Compute average of CamemBERT's word vectors for a given text."""
    if text:
        sentences = text.split('.')
        sentences = [s for s in sentences if s]
        tokens_ind = camembert.encode(*sentences)
        text_features = camembert.extract_features(tokens_ind)
        text_vec = text_features.squeeze(0).detach().numpy().mean(axis=0)
    else:
        text_vec = np.zeros(CAMEMBERT_WV_SIZE)
    return text_vec

In [0]:
def preprocess_corpus_camembert(corpus):
    """"""
    corpus_prepro = []
    for i, text in enumerate(corpus):
        corpus_prepro.append(text_to_wv_camembert(text))
        print(i)
    return np.array(corpus_prepro)

In [0]:
camembert_pipeline = Pipeline([
                              ('camembert_average', TextToWV(preprocess_corpus_camembert)),
                              ('SVC', clf)
])

In [0]:
# Preprocessing + training
camembert_pipeline = camembert_pipeline.fit(X_train, y_train)

0
1
2


ValueError: ignored