<a href="https://colab.research.google.com/github/meteve/NLP_project/blob/master/scripts/who_wrote_this.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Who wrote this : a framework for French novelist identification

In [1]:
import os
import re
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
import urllib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, confusion_matrix

## Data preprocessing

In [2]:
# Import train data
train_df = pd.read_csv('../data/corpus_train_features_NER.csv', index_col=0)
X_train = train_df['paragraph'].values
X_train_ner = train_df['paragraph_ner'].values

y_labels_train = train_df['author'].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_labels_train)

## Baseline TF-IDF model with and without NER

In [3]:
# ML pipeline : TF-IDF + SVM classifier

tfidf_vecto = TfidfVectorizer()
clf = LinearSVC()

tfidf_pipeline = Pipeline([
                           ('tf-idf', tfidf_vecto),
                           ('SVC', clf)
])

In [4]:
tfidf_pipeline = tfidf_pipeline.fit(X_train, y_train)
tfidf_pipeline_ner = tfidf_pipeline.fit(X_train_ner, y_train)

Compare train and test scores with and without NER procedure.

In [5]:
# Compute predictions on train and train score
y_train_pred = tfidf_pipeline.predict(X_train)
y_train_pred_ner = tfidf_pipeline_ner.predict(X_train_ner)

tfidf_test_score = f1_score(y_train, y_train_pred, average='micro')
tfidf_test_score_ner = f1_score(y_train, y_train_pred_ner, average='micro')

print('F1 score on train set with TF-IDF :', 
      tfidf_test_score.round(3))
print('F1 score on train set with TF-IDF with NER:', 
      tfidf_test_score_ner.round(3))

F1 score on train set with TF-IDF : 0.972
F1 score on train set with TF-IDF with NER: 0.973


Confusion matrix

In [6]:
def get_conf_matrix_df(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    conf_matrix = pd.DataFrame(conf_matrix, columns=list(le.classes_), index=list(le.classes_))
    
    row_percentages = []
    for i in range(0,10):
        perc = round(conf_matrix.iloc[i,i]/sum(conf_matrix.iloc[0:10,i]), 3)
        row_percentages.append(perc)
    
    row_percentages = pd.DataFrame([row_percentages], columns=list(le.classes_))
    
    col_percentages = []
    for i in range(0,10):
        perc = round(conf_matrix.iloc[i,i]/sum(conf_matrix.iloc[i,0:10]), 3)
        col_percentages.append(perc)
    col_percentages.append(None)
        
    
    conf_matrix = conf_matrix.append(row_percentages)
    conf_matrix = conf_matrix.rename(index={0: 'PRECISION'})
    conf_matrix['RECALL'] = col_percentages

    
    return(conf_matrix)

In [7]:
conf_matrix_train = get_conf_matrix_df(y_train, y_train_pred)

In [8]:
conf_matrix_train

Unnamed: 0,Balzac,Daudet,Dumas,Flaubert,Hugo,Maupassant,Stendhal,Verne,Vigny,Zola,RECALL
Balzac,2972.0,0.0,74.0,2.0,11.0,2.0,10.0,1.0,4.0,3.0,0.965
Daudet,5.0,1637.0,30.0,2.0,9.0,1.0,2.0,0.0,1.0,4.0,0.968
Dumas,21.0,1.0,13990.0,11.0,51.0,9.0,48.0,12.0,21.0,8.0,0.987
Flaubert,4.0,1.0,59.0,3472.0,19.0,10.0,10.0,0.0,1.0,3.0,0.97
Hugo,12.0,0.0,117.0,2.0,6169.0,16.0,8.0,4.0,1.0,8.0,0.973
Maupassant,4.0,4.0,70.0,5.0,16.0,3347.0,17.0,2.0,3.0,13.0,0.962
Stendhal,8.0,2.0,96.0,6.0,12.0,7.0,5817.0,2.0,3.0,3.0,0.977
Verne,0.0,1.0,38.0,0.0,4.0,3.0,1.0,4114.0,0.0,1.0,0.988
Vigny,3.0,1.0,296.0,0.0,14.0,3.0,13.0,0.0,1613.0,2.0,0.829
Zola,3.0,1.0,36.0,6.0,6.0,6.0,8.0,2.0,0.0,2732.0,0.976


### TF-IDF weights

In [9]:
def top_tfidf_feats(row, features, top_n):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [62]:
def get_tfidf_weights_by_author(author, NER=False, top_n=20):
    '''Return words with high tfidf weights, for well-classified paragraphs of an author with or without NER'''
    y_true = y_train
    
    if NER == True:
        y_pred = y_train_pred_ner
        pipe = tfidf_pipeline_ner
        X = X_train_ner
    else:
        y_pred = y_train_pred
        pipe = tfidf_pipeline
        X = X_train
        
    
    if type(author) == str:
        author_index = list(le.classes_).index(author)
    elif type(author) == int:
        author_index = author
   
    # keep well classified paragraphs of the auhtor
    well_classif_indexes = ((y_true == author_index) & (y_pred == author_index))
    X_train_well_classif = X[well_classif_indexes]
    
    # get tf-idf vectors
    vec = pipe.named_steps['tf-idf']
    Xtr = vec.transform(X_train_well_classif)
    features = vec.get_feature_names()
    
    # get mean tf-idf scores
    D = Xtr.toarray()
    tfidf_means = np.mean(D, axis=0)
    
    return top_tfidf_feats(tfidf_means, features, top_n)

In [76]:
get_tfidf_weights_by_author(author='Dumas')

Unnamed: 0,feature,tfidf
0,de,0.062312
1,vous,0.05171
2,et,0.043479
3,le,0.043447
4,la,0.042506
5,il,0.039407
6,que,0.037811
7,artagnan,0.035046
8,je,0.034812
9,un,0.029885


In [77]:
get_tfidf_weights_by_author(author='Dumas', NER=True)

Unnamed: 0,feature,tfidf
0,de,0.062893
1,vous,0.054167
2,et,0.046639
3,le,0.04605
4,la,0.043613
5,il,0.042252
6,que,0.040842
7,je,0.036286
8,un,0.031745
9,en,0.031543


In [16]:
list(le.classes_)

['Balzac',
 'Daudet',
 'Dumas',
 'Flaubert',
 'Hugo',
 'Maupassant',
 'Stendhal',
 'Verne',
 'Vigny',
 'Zola']

In [61]:
train_df['paragraph'][train_df['author'] == 'Daudet'].iloc[100]

"--Mais oui, maîtresse... Tenez! juste au-dessus de nous, voilà le _Chemin de saint Jacques_ (la voie lactée). Il va de France droit sur l'Espagne. C'est saint Jacques de Galice qui l'a tracé pour montrer sa route au brave Charlemagne lorsqu'il faisait la guerre aux Sarrasins[2]. Plus loin, vous avez le _Char des âmes_ (la grande Ourse) avec ses quatre essieux resplendissants. Les trois étoiles qui vont devant sont les _Trois bêtes_, et cette toute petite contre la troisième c'est le _Charretier_. Voyez-vous tout autour cette pluie d'étoiles qui tombent? ce sont les âmes dont le bon Dieu ne veut pas chez lui... Un peu plus bas, voici le _Râteau_ ou les _Trois rois_ (Orion). C'est ce qui nous sert d'horloge, à nous autres. Rien qu'en les regardant, je sais maintenant qu'il est minuit passé. Un peu plus bas, toujours vers le midi, brille _Jean de Milan_, le flambeau des astres (Sirius). Sur cette étoile-là, voici ce que les bergers racontent. Il paraît qu'une nuit _Jean de Milan_, avec l