<a href="https://colab.research.google.com/github/meteve/NLP_project/blob/master/scripts/who_wrote_this.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Who wrote this: a framework for French novelist identification

In [2]:
import os
import re
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

import unidecode
import urllib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, confusion_matrix

## Data preprocessing

In [3]:
# Load the training corpus; the first CSV column is used as the row index
train_df = pd.read_csv('../data/corpus_train_features_NER.csv', index_col=0)

# Inputs are the raw paragraph texts, targets are the author names
X_train, y_labels_train = train_df['paragraph'].values, train_df['author'].values

# Fit the label encoder on the author names, then map them to integer codes
le = LabelEncoder().fit(y_labels_train)
y_train = le.transform(y_labels_train)

## Baseline TF-IDF model with and without NER

In [4]:
# ML pipeline: TF-IDF vectorizer followed by a linear SVM classifier
# (both estimators are created with their default parameters)

tfidf_vecto, clf = TfidfVectorizer(), LinearSVC()

tfidf_pipeline = Pipeline(steps=[('tf-idf', tfidf_vecto), ('SVC', clf)])

In [5]:
# Fit the whole pipeline (vectorizer, then classifier) on the training paragraphs
tfidf_pipeline = tfidf_pipeline.fit(X_train, y_train)

Compare train and test scores with and without the NER procedure.

In [6]:
# Predict on the training paragraphs and compute the micro-averaged F1 score
y_train_pred = tfidf_pipeline.predict(X_train)
tfidf_train_score = f1_score(y_train, y_train_pred, average='micro')

# The score above is a *train* score; it used to be stored under the name
# `tfidf_test_score`. The old name is kept as an alias so that any cell still
# reading it keeps working.
tfidf_test_score = tfidf_train_score

print('F1 score on train set with TF-IDF :', 
      tfidf_train_score.round(3))

F1 score on train set with TF-IDF : 0.995


Confusion matrix

In [7]:
def get_conf_matrix_df(y_true, y_pred):
    '''Build a labelled confusion matrix with per-class precision and recall.

    The class names are read from the notebook-level label encoder `le`
    (`le.classes_`), so `y_true` and `y_pred` are expected to contain labels
    produced by that encoder.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    pandas.DataFrame
        One row and one column per class (rows: true class, columns:
        predicted class), plus an extra 'PRECISION' row and an extra 'RECALL'
        column. The cell at ('PRECISION', 'RECALL') is NaN. A class with no
        predicted (resp. no true) sample gets a NaN precision (resp. recall).
    '''
    class_names = list(le.classes_)
    counts = confusion_matrix(y_true=y_true, y_pred=y_pred)
    conf_matrix = pd.DataFrame(counts, columns=class_names, index=class_names)

    # Works for any number of classes (previously hard-coded to 10).
    correct = np.diag(counts).astype(float)
    with np.errstate(divide='ignore', invalid='ignore'):
        # precision: correct predictions / everything predicted as the class (column sums)
        precision = np.round(correct / counts.sum(axis=0), 3)
        # recall: correct predictions / everything truly in the class (row sums)
        recall = np.round(correct / counts.sum(axis=1), 3)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    precision_row = pd.DataFrame([precision], columns=class_names, index=['PRECISION'])
    conf_matrix = pd.concat([conf_matrix, precision_row])

    # The PRECISION row has no recall value, hence the trailing NaN.
    conf_matrix['RECALL'] = np.append(recall, np.nan)

    return conf_matrix

In [8]:
# Confusion matrix of the training-set predictions
conf_matrix_train = get_conf_matrix_df(y_train, y_train_pred)

In [9]:
# Display the training confusion matrix built in the previous cell
conf_matrix_train

Unnamed: 0,Balzac,Daudet,Dumas,Flaubert,Hugo,Maupassant,Stendhal,Verne,Vigny,Zola,RECALL
Balzac,3061.0,0.0,13.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.994
Daudet,1.0,1683.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.995
Dumas,2.0,1.0,14149.0,3.0,7.0,2.0,4.0,1.0,2.0,1.0,0.998
Flaubert,0.0,0.0,10.0,3563.0,2.0,2.0,2.0,0.0,0.0,0.0,0.996
Hugo,1.0,0.0,29.0,3.0,6297.0,3.0,1.0,1.0,0.0,2.0,0.994
Maupassant,1.0,0.0,21.0,3.0,8.0,3440.0,3.0,0.0,1.0,4.0,0.988
Stendhal,0.0,0.0,18.0,0.0,3.0,2.0,5931.0,1.0,0.0,1.0,0.996
Verne,0.0,1.0,5.0,0.0,2.0,0.0,0.0,4154.0,0.0,0.0,0.998
Vigny,1.0,1.0,52.0,0.0,1.0,0.0,2.0,0.0,1888.0,0.0,0.971
Zola,0.0,0.0,4.0,1.0,1.0,1.0,3.0,0.0,0.0,2790.0,0.996


### TF-IDF weights

In [63]:
def top_tfidf_feats(row, features, top_n):
    '''Get the top n tf-idf values in `row` with their corresponding feature names.

    Parameters
    ----------
    row : sequence of numbers
        One tf-idf value per feature, indexable by integer position.
    features : sequence
        Feature names, aligned with `row`.
    top_n : int
        Number of highest-valued features to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'tfidf', sorted by decreasing tf-idf value.
        Empty (but with both columns) when `top_n` is 0 or `row` is empty.
    '''
    # argsort is ascending: reverse it, then keep the first top_n positions
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Passing the column names to the constructor (instead of assigning
    # df.columns afterwards) also works when there are no rows.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [77]:
def get_tfidf_weights_by_author(author, y_true, y_pred, top_n=25):
    '''Return the features with the highest mean tf-idf weight for one author.

    Only the paragraphs of `author` that were correctly classified are used.
    The paragraphs are taken from the notebook-level `X_train`, so `y_true`
    and `y_pred` must be aligned with it. The fitted vectorizer comes from the
    notebook-level `tfidf_pipeline`, and author names are resolved with the
    notebook-level label encoder `le`.

    Parameters
    ----------
    author : str or int
        Author name (as found in `le.classes_`) or encoded class index.
    y_true : array-like
        True encoded labels, aligned with `X_train`.
    y_pred : array-like
        Predicted encoded labels, aligned with `X_train`.
    top_n : int, default 25
        Number of features to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'tfidf', sorted by decreasing mean tf-idf.

    Raises
    ------
    TypeError
        If `author` is neither a string nor an integer.
    ValueError
        If `author` is a string that is not in `le.classes_`.
    '''
    if isinstance(author, str):
        author_index = list(le.classes_).index(author)
    elif isinstance(author, (int, np.integer)):
        # also accepts numpy integers, e.g. values taken from an encoded label array
        author_index = int(author)
    else:
        raise TypeError('author must be a class name (str) or a class index (int), '
                        'got {}'.format(type(author).__name__))

    # keep the well classified paragraphs of the author, using the labels
    # passed by the caller rather than the notebook globals
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    well_classif_indexes = (y_true == author_index) & (y_pred == author_index)
    X_train_well_classif = X_train[well_classif_indexes]

    # get tf-idf vectors
    vec = tfidf_pipeline.named_steps['tf-idf']
    Xtr = vec.transform(X_train_well_classif)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on versions that predate get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out'):
        features = vec.get_feature_names_out()
    else:
        features = vec.get_feature_names()

    # mean tf-idf score per feature, computed on the sparse matrix so the
    # full (paragraphs x vocabulary) matrix is never densified
    tfidf_means = np.asarray(Xtr.mean(axis=0)).ravel()
    return top_tfidf_feats(tfidf_means, features, top_n)

In [78]:
# Top mean TF-IDF features for the author with encoded class index 2
# (presumably Dumas, going by the class order shown in the confusion matrix — verify against le.classes_)
get_tfidf_weights_by_author(author=2, y_true=y_train, y_pred=y_train_pred)

Unnamed: 0,feature,tfidf
0,de,0.062761
1,vous,0.052413
2,et,0.044151
3,le,0.044058
4,la,0.042893
5,il,0.040266
6,que,0.038498
7,je,0.035429
8,un,0.030408
9,en,0.029846
