In [4]:
import treetaggerwrapper    # https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/
import glob
import pickle
import os
import argparse

In [5]:
# Motivation:
# Brunet did not have lemmatization/word-sense-disambiguation libraries available to him; nonetheless, he
# evidently performed lemmatization in his analysis. This is clear from the tables he created to display
# word counts: for example, 'chien' and 'chiens' are not listed separately.

# Brunet did not explicitly state his lemmatization methodology, so in replication we are left with the task of choosing
# our own. Automated lemmatization confers clear advantages in time and effort spent to produce coherent results.

# 100% accuracy in this task is impossible, and likely not a desirable goal, considering we are not sure to what accuracy
# Brunet lemmatized his work. Independent of the sophistication of the methods used for lemmatization 
# (it may be possible to be 'too' accurate compared to Brunet), 
# lemmatization should be considered one of many sources of departure from Brunet in this replication task.

# Considering each text as a list of words, this function generates a list of tuples with Part of Speech and lemmas
# corresponding to each word, such that lemmas can be directly compared and disambiguated with reference to their part
# of speech if desired.
#
#
#
# Method Description:
# Use TreeTaggerWrapper tool function to transform set of .txt documents 
# into a set of documents containing a list of tuples corresponding to the list of words in the original document.
# Each tuple contains the original word, the Part-of-Speech tag, and the lemma of that word.
# 
# Argument details:
# exclude_nottags (bool) – don't generate NotTag for wrong size outputs. Defaults to False. (Set to True here: there is
# no need to generate noisy data.)
#


def lemmatize_input_files(source_dir):
    """Tag and lemmatize every ``.txt`` file found directly inside ``source_dir``.

    Args:
        source_dir: Directory containing the plain-text corpus. A trailing
            path separator is accepted but not required.

    Returns:
        dict mapping each file's base name (e.g. ``"book.txt"``) to the list
        produced by ``treetaggerwrapper.make_tags`` for that file. NotTag
        entries are excluded (``exclude_nottags=True``).
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    # Build the pattern with os.path.join rather than string concatenation:
    # "dir" + "*.txt" would yield "dir*.txt" and match siblings of the
    # directory instead of the files inside it.
    pattern = os.path.join(source_dir, "*.txt")
    return {
        os.path.basename(path): treetaggerwrapper.make_tags(
            tagger.tag_file(path), exclude_nottags=True
        )
        for path in glob.glob(pattern)
    }

In [24]:
# This is a utility function: rather than re-lemmatize the input text with every session, create a database of
# lemmatized files that can be re-used so long as the input data does not change.

def dump_data_to_files(output_dir, file_data_dict):
    """Pickle each entry of ``file_data_dict`` to its own file in ``output_dir``.

    Args:
        output_dir: Destination directory; created if it does not exist. A
            trailing path separator is accepted but not required.
        file_data_dict: Mapping of output file name to the picklable object
            to store under that name.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename, tags in file_data_dict.items():
        # os.path.join guarantees the file lands inside output_dir; plain
        # concatenation would write "<output_dir><filename>" next to the
        # directory when output_dir has no trailing separator.
        target_path = os.path.join(output_dir, filename)
        # The context manager closes the handle even if pickling fails.
        with open(target_path, "wb") as handle:
            pickle.dump(tags, handle)

In [25]:
# This cell creates a new database of lemmatized text files corresponding to every book in the corpus.
# You will need to re-run it every time you add/remove books from the corpus.

# Corpus input directory (plain-text books) and output directory for the pickled tag lists.
source_dir = "../texts/txt/"
# Keep the trailing separator, matching source_dir: a path formed by appending
# a file name to this prefix must land inside the directory, not next to it
# as "../data<name>".
output_dir = "../data/"

# Lemmatize the whole corpus, then persist one pickle per book.
file_data_dict = lemmatize_input_files(source_dir)
dump_data_to_files(output_dir, file_data_dict)

done


['1935-Maxence Van Der Meersch-INVASION 14.txt', '1802-Chateaubriand-RENE.txt', '1913-Proust-DU CÔTÉ DE CHEZ SWANN.txt', '1883-Zola-AU BONHEUR DES DAMES.txt', '1913-Valery Larbaud-A.O. Barnabooth.txt', '1891-Huysmans-La bas.txt', '1922-Martin Du Gard-LES THIBAULT.txt', '1916-Colette-LA PAIX CHEZ LES BATES.txt', '1894-Jules Renard-Poil de Carotte.txt', '1902-Colette-Claudine a l’École.txt', '1929-Colette-Sido.txt', '1883-Maupassant-Contes et Nouvelles.txt', '1929-Jean Giono-Un de Baumugnes.txt', '1904-Colette-Dialogues De Bêtes.txt', '1932-Celine-VOYAGE AU BOUT DE LA NUIT .txt', '1807-Chateaubriand-Les Aventures du dernier Abencerage.txt', '1801-Chateaubriand-ATALA.txt', '1922-Colette-LA MAISON DE CLAUDINE.txt', '1874-Zola-LA CONQUETE DE PLASSANS.txt', '1928-Andre Breton-Nadja.txt', "1891-Zola-L'argent.txt", '1913-Pergaud-LE ROMAN DE MIRAUT.txt', '1955-Françoise Sagan-Bonjour tristesse.txt', "1949-Jean Paul Sartre-La mort dans l'ame.txt", '1910-Pergaud-DE GOUPIL À MARGOT.txt', '1892-Zol