In [39]:
# Sample code adapted from treetaggerwrapper documentation: https://treetaggerwrapper.readthedocs.io/en/latest/
import pprint   # For proper print of sequences.
import treetaggerwrapper
#1) build a TreeTagger wrapper:
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
 #2) tag your text.
tags = tagger.tag_text("C'est une très petite phrase à taguer")
 #3) use the tags list... (list of string output from TreeTagger).
pprint.pprint(tags)


["C'\tPRO:DEM\tce",
 'est\tVER:pres\têtre',
 'une\tDET:ART\tun',
 'très\tADV\ttrès',
 'petite\tADJ\tpetit',
 'phrase\tNOM\tphrase',
 'à\tPRP\tà',
 'taguer\tNOM\ttaguer']


In [3]:
# try on a file! Claudine for now.
tags = tagger.tag_file("../texts/txt/1902-Colette-Claudine a l’École.txt")
pprint.pprint(tags[0:10])

['CLAUDINE\tNOM\tCLAUDINE',
 'A\tPRP\tà',
 "L'\tDET:ART\tle",
 'ÉCOLE\tNOM\técole',
 'Je\tPRO:PER\tje',
 "m'\tPRO:PER\tme",
 'appelle\tVER:pres\tappeler',
 'Claudine\tNAM\tClaudine',
 ',\tPUN\t,',
 "j'\tPRO:PER\tje"]


In [25]:
# Convert to list of tuples for better handling
tags2 = treetaggerwrapper.make_tags(tags, exclude_nottags=True)
tags_with_unks = treetaggerwrapper.make_tags(tags)

print(len(tags2))
print(len(tags_with_unks)) #~150 unks here, need to investigate what they are


64734
64875


In [29]:
sorted_tags = sorted(tags2, key=lambda tag: tag.lemma)

In [44]:
# Now, count occurrences of each entry in bestiary in list of lemmas
from itertools import groupby

# All of the rows of Table 2
bestiaire = ["abeille", "aigle", "âne", "animal", "araignée", "boeuf", 
             "canard", "cerf", "chat", "cheval", "chèvre", "chien", 
             "chouette", "cochon", "coq", "cygne", "dragon", "écureuil",
             "éléphant", "fourmi", "gibier", "insecte", "lapin", "lièvre",
             "loup", "moineau", "mouche", "mouton", "oie", "oiseau", "ours", 
             "papillon", "perroquet", "pigeon", "poisson", "poule", "poulet",
             "rat", "renard", "rossignol", "serpent", "singe", "souris", 
             "tigre", "truite", "vache", "veau"]

counts = [(i, len(list(c))) for i, c  in groupby(sorted_tags, key=lambda tag: tag.lemma)]
bestiaire_counts = [(i, c) for i, c in counts if i in bestiaire]
print(bestiaire_counts)

[('abeille', 1), ('animal', 7), ('araignée', 2), ('chat', 8), ('cheval', 5), ('chien', 2), ('chèvre', 1), ('cochon', 2), ('lapin', 2), ('lièvre', 1), ('loup', 4), ('moineau', 1), ('mouche', 4), ('mouton', 2), ('oie', 2), ('oiseau', 6), ('ours', 2), ('papillon', 4), ('poule', 1), ('serpent', 2), ('singe', 2), ('souris', 1), ('vache', 1), ('veau', 1), ('âne', 1)]


In [46]:
# test the tagger's lemmatization for some sample alternative cases for animal spellings
animal_tags = tagger.tag_text("abeilles moineux chevaux travaux animale chatte chattes")
 #3) use the tags list... (list of string output from TreeTagger).
pprint.pprint(animal_tags)

# interesting examples: moineux (sparrows) is not caught, chatte/chattes has an either/or that needs to be handled.

['abeilles\tNOM\tabeille',
 'moineux\tNOM\tmoineux',
 'chevaux\tNOM\tcheval',
 'travaux\tNOM\ttravail',
 'animale\tADJ\tanimal',
 'chatte\tNOM\tchat|chatte',
 'chevelles\tADJ\tchevelles',
 'chattes\tNOM\tchat|chatte']


In [72]:
# lets actually make the table now
# One author that we have all of the same texts in the corpus as in Brunet's paper is Zola.
# We'll try to make the Zola column

import glob
zola_files = [f for f in glob.glob("../texts/txt/*Zola*.txt")]
print('# of files: ' + str(len(zola_files)) + '\n filenames:' +  str(zola_files))
# we're missing L'assommoir

# of files: 6
 filenames:['../texts/txt/1883-Zola-AU BONHEUR DES DAMES.txt', '../texts/txt/1874-Zola-LA CONQUETE DE PLASSANS.txt', "../texts/txt/1891-Zola-L'argent.txt", '../texts/txt/1892-Zola-LE DEBACLE.txt', '../texts/txt/1890-Zola-LA BETE HUMANINE.txt', '../texts/txt/1880-Zola-NANA.txt']


In [None]:
# this step takes ~30 seconds
text_tags = [treetaggerwrapper.make_tags(tagger.tag_file(f), exclude_nottags=True) for f in zola_files]

combined_tags = [item for sublist in text_tags for item in sublist]
sorted_tags = sorted(combined_tags, key=lambda tag: tag.lemma)
counts = [(i, len(list(c))) for i, c  in groupby(sorted_tags, key=lambda tag: tag.lemma)]
bestiaire_counts = [(i, c) for i, c in counts if i in bestiaire]
total = sum([c for i, c in bestiaire_counts])
bestiaire_counts.append(('total', total))
print(bestiaire_counts)