In [6]:
# NLTK library: http://www.nltk.org/api/nltk.html
# note: nltk was throwing a runtime error on my machine until I ran the following commands in a local terminal: 
# import nltk
# nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd

In [7]:
# Simple example: take a string, tokenize into a list of words, return a map of word -> frequency. 
sample = "C'est un exemple de phrase, le mot exemple vais apparaître deux fois dans la distribution de fréquence"
fdist = FreqDist()
for word in word_tokenize(sample):
    fdist[word.lower()] += 1
fdist

FreqDist({'exemple': 2, 'de': 2, "c'est": 1, 'un': 1, 'phrase': 1, ',': 1, 'le': 1, 'mot': 1, 'vais': 1, 'apparaître': 1, ...})

In [8]:
# Simple test: can we get the same number of results as Brunet?
with open("../texts/txt/1902-Colette-Claudine a l’École.txt", "r") as myfile:
    data=myfile.read().replace("\n", "")
fdist_claudine = FreqDist()
for word in word_tokenize(data):
    fdist_claudine[word.lower()] += 1
print(fdist_claudine["oiseau"]) # only one occurence; of course, there is morphology to consider
print(fdist_claudine["d'oiseau"]) 
print(fdist_claudine["oiseaux"]) # only 6/11 have been found (see pg. 306 of Brunet)
print(fdist_claudine["d'oiseaux"])

0
0
0
0


In [15]:
# Lets try to see if we can recapitulate the "z" score calculated in 
# table 1-1 for Claudine d'Ecole
bestiaire = ["abeille", "âne", "animal", "araignée", "boeuf", "chat", "cheval", "chèvre", "chien", "chouette", "cochon", "dragon", "fourmi", "gibier", "insecte", "lapin", "lièvre", "loup", "mouche", "mouton", "oie", "oiseau", "ours", "papillon", "poisson", "poule", "poulet", "rat", "renard", "rossignol", "serpent", "singe", "souris", "tigre", "vache", "veau"]

z_list = []
for animal in bestiaire:
    f = fdist_claudine[animal]
    p = f/fdist_claudine.N()
    o = f
    q = 1 - p
    if(p != 0 and f != 0 and q != 0):
        z_list.append({(o - (f*p))/(f*p*q)**0.5, animal})

z_list

[{245.6033387395212, 'animal'},
 {245.5992670998837, 'chat'},
 {245.60537453402765, 'cheval'},
 {245.60944607241802, 'chèvre'},
 {245.60741031165978, 'chien'},
 {245.60741031165978, 'cochon'},
 {245.60944607241802, 'lapin'},
 {245.60944607241802, 'lièvre'},
 {245.60537453402765, 'loup'},
 {245.60944607241802, 'mouche'},
 {245.60944607241802, 'mouton'},
 {245.60741031165978, 'oie'},
 {245.60944607241802, 'oiseau'},
 {245.60944607241802, 'ours'},
 {245.60741031165978, 'poisson'},
 {245.60944607241802, 'poule'},
 {245.60741031165978, 'singe'},
 {245.60741031165978, 'souris'},
 {245.60944607241802, 'veau'}]