In [1]:
import numpy as np

In [2]:
from gensim.models import KeyedVectors
# Load the pretrained word2vec embeddings from their binary file.
# unicode_errors="ignore" lets vocabulary entries with undecodable bytes load
# instead of aborting the whole read.
model = KeyedVectors.load_word2vec_format(
    "~/Downloads/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin",
    binary=True,
    unicode_errors="ignore",
)

In [3]:
import pandas as pd

In [4]:
# Read the Grammalecte lexicon (tab-separated, first 15 lines skipped).
# The path is written relative to the home directory (pandas expands "~"),
# like the embeddings path, instead of hard-coding one user's /home folder.
lexique = pd.read_csv("~/Downloads/lexique-grammalecte-fr-v7.0.txt", skiprows=15, sep="\t")

# Sort into a new frame rather than mutating in place: rarest entries first,
# most frequent last (this is the order shown in the cell output).
lexique_sorted = lexique.sort_values("Fréquence")

# Keep an entry only when:
#   - its frequency is above 1e-5,
#   - its tag string starts with "nom", "v1"/"v2"/"v3" or "adj"
#     (str.match anchors the pattern at the start of the string),
#   - its lemma is longer than two characters,
#   - the inflected form is the lemma itself.
is_candidate = (
    (lexique_sorted.Fréquence > 1e-5)
    & lexique_sorted.Étiquettes.str.match("nom|v[123]|adj")
    & (lexique_sorted.Lemme.str.len() > 2)
    & (lexique_sorted.Flexion == lexique_sorted.Lemme)
)
dico = lexique_sorted[is_candidate]

# Drop lemmas the embedding model has no vector for.
dico = dico[dico.Lemme.isin(model.index_to_key)]
dico[["Lemme", "Fréquence"]]

Unnamed: 0,Lemme,Fréquence
91466,modénature,0.000010
91459,psychédélique,0.000010
91457,phonémique,0.000010
91456,papaye,0.000010
91444,félibre,0.000010
...,...,...
68,faire,0.142856
58,fait,0.184967
42,pas,0.275864
38,plus,0.281590


In [5]:
import requests
import time
def req_word(word, timeout=10):
    """Submit ``word`` to the Cémantix scoring endpoint and return its score.

    The word is POSTed as form data and the JSON reply is inspected:

    * if the reply carries an ``"error"`` containing "tapez trop vite"
      ("you are typing too fast"), wait 0.2 s and send the same word again;
    * otherwise return the reply's ``"score"`` value, or ``np.nan`` when the
      reply has no ``"score"`` key.

    Parameters
    ----------
    word : str
        The guess to submit.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so that a stalled connection cannot block the kernel forever.

    Returns
    -------
    float
        The score reported by the server, or ``np.nan`` if none was returned.
    """
    # Retry with a loop rather than by recursion: a long throttling period
    # would otherwise grow the call stack until RecursionError.
    while True:
        r = requests.post(
            "https://cemantix.herokuapp.com/score",
            data={"word": word},
            timeout=timeout,
        ).json()
        if "error" in r and "tapez trop vite" in r["error"]:
            time.sleep(.2)
            continue
        return r.get("score", np.nan)

In [6]:
import datetime
now = datetime.datetime.now()
# Next local midnight: truncate to the start of today, then add one day.
# Building the date with `now.day + 1` raised ValueError on the last day of a
# month (e.g. day 32); timedelta arithmetic rolls over months and years.
next_midnight = now.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
dt = next_midnight - now
print(f"sleeping for {dt}")
# Block the notebook until that moment (presumably when the next daily word
# becomes available — confirm against the game's schedule).
time.sleep(dt.total_seconds())

sleeping for 0:02:16.457210


In [7]:
import time
t = time.time()
# Candidate weights, indexed by lemma and initialised to the word frequencies.
vocab = pd.Series(dico.Fréquence.values, index=dico.Lemme)
i = 0
while len(vocab) > 0:
    i = i + 1
    # Always try the candidate that currently carries the largest weight.
    word = vocab.idxmax()
    print(f"{i:3d}. J'essaie “{word}”. ", end="")
    score = req_word(word)
    # Whatever the answer, this word is never proposed again.
    vocab = vocab.drop(index=word)
    if np.isnan(score):
        # No score came back for this word: leave the weights untouched.
        print(" Cémantix ne connaît pas ce mot.")
    elif score == 1:
        print(f" GAGNÉ en {time.time() - t:.1f} secondes")
        break
    else:
        print(f"Score: {100*score:.2f}.")
        # Model similarity between the guess and every remaining candidate.
        sims = vocab.index.map(dict(model.most_similar(word, topn=len(model))))
        # Boost candidates whose similarity to the guess is close to the score
        # the server reported; penalise those that are far from it.
        vocab = vocab / np.abs(score - sims)

  1. J'essaie “est”. Score: 18.18.
  2. J'essaie “système”. Score: 17.48.
  3. J'essaie “parlement”. Score: -2.40.
  4. J'essaie “espèce”. Score: 6.17.
  5. J'essaie “usage”. Score: 18.01.
  6. J'essaie “mode”. Score: 21.26.
  7. J'essaie “moyen”. Score: 24.22.
  8. J'essaie “fonction”. Score: 22.32.
  9. J'essaie “égal”. Score: 4.35.
 10. J'essaie “utilitaire”. Score: 9.70.
 11. J'essaie “intermédiaire”. Score: 31.53.
 12. J'essaie “forme”. Score: 26.09.
 13. J'essaie “scriptural”. Score: 19.17.
 14. J'essaie “classique”. Score: 18.51.
 15. J'essaie “ponctuel”.  GAGNÉ en 11.7 secondes
