In [1]:
import re
import gensim
import spacy
import pickle

In [2]:
# Which vector space of words to use. Each one represents a different
# discourse, i.e. a particular way in which language is used. The four
# available choices are 'Trump', 'Balzac', 'Sand' and 'Flaubert'; their
# settings are listed in the `params` table further down.
discourse = 'Flaubert'

# Upper bound on how many similar words are requested from the vector
# space for each word of the asserted text.
number_of_options = 15

In [3]:
# The analogy used to look up similar words in the vector space is
# defined by this pair of words: candidates are pulled towards
# `positive` and pushed away from `negative`.
positive, negative = u'bien', u'mal'

# The asserted text, taken from Baudelaire's poem "Enivrez-vous!". Its
# words will be swapped for substitutions derived from the analogy
# above. Adjacent literals are joined by the parser into one string.
assertion = (
    u"Il faut être toujours ivre. "
    u"Tout est là : c'est l'unique question. "
    u"Pour ne pas sentir l'horrible fardeau du Temps "
    u"qui brise vos épaules et vous penche vers la terre, "
    u"il faut vous enivrer sans trêve."
)

In [4]:
# Settings for every available discourse. Each value is a list whose
# slots are, in order:
#   [0] name of the saved vector space of words (the model file)
#   [1] name of the pickled dictionary giving part-of-speech (POS) tags
#       for the words in the vector space
#   [2] the language of the vector space
#   [3] POS tags of words that are never replaced in the asserted text
params = {
    # Vector space built from all of Trump's tweets.
    'Trump': [
        'Trump_model',
        'Trump_pos_dict.pkl',
        'en',
        ('DT', 'PUNCT', 'IN'),
    ],

    # The three French discourses share one POS dictionary, which is
    # based on a corpus of 1,333 volumes of 19C French texts.

    # Vector space built from 118 volumes by Balzac.
    'Balzac': [
        'NCF_short_author_Balzac_model',
        'NCF_pos_dict.pkl',
        'fr',
        ('DET', 'PUNCT'),
    ],

    # Vector space built from 70 volumes by Sand.
    'Sand': [
        'NCF_short_author_Sand_model',
        'NCF_pos_dict.pkl',
        'fr',
        ('DET', 'PUNCT'),
    ],

    # Vector space built from 30 volumes by Flaubert.
    'Flaubert': [
        'NCF_short_author_Flaubert_model',
        'NCF_pos_dict.pkl',
        'fr',
        ('DET', 'PUNCT'),
    ],
}

# Load the word-vector model selected by `discourse`.
model = gensim.models.Word2Vec.load(params[discourse][0])

# Load the POS dictionary for the same discourse. The `with` block
# closes the file as soon as it has been read, even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only use
# dictionary files that come from a trusted source.
with open(params[discourse][1], 'rb') as pickleFile:
    posd = pickle.load(pickleFile)

In [5]:
nlp = spacy.load(params[discourse][2])
parsed = nlp(assertion)

# Build a list of 3-tuples, one for each token of the asserted text:
# (the lower-cased token text, its spaCy tag, its lower-cased lemma)
words = [(w.text.lower(), w.tag_, w.lemma_.lower()) for w in parsed]

# POS tags whose words are kept as they are (like determiners and
# punctuation: see `params`) to maintain readability in the invented
# text.
excluded_pos = params[discourse][3]

new_words = []

for text, tag, lemma in words:
    # The POS of the token is whatever precedes '__' in its tag.
    pos = tag.split('__')[0]

    if pos in excluded_pos:
        # This test does not depend on the candidates, so it is done
        # first and the vector-space query is skipped entirely.
        new_words.append(text)
        continue

    try:
        # Look for words similar to the lemma in the vector space,
        # shifted by the positive/negative analogy.
        candidates = model.wv.most_similar(positive=[positive.lower(),
                                                     lemma],
                                           negative=[negative.lower()],
                                           topn=number_of_options)
    except KeyError:
        # gensim raises KeyError when a word is not in the model's
        # vocabulary; in that case keep the original word. Any other
        # error (or an interrupt) is deliberately allowed to surface
        # instead of being silently swallowed.
        new_words.append(text)
        continue

    # Vector-space words that have the same POS as the original token.
    hits = []
    for item in candidates:
        candidate = item[0]

        # Does the vector-space word have a POS tag? Using .get means a
        # candidate missing from the dictionary is simply skipped rather
        # than discarding the hits already found for this token.
        # NOTE(review): assumes posd is dict-like and maps a word to a
        # non-empty iterable of tag strings -- confirm against the
        # pickle's producer.
        candidate_tags = posd.get(candidate)
        if not candidate_tags:
            continue

        # Only the first tag yielded for the candidate is compared.
        candidate_pos = next(iter(candidate_tags)).split('__')[0]
        if candidate_pos == pos:
            hits.append(candidate)

    if hits:
        # At least one same-POS word was found: display the alternatives
        # in parentheses in the invented text.
        new_words.append('(' + '|'.join(hits) + ')')
    else:
        # Nothing matched, so use the original word.
        new_words.append(text)

response = ' '.join(new_words)

In [6]:
# Show the original assertion, a blank line, then the invented text.
# Each call passes a single argument so the cell runs unchanged on both
# Python 2 and Python 3; the ' \n' suffix reproduces the separator and
# blank line that the former `print assertion, '\n'` statement emitted.
print(assertion + ' \n')
print(response)

Il faut être toujours ivre. Tout est là : c'est l'unique question. Pour ne pas sentir l'horrible fardeau du Temps qui brise vos épaules et vous penche vers la terre, il faut vous enivrer sans trêve. 

(je|on|a) (espérer|désirer|décider) être (aussi|très|quelquefois|plutôt|néanmoins|complètement|bientôt|comment) (fougueux|insociable) . (seul|vingt-quatre|impossible) est là : c' est l' unique (histoire|phraser|article|science|oeuvrer|providence) . (chez) (peut-être|n|plutôt|davantage|certainement|comment) (jamais|nullement|plaire|plutôt|peut-être) (rendre|sembler|exprimer) l' (antithèse|inspiration|intimit) (idiome|tyrannie|fiction|déguisement|timidit) du (jour|partir|semaine|minuter|dimanche|mois|moment) (elles) (rouler|retomber|siffler|croiser) vos épaules (mai) vous penche (vers) la (intervalle|banc|muraille|remonter|ténèbres|fenêtre) , (je|on|a) (espérer|désirer|décider) vous (préalablement|panser|effaroucher) sans trêve .
