In [1]:
import re
import gensim
import spacy
import pickle

In [2]:
# Name of the vector space of words to draw substitutions from. Each space
# represents a different discourse, i.e. a way in which language is used.
# Four options are configured further down: Trump, Balzac, Sand, Flaubert.
discourse = 'Trump'

# Maximum number of similar words proposed from the vector space for each
# word in the asserted text.
number_of_options = 15

In [3]:
# The pair of words that establishes the analogy used to find similar words
# in the vector space.
positive = u'strong'
negative = u'weak'

# The asserted text: a tweet by Barack Obama on 8 May 2018. It will be altered
# by word substitutions based on the analogy above.
# Adjacent string literals inside the parentheses are joined into one string.
assertion = (
    u"There are few issues more important to the security "
    u"of the US than the potential spread of nuclear "
    u"weapons or the potential for even more destructive "
    u"war in the Middle East. Today’s decision to put the "
    u"JCPOA at risk is a serious mistake."
)

In [4]:
# Settings for each available discourse, keyed by discourse name.
# Every value is a list that is read by position:
#   [0] the vector space of words (a saved Word2Vec model)
#   [1] a pickled dictionary of all words in the vector space with
#       part-of-speech (POS) tags
#   [2] the language of the vector space
#   [3] POS tags for words that will not be replaced in the asserted text
params = {
    # Vector space of words from all of Trump's tweets.
    'Trump': [
        'Trump_model',
        'Trump_pos_dict.pkl',
        'en',
        ('DT', 'PUNCT', 'IN'),
    ],

    # Vector space of words from 118 volumes by Balzac. The POS dictionary
    # is based on a corpus of 1,333 volumes of 19C French texts.
    'Balzac': [
        'NCF_short_author_Balzac_model',
        'NCF_pos_dict.pkl',
        'fr',
        ('DET', 'PUNCT'),
    ],

    # Vector space of words from 70 volumes by Sand.
    'Sand': [
        'NCF_short_author_Sand_model',
        'NCF_pos_dict.pkl',
        'fr',
        ('DET', 'PUNCT'),
    ],

    # Vector space of words from 30 volumes by Flaubert.
    'Flaubert': [
        'NCF_short_author_Flaubert_model',
        'NCF_pos_dict.pkl',
        'fr',
        ('DET', 'PUNCT'),
    ],
}

# Load the vector space of words for the selected discourse.
model = gensim.models.Word2Vec.load(params[discourse][0])

# Load the dictionary of vector-space words with their POS tags.
# The `with` block closes the file handle as soon as the dictionary has been
# read, instead of leaving it open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load POS dictionaries that come from a trusted source.
with open(params[discourse][1], 'rb') as pickleFile:
    posd = pickle.load(pickleFile)

In [5]:
# Parse the asserted text with the spaCy pipeline for the language of the
# selected discourse.
nlp = spacy.load(params[discourse][2])
parsed = nlp(assertion)

# Build a list of 3-tuples, one for each word in the asserted text:
# (the word in the asserted text, its POS, its lemma).
# The word and the lemma are lowercased; the POS tag is kept as spaCy gives it.
words = []
for token in parsed:
    words.append((token.text.lower(), token.tag_, token.lemma_.lower()))

# Build the invented text: for each word of the asserted text, propose
# vector-space words that stand to it as `positive` stands to `negative` and
# that carry the same POS tag. Words with proposals are rendered as
# "(option1|option2|...)"; every other word is kept as it was.
new_words = []

# POS tags for words that are never replaced (like determiners and
# punctuation), to maintain readability in the invented text. This does not
# change inside the loop, so it is looked up once.
excluded_pos = params[discourse][3]

for word in words:
    original, tag, lemma = word

    # The POS tag for a word in the asserted text (anything after '__' in
    # the tag is dropped).
    psw = tag.split('__')[0]

    # Vector-space words that will be proposed for this word.
    hits = []

    # Words with an excluded POS can never receive a proposal, so the
    # similarity query is skipped for them altogether.
    if psw not in excluded_pos:
        try:
            # Look for similar words in the vector space based on the analogy.
            candidates = model.wv.most_similar(
                positive=[positive.lower(), lemma],
                negative=[negative.lower()],
                topn=number_of_options)

            for item in candidates:
                # Does the vector-space word have a POS tag? A candidate that
                # is absent from the dictionary is skipped on its own; it no
                # longer discards the proposals already found for this word.
                tags = posd.get(item[0])
                if not tags:
                    continue

                # Only the first tag yielded by the entry is compared.
                psd = next(iter(tags)).split('__')[0]

                # Keep only candidates with the same POS as the original word.
                if psd == psw:
                    hits.append(item[0])

        except Exception:
            # If something weird happens (e.g. the lemma is not in the vector
            # space), just use the original word. Catching Exception rather
            # than everything lets KeyboardInterrupt still stop the loop.
            hits = []

    if hits:
        # At least one vector-space word with the same POS was found:
        # display the proposals in parentheses in the invented text.
        new_words.append('(' + '|'.join(hits) + ')')
    else:
        # Nothing matched: use the original word.
        new_words.append(original)

response = ' '.join(new_words)

In [6]:
# Show the original assertion, a blank line, and then the invented text, in
# which proposed substitutions appear as "(option1|option2|...)".
# These are Python 2 print statements.
print assertion, '\n'
print response

There are few issues more important to the security of the US than the potential spread of nuclear weapons or the potential for even more destructive war in the Middle East. Today’s decision to put the JCPOA at risk is a serious mistake. 

there (tell) (low) issues more (beautiful|low|happy) to the security of the (gop|maga|state) than the (low) spread of (low) (congratulation) or the potential for (then) more (exceptional) war in the (w|gop) (sunday|hotel|university) . (tomorrow|poll) ’s (gift) to put the jcpoa at (chairman|class|food) is a (important|low) (job|friend|hope) .


In [7]:
# Inspect the raw analogy neighbours of "serious" (word, similarity) without
# any POS filtering. Unlike the substitution loop, `positive` and `negative`
# are passed as-is (not lowercased) and topn is fixed at 15 rather than read
# from number_of_options.
print model.wv.most_similar(positive=[positive, u'serious'],
                            negative=[negative], topn=15)

[(u'move', 0.9992275238037109), (u'important', 0.9991986155509949), (u'tax', 0.9991898536682129), (u'yet', 0.9991799592971802), (u'pay', 0.9991798400878906), (u'bill', 0.9991753101348877), (u'us', 0.9991708993911743), (u'low', 0.9991667866706848), (u'other', 0.9991663694381714), (u'justice', 0.9991630911827087), (u'cut', 0.9991627335548401), (u'release', 0.999162495136261), (u'put', 0.9991592168807983), (u'china', 0.9991582632064819), (u'major', 0.9991567730903625)]


In [8]:
# Inspect the raw analogy neighbours of "mistake" (word, similarity) without
# any POS filtering. Unlike the substitution loop, `positive` and `negative`
# are passed as-is (not lowercased) and topn is fixed at 15 rather than read
# from number_of_options.
print model.wv.most_similar(positive=[positive, u'mistake'],
                            negative=[negative], topn=15)

[(u'now', 0.9994233250617981), (u'to', 0.9994038939476013), (u'fight', 0.999396800994873), (u'proud', 0.9993846416473389), (u'job', 0.9993821978569031), (u'start', 0.9993764758110046), (u'tough', 0.9993470907211304), (u'will', 0.9993371963500977), (u'deal', 0.9993306994438171), (u'a', 0.9993232488632202), (u'once', 0.9993203282356262), (u'friend', 0.9993187785148621), (u'man', 0.999317467212677), (u'hope', 0.9993098974227905), (u'order', 0.99930739402771)]
