In [None]:
import re
import pickle
import gensim
import spacy

In [None]:
# Name of the vector space of words to draw substitutions from. Each
# vector space represents a different discourse, i.e. a way in which
# language is used. The five valid values are the keys of the `params`
# dictionary defined below: 'Flaubert', 'Sand', 'Balzac', 'Trump' and
# 'RussianTrolls'.
discourse = "RussianTrolls"

In [None]:
# The asserted text: a tweet by Barack Obama posted August 12, 2017,
# quoting Nelson Mandela. Its words will be altered by substitutions
# based on the analogy defined just below.
assertion = (
    u"No one is born hating another person because of the "
    u"color of his skin or his background or his religion. People "
    u"must learn to hate, and if they can learn to hate, they can "
    u"be taught to love. For love comes more naturally to the "
    u"human heart than its opposite."
)

# The analogy used to find similar words in the vector space: the
# `positive` words are added to the query and the `negative` words are
# subtracted from it.
positive = [u'white', u'charlottesville']
negative = [u'minority']

In [None]:
# Settings for each available discourse, keyed by the name that can be
# assigned to `discourse`. Every value is a four-item list:
#   [0] the vector space of words (a saved model)
#   [1] a pickled dictionary of all words in the vector space with
#       part-of-speech (POS) tags
#   [2] the language of the vector space
#   [3] POS tags for words that will not be replaced in the asserted
#       text
params = dict(
    # vector space of words from 30 volumes by Flaubert
    Flaubert=['NCF_short_author_Flaubert_model', 'NCF_pos_dict.pkl',
              'fr', ('DET', 'PUNCT')],

    # vector space of words from 118 volumes by Balzac
    Balzac=['NCF_short_author_Balzac_model', 'NCF_pos_dict.pkl',
            'fr', ('DET', 'PUNCT')],

    # vector space of words from 70 volumes by Sand
    Sand=['NCF_short_author_Sand_model', 'NCF_pos_dict.pkl',
          'fr', ('DET', 'PUNCT')],

    # a vector space of words from all of Trump's tweets
    Trump=['Trump_model', 'Trump_pos_dict.pkl',
           'en', ('DT', 'PUNCT', 'IN')],

    # a vector space of words from the Russian Troll tweets shared by
    # fivethirtyeight
    RussianTrolls=['Russians_model', 'Russians_pos_dict.pkl',
                   'en', ('DT', 'PUNCT', 'IN')],
)

# The maximum number of similar words proposed from the vector space
# for each word in the asserted text.
number_of_options = 15

In [None]:
# Load the vector space of words for the selected discourse.
model = gensim.models.Word2Vec.load(params[discourse][0])

# Load the dictionary of POS tags for the words in the vector space.
# The `with` block guarantees the file handle is closed afterwards.
# NOTE(review): pickle.load can execute arbitrary code while
# unpickling; only load dictionaries that come from a trusted source.
with open(params[discourse][1], 'rb') as pickleFile:
    posd = pickle.load(pickleFile)

# POS tags of words that must never be replaced (like determiners and
# punctuation: see `params`), to maintain readability in the invented
# text.
excluded_pos = params[discourse][3]

nlp = spacy.load(params[discourse][2])
parsed = nlp(assertion)
words = [(w.text.lower(), w.tag_, w.lemma_.lower()) for w in parsed]
# Build a list of 3-tuples for each word in the asserted text:
# (the word in the asserted text, its POS, its lemma)

new_words = []

for word in words:
    hits = []
    # The vector space words that are similar to this word of the
    # asserted text and share its POS.

    psw = word[1].split('__')[0]
    # The POS tag for a word in the asserted text (only the part of
    # the tag before any '__' separator is kept).

    if psw not in excluded_pos:
    # This test does not depend on the candidates, so it is done once
    # per word, before querying the vector space at all.

        try:
            for item in model.wv.most_similar(positive=positive + [word[2]],
                                              negative=negative,
                                              topn=number_of_options):
            # Take each word in the asserted text and look for similar
            # words in the vector space based on the analogy.

                tags = posd.get(item[0])
                # Does the vector-space word have a POS tag? A word
                # that is absent from the dictionary is skipped instead
                # of discarding the hits already found for this word.
                if not tags:
                    continue

                psd = next(iter(tags)).split('__')[0]

                if psw == psd:
                # Only keep words from the vector space that are the
                # same POS as the original word in the asserted text.
                    hits.append(item[0])

        except Exception:
        # Best effort: if something weird happens (typically a
        # KeyError because the lemma is not in the vector space), just
        # use the original word. Unlike a bare `except:`, this does not
        # swallow KeyboardInterrupt or SystemExit, so the cell can
        # still be interrupted.
            hits = []

    if hits:
    # Did we find at least one vector space word with the same POS?
    # If so, display them in parentheses in the invented text.
        new_words.append('(' + '|'.join(hits) + ')')
    else:
    # If we found nothing that matches, use the original word.
        new_words.append(word[0])

response = ' '.join(new_words)

In [None]:
# Show the original assertion, a blank line, then the invented text.
# The single-argument, parenthesised form of print behaves the same on
# Python 2 and Python 3 (the print statement is a SyntaxError on 3).
print(assertion + '\n')
print(response)

In [None]:
# Query the vector space directly with the analogy
# woman + father - man and show the 15 most similar words.
# print(...) with a single argument works on both Python 2 and 3.
print(model.wv.most_similar(positive=[u'woman', u'father'],
                            negative=[u'man'], topn=15))

In [None]:
# Show the 15 words most similar to the configured analogy with
# u'charlottesville' added to the positive words.
# NOTE(review): if `positive` is left as defined earlier in the
# notebook it already contains u'charlottesville', so the word is
# passed twice here; confirm that is intended.
# print(...) with a single argument works on both Python 2 and 3.
print(model.wv.most_similar(positive=positive + [u'charlottesville'],
                            negative=negative, topn=15))

In [None]:
# Show the 15 words most similar to the configured analogy with
# u'charlottesville' added to the negative words.
# NOTE(review): if `positive` is left as defined earlier in the
# notebook, u'charlottesville' then appears in both the positive and
# the negative list; confirm that is intended.
# print(...) with a single argument works on both Python 2 and 3.
print(model.wv.most_similar(positive=positive,
                            negative=negative + [u'charlottesville'], topn=15))