In [90]:
%load_ext autoreload
%autoreload 2
from convergence import load_files as lf # contains functions to load vector files
import pickle 
from nltk import WordNetLemmatizer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# What words will the robot understand?
To create the bot's vocabulary (the words it can understand), I used pre-trained GloVe word vectors, which were trained on a corpus consisting of <a href="https://catalog.ldc.upenn.edu/LDC2011T07">Gigaword</a> and <a href="https://dumps.wikimedia.org/backup-index.html">2014 Wikimedia dumps</a>.

In [20]:
# Location of the pre-trained vector file (glove.6B.300d.txt).
GLOVE_VECTORS_URL = 'https://convergencerobot.s3-us-west-2.amazonaws.com/glove.6B.300d.txt'

# Load the word -> vector mapping that serves as the bot's vocabulary.
model = lf.load_vectors(GLOVE_VECTORS_URL)

Loading...
Done. 400000  words loaded!


In [25]:
# Turn every stored vector into a single-row 2-D array, updating the existing
# mapping in place (keys are snapshotted first, values are replaced one by one).
for vocab_word in list(model.keys()):
    model[vocab_word] = model[vocab_word].reshape(1, -1)

# Spot-check one entry; the last expression is shown as the cell output.
model['yo'].shape

(1, 300)

In [9]:
# Optional: pickle the model locally (left commented out; uncomment to run).
# The file is opened in a `with` block so the handle is closed after the dump.
# filename = 'pre_trained_model.pkl'
# with open(filename, 'wb') as f:
#     pickle.dump(model, f)

Now the GloVe model is loaded. The commented-out code above shows how to pickle it locally if you want to; there is also a <a href="https://convergencerobot.s3-us-west-2.amazonaws.com/pre_trained_model.pkl">saved version online</a>.

# What words will the robot say?

The robot will compute the exact average of the word vectors, so its responses would be far too obscure if I let it respond with any of the 400,000 words. Instead, I'm going to borrow a list of <a href="https://github.com/first20hours/google-10000-english">10,000 common English words</a>, which is based on Google's Trillion Word Corpus.

In [42]:
# Location of the 10,000-common-words list.
COMMON_WORDS_URL = 'https://convergencerobot.s3-us-west-2.amazonaws.com/google-10000-english-usa.txt'

# NOTE(review): the value bound here is replaced in the next cell, which rebuilds
# `common_words` from data/google-10000-english-usa.txt. Presumably download()
# saves the file to that path -- confirm against load_files.
common_words = lf.download(COMMON_WORDS_URL)

In [44]:
# Read the word list from disk: one word per line, surrounding whitespace removed.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
with open('data/google-10000-english-usa.txt', encoding='utf-8') as f:
    common_words = [line.strip() for line in f]

# Number of words read (displayed as the cell output).
len(common_words)

10000

In [52]:
# Lemmatize every common word, preserving the original order and any duplicates
# that arise when several words share a lemma.
wl = WordNetLemmatizer()
lemmas = [wl.lemmatize(common_word) for common_word in common_words]

# Inspect the tail of the lemma list.
lemmas[9030:]

['grateful',
 'emerald',
 'gradually',
 'laughing',
 'grows',
 'cliff',
 'desirable',
 'tract',
 'ul',
 'ballet',
 'ol',
 'journalist',
 'abraham',
 'j',
 'bumper',
 'afterwards',
 'webpage',
 'religion',
 'garlic',
 'hostel',
 'shine',
 'senegal',
 'explosion',
 'pn',
 'banned',
 'wendy',
 'brief',
 'signature',
 'diffs',
 'cove',
 'mumbai',
 'ozone',
 'discipline',
 'casa',
 'mu',
 'daughter',
 'conversation',
 'radio',
 'tariff',
 'nvidia',
 'opponent',
 'pasta',
 'simplified',
 'muscle',
 'serum',
 'wrapped',
 'swift',
 'motherboard',
 'runtime',
 'inbox',
 'focal',
 'bibliographic',
 'vagina',
 'eden',
 'distant',
 'incl',
 'champagne',
 'ala',
 'decimal',
 'hq',
 'deviation',
 'superintendent',
 'propecia',
 'dip',
 'nbc',
 'samba',
 'hostel',
 'housewife',
 'employ',
 'mongolia',
 'penguin',
 'magical',
 'influence',
 'inspection',
 'irrigation',
 'miracle',
 'manually',
 'reprint',
 'reid',
 'wt',
 'hydraulic',
 'centered',
 'robertson',
 'flex',
 'yearly',
 'penetration',
 'wo

In [58]:
# Collapse the lemma list to its distinct entries. Several words can share a
# lemma, so this set is smaller than the original word list.
unique_lemmas = set(lemmas)

# Only the last expression of a cell is displayed, so the count of distinct
# lemmas is the single value shown here.
len(unique_lemmas)

8420

In [59]:
# Split the unique lemmas by whether the loaded model has a vector for them.
valid_lemmas = []
invalid_lemmas = []

for lemma in unique_lemmas:
    # Route each lemma to the list matching its vocabulary membership.
    bucket = valid_lemmas if lemma in model else invalid_lemmas
    bucket.append(lemma)

# Show what was excluded.
# mostly little-used pornographic words so i'm fine with them being excluded
# NOTE(review): some entries look like lemmatizer artifacts rather than junk
# (e.g. 'cooky', presumably from 'cookies') -- confirm whether that matters.
invalid_lemmas

['mailto',
 'beastality',
 'pichunter',
 'cumshot',
 'trembl',
 'listprice',
 'diffs',
 'handjob',
 'titten',
 'acdbentity',
 'milfhunter',
 'bufing',
 'knowledgestorm',
 'holdem',
 'ultram',
 'worldsex',
 'dealtime',
 'livesex',
 'postposted',
 'adipex',
 'blowjob',
 'starsmerchant',
 'powerseller',
 'findarticles',
 'msgid',
 'uniprotkb',
 'xnxx',
 'milfs',
 'techrepublic',
 'feof',
 'prostores',
 'vsnet',
 'transexuales',
 'upskirts',
 'handjobs',
 'basename',
 'ampland',
 'sublimedirectory',
 'msgstr',
 'expansys',
 'cumshots',
 'thumbzilla',
 'beastiality',
 'viewpicture',
 'jelsoft',
 'shemales',
 'voyuer',
 'href',
 'upskirt',
 'shemale',
 'trackback',
 'sbjct',
 'trackbacks',
 'cooky',
 'twinks',
 'thehun',
 'telecharger',
 'voyeurweb',
 'sexcam',
 'smilies',
 'permalink',
 'changelog',
 'endif',
 'bangbus',
 'fioricet',
 'verzeichnis',
 'qty',
 'gratuit',
 'msie',
 'livecam']

In [61]:
# Collect the vector for every valid lemma. The membership test repeats the
# check that built valid_lemmas, so unvectorized_common_words is expected to be
# empty when the cells are run in order; it is kept as a safety net.
common_word_vectors = {
    lemma: model[lemma] for lemma in valid_lemmas if lemma in model
}
unvectorized_common_words = [
    lemma for lemma in valid_lemmas if lemma not in model
]

In [87]:
# Demonstrate that the well-known word 'crumb' has no entry. The KeyError is
# caught and printed so that "Restart & Run All" continues past this cell
# instead of stopping on an uncaught exception.
try:
    crumb_vector = common_word_vectors['crumb']
except KeyError as err:
    print('KeyError:', err)
else:
    # Only reached if the word list changes and 'crumb' becomes available.
    print(crumb_vector)

KeyError: 'crumb'

In [88]:
# In contrast to 'crumb', the proper name 'brian' is in the common word list;
# display its vector.
common_word_vectors['brian']

array([[ 5.1011e-01, -1.7088e-01, -3.3469e-01,  7.1019e-02,  2.9698e-01,
         6.2398e-02, -3.9751e-02,  7.4622e-02, -3.3494e-01,  3.0440e-01,
         2.4743e-01, -3.5135e-01, -4.4545e-01, -1.8472e-01, -4.9552e-01,
         2.6418e-01, -3.8347e-01, -1.0576e-01,  2.1239e-01,  4.4608e-01,
        -1.2450e-01,  3.6269e-01, -1.7649e-01, -3.4512e-02,  3.2171e-01,
         1.2862e-01,  2.4097e-01, -2.5410e-02, -6.4458e-02, -4.0199e-01,
        -4.5147e-01, -2.5069e-01,  2.6809e-01, -4.5598e-02, -1.5802e+00,
        -1.4125e-01,  5.3310e-01,  5.5547e-01,  1.5146e-01,  2.9437e-02,
        -8.7299e-02,  1.0234e-01,  6.6576e-01,  4.8379e-01, -1.4345e-01,
         7.6252e-02, -4.0929e-01, -2.6415e-01,  4.3359e-03,  1.7206e-01,
        -2.3239e-01,  8.5678e-02, -2.4500e-02,  4.5492e-01, -1.0726e-01,
         6.4931e-01,  6.6062e-01,  3.2272e-01,  5.0098e-02, -3.4802e-01,
        -3.2089e-02,  2.6680e-03, -3.3535e-01,  8.1228e-01,  6.2673e-02,
        -4.2730e-01,  9.0446e-02, -2.6053e-01,  1.0

There are obviously some issues: a well-known word like "crumb" is not in the common word list, whereas the name "Brian" is. However, the more options there are to converge on, the longer it will likely take to converge, if you converge at all. I would need to collect evaluation data (which I'm unable to collect right now) to find the optimal <i>k</i> bot responses, so for now our 8,000+ words will do.

In [26]:
# Optional: pickle the common word vectors locally (left commented out; uncomment to run).
# This is also available at https://convergencerobot.s3-us-west-2.amazonaws.com/common_word_vectors.pkl
# The file is opened in a `with` block so the handle is closed after the dump.
# file_name = 'common_word_vectors.pkl'
# with open(file_name, 'wb') as f:
#     pickle.dump(common_word_vectors, f)

Now the common word vectors are loaded. The commented-out code above shows how to pickle them locally if you want to; there is also a <a href="https://convergencerobot.s3-us-west-2.amazonaws.com/common_word_vectors.pkl">saved version online</a>.