In [74]:
import pandas as pd #might want to use it
import numpy as np #working with arrays/matrices after all! 
import random #random choice for bot's first word
from sklearn.metrics.pairwise import cosine_similarity
import convergence_resources as cr
import nltk
from nltk.stem import WordNetLemmatizer

# Using GloVe

I used word vectors from a pre-trained set of data consisting of <a href="https://catalog.ldc.upenn.edu/LDC2011T07">Gigawords</a> and Wikimedia dumps.



In [75]:
def load_glove_model(file):
    print("Loading...")
    f = open(file,'r')
    model = {}
    for line in f:
        whole_line = line.split()
        words = whole_line[0]
        embedding = np.array([float(val) for val in whole_line[1:]])
        model[words] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [76]:
model = load_glove_model('glove.6B/glove.6B.300d.txt')

Loading...
Done. 400000  words loaded!


# Common English Words

Create a list of 10,000 common english words from https://github.com/first20hours/google-10000-english.
Think about adding popular companies, etc. to list later! 

In [77]:
with open('google-10000-english/google-10000-english-usa.txt') as f:
    cwordlist = []
    for line in f:
        cwordlist.append(line.strip())
cwordlist

['the',
 'of',
 'and',
 'to',
 'a',
 'in',
 'for',
 'is',
 'on',
 'that',
 'by',
 'this',
 'with',
 'i',
 'you',
 'it',
 'not',
 'or',
 'be',
 'are',
 'from',
 'at',
 'as',
 'your',
 'all',
 'have',
 'new',
 'more',
 'an',
 'was',
 'we',
 'will',
 'home',
 'can',
 'us',
 'about',
 'if',
 'page',
 'my',
 'has',
 'search',
 'free',
 'but',
 'our',
 'one',
 'other',
 'do',
 'no',
 'information',
 'time',
 'they',
 'site',
 'he',
 'up',
 'may',
 'what',
 'which',
 'their',
 'news',
 'out',
 'use',
 'any',
 'there',
 'see',
 'only',
 'so',
 'his',
 'when',
 'contact',
 'here',
 'business',
 'who',
 'web',
 'also',
 'now',
 'help',
 'get',
 'pm',
 'view',
 'online',
 'c',
 'e',
 'first',
 'am',
 'been',
 'would',
 'how',
 'were',
 'me',
 's',
 'services',
 'some',
 'these',
 'click',
 'its',
 'like',
 'service',
 'x',
 'than',
 'find',
 'price',
 'date',
 'back',
 'top',
 'people',
 'had',
 'list',
 'name',
 'just',
 'over',
 'state',
 'year',
 'day',
 'into',
 'email',
 'two',
 'health',
 '

In [138]:
#look at the word stems 
wl = WordNetLemmatizer()

In [139]:
lemmas = []
for word in cwordlist: 
    lemmas.append(wl.lemmatize(word))
lemmas[9030:]

['grateful',
 'emerald',
 'gradually',
 'laughing',
 'grows',
 'cliff',
 'desirable',
 'tract',
 'ul',
 'ballet',
 'ol',
 'journalist',
 'abraham',
 'j',
 'bumper',
 'afterwards',
 'webpage',
 'religion',
 'garlic',
 'hostel',
 'shine',
 'senegal',
 'explosion',
 'pn',
 'banned',
 'wendy',
 'brief',
 'signature',
 'diffs',
 'cove',
 'mumbai',
 'ozone',
 'discipline',
 'casa',
 'mu',
 'daughter',
 'conversation',
 'radio',
 'tariff',
 'nvidia',
 'opponent',
 'pasta',
 'simplified',
 'muscle',
 'serum',
 'wrapped',
 'swift',
 'motherboard',
 'runtime',
 'inbox',
 'focal',
 'bibliographic',
 'vagina',
 'eden',
 'distant',
 'incl',
 'champagne',
 'ala',
 'decimal',
 'hq',
 'deviation',
 'superintendent',
 'propecia',
 'dip',
 'nbc',
 'samba',
 'hostel',
 'housewife',
 'employ',
 'mongolia',
 'penguin',
 'magical',
 'influence',
 'inspection',
 'irrigation',
 'miracle',
 'manually',
 'reprint',
 'reid',
 'wt',
 'hydraulic',
 'centered',
 'robertson',
 'flex',
 'yearly',
 'penetration',
 'wo

In [115]:
len(lemmas)

10000

In [120]:
lemma_series = pd.Series(lemmas)

In [141]:
#find only unique lemmas 
unique_lemmas = lemma_series.unique()
list(unique_lemmas)

#make sure all lemmas are in the vocabulary
valid_lemmas = []
invalid_lemmas = [] 

for word in unique_lemmas:
    if word in model.keys():
        valid_lemmas.append(word)
    else:
        invalid_lemmas.append(word)

invalid_lemmas

['shemale',
 'permalink',
 'trackback',
 'verzeichnis',
 'blowjob',
 'holdem',
 'sexcam',
 'milfhunter',
 'qty',
 'uniprotkb',
 'beastiality',
 'upskirt',
 'milfs',
 'cooky',
 'livecam',
 'cumshot',
 'postposted',
 'cumshots',
 'trembl',
 'dealtime',
 'titten',
 'jelsoft',
 'expansys',
 'upskirts',
 'starsmerchant',
 'ampland',
 'twinks',
 'handjob',
 'transexuales',
 'prostores',
 'xnxx',
 'thumbzilla',
 'pichunter',
 'thehun',
 'livesex',
 'worldsex',
 'msgid',
 'msgstr',
 'adipex',
 'sublimedirectory',
 'viewpicture',
 'href',
 'endif',
 'trackbacks',
 'shemales',
 'voyeurweb',
 'gratuit',
 'mailto',
 'acdbentity',
 'bangbus',
 'listprice',
 'knowledgestorm',
 'handjobs',
 'smilies',
 'findarticles',
 'vsnet',
 'msie',
 'techrepublic',
 'diffs',
 'feof',
 'basename',
 'bufing',
 'sbjct',
 'powerseller',
 'changelog',
 'fioricet',
 'voyuer',
 'beastality',
 'ultram',
 'telecharger']

In [158]:
#find small lemmas to make sure they are valid words 
lemmas_df = pd.DataFrame(valid_lemmas)

In [163]:
lemmas_df.columns = ['lemma']
lemmas_df.head()

Unnamed: 0,lemma
0,the
1,of
2,and
3,to
4,a


In [172]:
short_lemmas = lemmas_df[lemmas_df['lemma'].apply(lambda x: len(x) < 3)]
len(short_lemmas), short_lemmas.tail()
#I'm going to leave these for now, hoping that the vectors never result in these 405 words unless it must 

(405,      lemma
 8186    lu
 8223    lm
 8283    cz
 8300    hl
 8311    ob)

In [173]:
common_word_vectors = {}
unvectorized_common_words = []
for word in lemmas:
    if word in model.keys():
        common_word_vectors[word] = model[word]
    else:
        unvectorized_common_words.append(word)
unvectorized_common_words

['shemale',
 'permalink',
 'trackback',
 'verzeichnis',
 'blowjob',
 'holdem',
 'sexcam',
 'milfhunter',
 'qty',
 'uniprotkb',
 'beastiality',
 'upskirt',
 'milfs',
 'blowjob',
 'cooky',
 'livecam',
 'cumshot',
 'postposted',
 'cumshots',
 'trembl',
 'dealtime',
 'titten',
 'jelsoft',
 'expansys',
 'upskirts',
 'starsmerchant',
 'ampland',
 'twinks',
 'handjob',
 'transexuales',
 'prostores',
 'xnxx',
 'thumbzilla',
 'pichunter',
 'thehun',
 'livesex',
 'worldsex',
 'msgid',
 'msgstr',
 'adipex',
 'sublimedirectory',
 'viewpicture',
 'href',
 'endif',
 'trackbacks',
 'shemales',
 'voyeurweb',
 'gratuit',
 'mailto',
 'acdbentity',
 'bangbus',
 'listprice',
 'knowledgestorm',
 'handjobs',
 'smilies',
 'findarticles',
 'vsnet',
 'msie',
 'techrepublic',
 'diffs',
 'feof',
 'basename',
 'bufing',
 'sbjct',
 'powerseller',
 'changelog',
 'fioricet',
 'voyuer',
 'beastality',
 'ultram',
 'telecharger']

In [174]:
#reshape the dictionary arrays 
common_word_vectors['truth'].shape

(1, 300)

In [175]:
for key, value in common_word_vectors.items():
    common_word_vectors[key] = value.reshape(1,-1)
    
for key, value in model.items():
    model[key] = value.reshape(1,-1)

common_word_vectors['truth'].shape

(1, 300)

# Create Functions

In [187]:
def converge(user_input=str, bot_input=str, exclude=None):
    """Return the "average word" of the input words."""
    if exclude is None:
        exclude = set()
    exclude.add(user_input)
    exclude.add(bot_input)  
    mean_vector = ((6*model[user_input] + 2*model[bot_input])/2)
    cos_sim_dict = {}
    response_options_dict = {key: common_word_vectors[key] for key in common_word_vectors 
                             if key not in exclude}
    for word, vector in response_options_dict.items():
        cos_sim_dict[float(cosine_similarity(mean_vector, vector))] = word
    max_cos_sim = max(cos_sim_dict.keys())
    bot_response = cos_sim_dict[max_cos_sim]
    return bot_response

In [251]:
def play_round(user_input, user_history=None, bot_history=None):
    if bot_history is None:
        user_history = []
        bot_history = []
        bot_response = random.choice(lemmas)
    else:
        bot_response = converge(user_history[-1], bot_history[-1], 
                                exclude=set(user_history + bot_history))
    user_history.append(user_input)
    bot_history.append(bot_response)
    print(f"You said {user_input}, but I said {bot_response}.")
    print(f"What's the convergence of {user_input} and {bot_response}?")   
    print(f"Remember, don't repeat what you said: {user_history} or what I said: {bot_history}")
    return {
        'user_history': user_history,
        'bot_history': bot_history,
        'bot_response': bot_response,
    }

In [254]:
def play_convergence():
    print("Let's play convergence. Type a word!") 
    user_input = input()
    round_results = play_round(user_input)
    while True:
        user_input = input()
        round_results = play_round(user_input)
        if user_input == round_results['bot_response']:
            print(f'CONVERGENCE!! You said {user_input}, and I said {bot_response}!!! WE GOT CONVERGENCE!!!!')
            break

In [255]:
play_convergence()

Let's play convergence. Type a word!
try
You said try, but I said marvel.
What's the convergence of try and marvel?
Remember, don't repeat what you said: ['try'] or what I said: ['marvel']
awe
You said awe, but I said davis.
What's the convergence of awe and davis?
Remember, don't repeat what you said: ['awe'] or what I said: ['davis']


KeyboardInterrupt: 

In [None]:
play_convergence