In [354]:
from pulp import *
import gensim
import numpy as np

In [355]:
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


In [356]:
dict={'word':wordsList,'wordvec':wordVectors}


In [357]:
sent_1="I told my dad to go after my siblings"
sent_2 ="Mom asked me to get my brother and sister from work"

In [358]:
#pre-process the data. tokenize->remove stop words ->lemma
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en')

def spacy_filter(msg):
    
    msg=msg.lower()
    msg=nlp(msg)
    list=[w.lemma_ for w in msg if not (w.pos_ in ['DET','PUNCT','SYM','PRON'] or w in STOP_WORDS)]
    list=[w for w in list if not (w in STOP_WORDS)]

    #new_msg=' '.join(list)
    #print ('hello',new_msg)
    ##input()

    return list





In [359]:
#pre-processing functions
filtered_sent_2=spacy_filter(sent_2)
filtered_sent_1=spacy_filter(sent_1)



In [360]:
#smaller length sent to s1
if len(filtered_sent_1)>len(filtered_sent_2):
    s1=filtered_sent_2
    s2=filtered_sent_1
else:
    s2=filtered_sent_2
    s1=filtered_sent_1

In [361]:
s2

['mom', 'ask', '-PRON-', 'brother', 'sister', 'work']

In [362]:
#checks for the cossine similarity
def check_simil(a,b):
    return np.dot(wordVectors[wordsList.index(a)], wordVectors[wordsList.index(b)])/(np.linalg.norm(wordVectors[wordsList.index(a)])* np.linalg.norm(wordVectors[wordsList.index(b)]))

In [363]:
def returnWordVec(word,wordsList=wordsList,wordVectors=wordVectors):
    try:
        i=wordsList.index(word)
        return wordVectors[i,:]
    except Exception as e:
        print ("word not found")
            

In [364]:
def returnWordVec(word,wordsList=wordsList,wordVectors=wordVectors):
    try:
        i=wordsList.index(word)
        return True
    except Exception as e:
        return False

In [365]:
wordVectors[1,:]

array([ 0.013441  ,  0.23682   , -0.16899   ,  0.40950999,  0.63812   ,
        0.47709   , -0.42851999, -0.55641001, -0.36399999, -0.23938   ,
        0.13000999, -0.063734  , -0.39574999, -0.48162001,  0.23291001,
        0.090201  , -0.13324   ,  0.078639  , -0.41633999, -0.15428001,
        0.10068   ,  0.48890999,  0.31226   , -0.1252    , -0.037512  ,
       -1.51789999,  0.12612   , -0.02442   , -0.042961  , -0.28351   ,
        3.54159999, -0.11956   , -0.014533  , -0.1499    ,  0.21864   ,
       -0.33412001, -0.13872001,  0.31806001,  0.70358002,  0.44858   ,
       -0.080262  ,  0.63002998,  0.32111001, -0.46765   ,  0.22786   ,
        0.36034   , -0.37818   , -0.56656998,  0.044691  ,  0.30392   ], dtype=float32)

In [366]:
returnWordVec("Silicon")


False

In [367]:
#removing words not present in the wordVector list
for w1 in s1:
    if(returnWordVec(w1)==False):
            s1.remove(w1)
            
for w2 in s2:

    if(returnWordVec(w2)==False):
        s2.remove(w2)

In [368]:
def setCost(s1,s2):
    cost=np.zeros((len(s1),len(s2)))
    for w1 in s1:
        for w2 in s2:
            
            cost[s1.index(w1)][s2.index(w2)]=check_simil(w1,w2)
    return cost       

In [369]:
costs=setCost(s1,s2)

In [370]:
s1


['tell', 'dad', 'sibling']

In [371]:
s2

['mom', 'ask', 'brother', 'sister', 'work']

In [372]:
"""
s1=['silicon','gelato','velvety']
s2=['italian','ice-cream','rich']
costs=[
        [1,5,9],
        [8,1,8],
        [7,2,1]
]
"""

"\ns1=['silicon','gelato','velvety']\ns2=['italian','ice-cream','rich']\ncosts=[\n        [1,5,9],\n        [8,1,8],\n        [7,2,1]\n]\n"

In [373]:
#taking np.abs to compare just the magnitudes and no direction in the vector space
cost_dict=makeDict([s1,s2],np.abs(costs),0)


In [374]:
costs


array([[ 0.74576539,  0.9085933 ,  0.53068131,  0.52667749,  0.64633483],
       [ 0.90403229,  0.60015941,  0.67111242,  0.62630081,  0.41700187],
       [ 0.42191681,  0.12228175,  0.39654291,  0.55023712,  0.10764812]])

In [375]:
prob = LpProblem("Word assign Problem",LpMaximize)

In [376]:
Mapping = [(w,b) for w in s1 for b in s2]

In [377]:
Mapping


[('tell', 'mom'),
 ('tell', 'ask'),
 ('tell', 'brother'),
 ('tell', 'sister'),
 ('tell', 'work'),
 ('dad', 'mom'),
 ('dad', 'ask'),
 ('dad', 'brother'),
 ('dad', 'sister'),
 ('dad', 'work'),
 ('sibling', 'mom'),
 ('sibling', 'ask'),
 ('sibling', 'brother'),
 ('sibling', 'sister'),
 ('sibling', 'work')]

In [378]:
vars = LpVariable.dicts("cost",(s1,s2),lowBound=0, upBound=1, cat='Integer')

In [379]:
vars

{'dad': {'ask': cost_dad_ask,
  'brother': cost_dad_brother,
  'mom': cost_dad_mom,
  'sister': cost_dad_sister,
  'work': cost_dad_work},
 'sibling': {'ask': cost_sibling_ask,
  'brother': cost_sibling_brother,
  'mom': cost_sibling_mom,
  'sister': cost_sibling_sister,
  'work': cost_sibling_work},
 'tell': {'ask': cost_tell_ask,
  'brother': cost_tell_brother,
  'mom': cost_tell_mom,
  'sister': cost_tell_sister,
  'work': cost_tell_work}}

In [380]:
prob += lpSum([vars[w][b]*cost_dict[w][b] for (w,b) in Mapping])

In [381]:
for w in s1:
    prob += lpSum([vars[w][b] for b in s2])==1

In [382]:
result=prob.solve()

In [383]:
prob

Word assign Problem:
MAXIMIZE
0.600159406662*cost_dad_ask + 0.671112418175*cost_dad_brother + 0.904032289982*cost_dad_mom + 0.626300811768*cost_dad_sister + 0.417001873255*cost_dad_work + 0.122281752527*cost_sibling_ask + 0.396542906761*cost_sibling_brother + 0.421916812658*cost_sibling_mom + 0.550237119198*cost_sibling_sister + 0.10764811933*cost_sibling_work + 0.908593297005*cost_tell_ask + 0.530681312084*cost_tell_brother + 0.745765388012*cost_tell_mom + 0.526677489281*cost_tell_sister + 0.646334826946*cost_tell_work + 0.0
SUBJECT TO
_C1: cost_tell_ask + cost_tell_brother + cost_tell_mom + cost_tell_sister
 + cost_tell_work = 1

_C2: cost_dad_ask + cost_dad_brother + cost_dad_mom + cost_dad_sister
 + cost_dad_work = 1

_C3: cost_sibling_ask + cost_sibling_brother + cost_sibling_mom
 + cost_sibling_sister + cost_sibling_work = 1

VARIABLES
0 <= cost_dad_ask <= 1 Integer
0 <= cost_dad_brother <= 1 Integer
0 <= cost_dad_mom <= 1 Integer
0 <= cost_dad_sister <= 1 Integer
0 <= cost_dad_w

In [384]:
print("Status:", LpStatus[prob.status])

Status: Optimal


In [385]:
prob.objective

LpAffineExpression([(cost_tell_mom, 0.74576538801193237),
                    (cost_tell_ask, 0.90859329700469971),
                    (cost_tell_brother, 0.530681312084198),
                    (cost_tell_sister, 0.52667748928070068),
                    (cost_tell_work, 0.64633482694625854),
                    (cost_dad_mom, 0.90403228998184204),
                    (cost_dad_ask, 0.6001594066619873),
                    (cost_dad_brother, 0.67111241817474365),
                    (cost_dad_sister, 0.62630081176757813),
                    (cost_dad_work, 0.417001873254776),
                    (cost_sibling_mom, 0.42191681265830994),
                    (cost_sibling_ask, 0.1222817525267601),
                    (cost_sibling_brother, 0.39654290676116943),
                    (cost_sibling_sister, 0.55023711919784546),
                    (cost_sibling_work, 0.10764811933040619)])

In [386]:
for v in prob.variables():
    print(v.name, "=", v.varValue)

cost_dad_ask = 0.0
cost_dad_brother = 0.0
cost_dad_mom = 1.0
cost_dad_sister = 0.0
cost_dad_work = 0.0
cost_sibling_ask = 0.0
cost_sibling_brother = 0.0
cost_sibling_mom = 0.0
cost_sibling_sister = 1.0
cost_sibling_work = 0.0
cost_tell_ask = 1.0
cost_tell_brother = 0.0
cost_tell_mom = 0.0
cost_tell_sister = 0.0
cost_tell_work = 0.0


In [387]:
value(prob.objective)

2.3628627061843872