In [1]:
#Import modules
import pandas as pd
import numpy as np
from tqdm import tqdm
from categorization import fscore
import torch
from torch import nn
import scipy
import time
def update_df(dataframe,elem_num,embeddings):
    """Add a ``'dist'`` column holding the cosine similarity of each word pair.

    Despite the column name, the stored value is a cosine *similarity*
    (``torch.nn.CosineSimilarity``), not a distance.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``'word1'`` and ``'word2'``.
    elem_num : mapping
        Maps a word to the index of its vector in ``embeddings``.
    embeddings : torch.Tensor
        Indexed as ``embeddings[index]`` to obtain the vector of one word.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in (it is modified in place),
        with the ``'dist'`` column set. Pairs for which either word is missing
        from ``elem_num`` get ``NaN``.
    """
    # The similarity module is loop-invariant, so build it once.
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

    scores = []
    for word1, word2 in zip(dataframe['word1'], dataframe['word2']):
        try:
            pos_word1 = elem_num[word1]
            pos_word2 = elem_num[word2]
        except KeyError:
            # Unknown word: record an explicit NaN. Falling through here would
            # either raise NameError (first row) or silently score this pair
            # with the indices left over from the previous row.
            scores.append(float('nan'))
            continue
        scores.append(cos(embeddings[pos_word1], embeddings[pos_word2]).item())

    dataframe['dist'] = scores
    return dataframe

def rho(dataframe,elem_num,embeddings,rtype):
    """Spearman rank correlation between human ratings and embedding similarity.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Word-pair table; must contain ``'word1'``, ``'word2'`` and the rating
        column named by ``rtype``. It is modified in place by ``update_df``
        (a ``'dist'`` column is written).
    elem_num : mapping
        Maps a word to the index of its vector in ``embeddings``.
    embeddings : torch.Tensor
        Embedding vectors, forwarded to ``update_df``.
    rtype : str
        Rating column to correlate against: ``'semantic'`` or ``'visual'``.

    Returns
    -------
    (coef, p)
        The two values returned by ``scipy.stats.spearmanr``.

    Raises
    ------
    ValueError
        If ``rtype`` is neither ``'semantic'`` nor ``'visual'``.
    """
    # Validate first so that a bad rtype fails with a clear message (instead of
    # an unbound-variable error) and before the DataFrame is touched.
    if rtype not in ('semantic', 'visual'):
        raise ValueError("rtype must be 'semantic' or 'visual', got " + repr(rtype))

    # A bare `import scipy` does not guarantee that the `scipy.stats`
    # submodule is loaded, so import the function explicitly.
    from scipy.stats import spearmanr

    dataframe = update_df(dataframe,elem_num,embeddings)
    a = dataframe[rtype].tolist()
    b = dataframe['dist'].tolist()

    #Calculate rho
    coef, p = spearmanr(a, b, axis=0)
    return coef, p

def evaluate(embeddings):
    """Print the Spearman coefficient of ``embeddings`` against both rating types.

    Reads the word-pair ratings and the row keys from ``./data``, then reports
    the coefficient returned by ``rho`` for the ``'semantic'`` and the
    ``'visual'`` ratings, in that order. Returns nothing.
    """
    ratings = pd.read_csv('./data/similarity-ratings.tsv',sep='\t')
    row_keys = np.load('./data/Visual_att_row_keys.npy')

    # word -> row index of that word in the embedding matrix
    word_to_row = {}
    for row, word in enumerate(row_keys):
        word_to_row[word] = row

    reports = (
        ('Semantic similarity is ', 'semantic'),
        ('Visual similarity is ', 'visual'),
    )
    for prefix, rating_column in reports:
        coefficient, _ = rho(ratings, word_to_row, embeddings, rating_column)
        print(prefix+str(coefficient))

def evaluate_cat(embeddings):
    """Print the categorization score of ``embeddings``.

    Loads the row keys from ``./data`` and passes them, together with the
    embeddings and the hard-cluster CSV path, to ``fscore``; the returned
    value is printed. Returns nothing.
    """
    row_keys = np.load('./data/Visual_att_row_keys.npy')
    score = fscore(embeddings, './data/my_trevor_hardClusters.csv', row_keys)
    message = 'Categorization (Fscore rate) is ' + str(score)
    print(message)
    
    
def unormc(X0):
    """Scale every row of ``X0`` to unit Euclidean norm, then centre the columns.

    Parameters
    ----------
    X0 : torch.Tensor
        2-D floating-point tensor with one vector per row. It is not modified;
        the work is done on a clone.

    Returns
    -------
    X : torch.Tensor
        Row-normalised and then column-centred copy of ``X0``.
    nrm : numpy.ndarray
        float64 array with the Euclidean norm of each original row.
    mv : torch.Tensor
        Column means of the row-normalised data (the vector subtracted from
        every row).

    Notes
    -----
    All-zero rows are left as zeros before centring. Testing the norm with
    ``is not 0`` is always true for a numpy float, which would divide such rows
    by zero and fill them with NaN; a value comparison is used instead.
    """
    X = X0.clone()

    # normalize to unit norm
    norms = torch.sqrt((X.detach() ** 2).sum(dim=1))
    # Divide zero-norm rows by 1 so that they stay zero instead of becoming NaN.
    safe_norms = torch.where(norms == 0, torch.ones_like(norms), norms)
    X = X / safe_norms.unsqueeze(1)

    # center
    mv = X.mean(dim=0)
    X = X - mv

    nrm = norms.cpu().numpy().astype(np.float64)
    return X, nrm, mv

def evaluate_norm(embeddings):
    """Evaluate ``embeddings`` after passing them through ``unormc``.

    Loads the ratings and row keys from ``./data``, prints the coefficient
    returned by ``rho`` for the ``'semantic'`` and then the ``'visual'``
    ratings, and returns both as ``(semantic, visual)``.
    """
    ratings = pd.read_csv('./data/similarity-ratings.tsv',sep='\t')
    row_keys = np.load('./data/Visual_att_row_keys.npy')
    word_to_row = {word: row for row, word in enumerate(row_keys)}

    # Only the transformed matrix is needed; the norms and the mean are dropped.
    normalized, _norms, _mean = unormc(embeddings)

    semantic_rho, _ = rho(ratings, word_to_row, normalized, 'semantic')
    print('Semantic similarity is '+str(semantic_rho))

    visual_rho, _ = rho(ratings, word_to_row, normalized, 'visual')
    print('Visual similarity is   '+str(visual_rho))

    return semantic_rho, visual_rho

def reduce_to_short(RMAT,
                    keys_path='./data/Visual_att_row_keys.npy',
                    pairs_path='./data/mariella_mcrae_wordpairs.txt'):
    """Keep only the rows of ``RMAT`` whose word occurs in the word-pair file.

    Parameters
    ----------
    RMAT : array-like or CPU torch.Tensor
        One row per entry of the key array stored at ``keys_path``, in the
        same order.
    keys_path : str, optional
        ``.npy`` file holding the word of each row of ``RMAT``.
    pairs_path : str, optional
        Text file with one ``word1#word2`` pair per line.

    Returns
    -------
    numpy.ndarray
        The selected rows in their original order; zero rows if no word
        matches.

    Notes
    -----
    The pair file is read once into a set. Selection no longer depends on the
    first key being a match, and a last line without a trailing newline keeps
    its final character.
    """
    num_to_elem = np.load(keys_path)

    # Collect every word that appears on either side of a pair.
    wanted = set()
    with open(pairs_path, 'r') as w:
        for line in w:
            words = line.rstrip('\r\n').split('#')
            if len(words) < 2:
                # blank or malformed line: nothing to match against
                continue
            wanted.update(words[:2])

    rows = np.asarray(RMAT)
    # The bound on k keeps a key array longer than RMAT from indexing past the end.
    keep = np.array(
        [k for k, word in enumerate(num_to_elem) if word in wanted and k < len(rows)],
        dtype=np.intp,
    )
    return rows[keep]

def evaluate_short(embeddings):
    """Print the Spearman coefficients of ``embeddings`` on the short word list.

    Reads the short ratings table and the short row keys, then prints the
    coefficient returned by ``rho`` for the ``'semantic'`` and the ``'visual'``
    ratings. Returns nothing.

    The row keys are looked up in the working directory first and, if they are
    not there, under ``./data/`` — the directory used by every other loader in
    this file, including ``evaluate_norm_short``.
    """
    import os

    df_simil_rates = pd.read_csv('./data/short_similarity-ratings.tsv',sep='\t')

    keys_path = 'Visual_att_row_keys_short.npy'
    if not os.path.exists(keys_path):
        keys_path = './data/Visual_att_row_keys_short.npy'
    num2elem=np.load(keys_path)
    ele2num = {a:b for b,a in enumerate(num2elem)}

    sem,p=rho(df_simil_rates,ele2num,embeddings,'semantic')
    print('Semantic similarity is '+str(sem))

    sem,p=rho(df_simil_rates,ele2num,embeddings,'visual')
    print('Visual similarity is '+str(sem))
    
def evaluate_norm_short(embeddings):
    """Evaluate ``unormc``-transformed ``embeddings`` on the short word list.

    Loads the short ratings table and short row keys from ``./data``, prints
    the coefficient returned by ``rho`` for the ``'semantic'`` and then the
    ``'visual'`` ratings, and returns both as ``(semantic, visual)``.
    """
    short_ratings = pd.read_csv('./data/short_similarity-ratings.tsv',sep='\t')
    short_keys = np.load('./data/Visual_att_row_keys_short.npy')

    # word -> row index in the embedding matrix
    index_of = {}
    for position, word in enumerate(short_keys):
        index_of[word] = position

    transformed, _, _ = unormc(embeddings)

    rho_semantic, _p = rho(short_ratings, index_of, transformed, 'semantic')
    print('Semantic similarity is '+str(rho_semantic))

    rho_visual, _p = rho(short_ratings, index_of, transformed, 'visual')
    print('Visual similarity is   '+str(rho_visual))

    return rho_semantic, rho_visual

In [2]:
class DGE(nn.Module):
    """One embedding-update / graph-update step driven by two graphs.

    All settings are required keyword arguments: ``param2``, ``aa``, ``niter``,
    ``LR``, ``NCL`` and ``Cdamp``. A missing one raises ``KeyError``.
    """

    _SETTINGS = ("param2", "aa", "niter", "LR", "NCL", "Cdamp")

    def __init__(self, **kwargs):
        super().__init__()

        # Copy each required setting onto the module under the same name.
        for setting in self._SETTINGS:
            setattr(self, setting, kwargs[setting])

    def forward(self,Xi,Go,Gi):
        """Update the embedding, cluster it, and build a damping matrix.

        ``Xi`` is flagged as requiring gradients (on the caller's tensor) and
        handed to ``GD_torch_biW`` together with ``Go`` and ``Gi``. The result
        is detached, passed through ``norm11`` and clustered with KMeans into
        ``NCL`` clusters.

        Returns ``(Xi, CIM)``: the new embedding and a double matrix that is 1
        where ``class_sim_torch`` of the cluster labels is 0 and ``Cdamp``
        elsewhere.
        """
        #--- 1 graph embedding update
        Xi.requires_grad=True
        updated = GD_torch_biW(Xi,Go,Gi,self.param2,self.aa,self.niter,self.LR)
        updated = norm11(updated.detach().clone().T).double().T

        #--- 2 graph structure update: semantic prior
        clusterer = KMeans(n_clusters=self.NCL, random_state=0, init='k-means++')
        clusterer = clusterer.fit(updated.cpu().numpy())
        # column vector of cluster labels (estimated semantic communities)
        labels = torch.tensor([clusterer.labels_]).transpose(1,0)

        similarity = class_sim_torch(labels).to(device)
        where_zero = (similarity==0).double()
        where_nonzero = (similarity!=0).double()
        damping = where_zero + self.Cdamp*where_nonzero
        return updated,damping

class DGE_leapfrog(nn.Module):
    """Variant of the embedding/graph update step that takes a third graph.

    Required keyword arguments: ``param2``, ``aa``, ``cc``, ``niter``, ``LR``,
    ``NCL`` and ``Cdamp``. A missing one raises ``KeyError``.
    """

    def __init__(self, **kwargs):
        super().__init__()

        # Same order as the settings are consumed below.
        required = ("param2", "aa", "cc", "niter", "LR", "NCL", "Cdamp")
        for name in required:
            setattr(self, name, kwargs[name])

    def forward(self,Xi,Go,Gi,Giext):
        """Update the embedding with three graphs and derive a damping matrix.

        ``Xi`` is flagged as requiring gradients (on the caller's tensor) and
        handed to ``GD_torch_triW`` with ``Go``, ``Gi`` and ``Giext``. The
        result is detached, passed through ``norm11`` and clustered with KMeans
        into ``NCL`` clusters.

        Returns ``(Xi, CIM)``: the new embedding and a double matrix that is 1
        where ``class_sim_torch`` of the cluster labels is 0 and ``Cdamp``
        elsewhere.
        """
        #--- 1 graph embedding update
        Xi.requires_grad=True
        stepped = GD_torch_triW(
            Xi,Go,Gi,Giext,
            self.param2,self.aa,self.cc,self.niter,self.LR,
        )
        stepped = stepped.detach().clone()
        stepped = norm11(stepped.T).double().T

        #--- 2 graph structure update: semantic prior
        points = stepped.cpu().numpy()
        fitted = KMeans(n_clusters=self.NCL, random_state=0, init='k-means++').fit(points)
        # column vector of cluster labels (estimated semantic communities)
        communities = torch.tensor([fitted.labels_]).transpose(1,0)

        class_sim = class_sim_torch(communities).to(device)
        mask = (class_sim==0).double() + self.Cdamp*(class_sim!=0).double()
        return stepped,mask

In [3]:
#Import modules
from cdist import distx_fun, cosdist
from similarity import simz_fun, simz_fun_sym
from pca import PCA
from normalize import norm01,norm11,class_sim_torch
#from loss import sim_grad_descent_autograd, sim_grad_descent_biW_autograd
#from loss import sim_grad_descent_biW_softmax_autograd
from loss import X_entropy, GD_torch, GD_torch_biW, GD_torch_triW
from cross_entropy import my_softmax
from graph import data, data_textual, data_sg
import numpy as np
import torch
from torch.autograd import Variable
from sklearn.cluster import KMeans
import torch.nn.functional as FTR
import time
import matplotlib.pyplot as plt
import pandas as pd

import os

# Directory prefix for the saved embeddings; file names are appended to it by
# plain string concatenation, so it must end with a path separator.
# The original machine-specific location remains the default; set the
# DGE_OUT_PATH environment variable to run the notebook elsewhere.
OUT_PATH = os.path.join(
    os.environ.get(
        'DGE_OUT_PATH',
        '/Users/hwendt/Dropbox/SHARED_HWMD_UNSHARED/DGE_multimodal/DGE_results/Missing/',
    ),
    '',  # joining with '' guarantees the trailing separator
)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Wall-clock reference taken when this cell runs.
start_time = time.time()




cpu


In [4]:
# Hyper-parameters of the per-modality runs. Only param21 and param22 are read
# by the later cells shown in this notebook; the other values are not used
# there (presumably leftovers from the per-modality embedding stage — TODO confirm).
NORMINIT=0

# Wall-clock reference; it is overwritten by later cells.
start = time.time()
# VISUAL
#Hyper-parameters
d_emb=15
niter_e=1000
# NOTE(review): the original remark on this line said that smaller values are
# "not good"; it is unclear which of the two parameters it refers to.
param11,param21=0.0011,0.095
LR10=0.0004

# TEXTUAL
#Hyper-parameters
param12,param22=0.0125,0.125
LR2=4e-3


In [6]:
# Load the joint DGE embedding results for the textual and visual modalities
# and embed them in one single graph.

start = time.time()
import random
# Seed the generators by CALLING the seeding functions. Assigning to them
# (e.g. `random.seed = 1979`) seeds nothing and replaces the function with an
# int for the rest of the session.
random.seed(1979)
torch.manual_seed(1979)
# Safe without CUDA: the call is silently ignored when no GPU is present.
torch.cuda.manual_seed(1979)

# --- Run configuration ------------------------------------------------------
# niter, N_iter_G, LR, a_mix and a_mix_0 are not read by the active code of
# this cell (a_mix_0 only appears in a commented-out line further down).
niter = 1#2*250
N_iter_G = 5
LR = 1e-4
a_mix = 0.5
a_mix_0 = a_mix

# Initial value only: both `if s1==...` branches below overwrite param2.
param2 = 16*0.125 * (param21 + param22)

# File-name prefix of the saved embeddings and the matching iteration index s1.
# Exactly one of the two lines below is meant to be active.
sgstr = 'sg_'; s1=5
#sgstr = ''; s1=9
# The second assignment wins, so norstr is the empty string.
norstr='nor_'
norstr=''

# Modality tag inserted into the input file names; alternatives are kept
# commented out.
MSTR='T_'
#MSTR='V_'
#MSTR='T_V_'

# Number of model applications in the inner loop.
KK=20

# Settings tied to the chosen s1. If s1 is neither 5 nor 9, LR1, niter_g1,
# NCL1, aa1 and Cdamp1 stay undefined and the model construction fails.
if s1==5: #3:
    param2=0.3 
    LR1=0.001 
    niter_g1=5
    NCL1=20
    aa1=0.05 
    Cdamp1=0.7 

if s1==9:
    param2=0.3 
    LR1=0.001 
    niter_g1=4 
    NCL1=6 
    aa1=0.1 
    Cdamp1=0.7 

# Build the update model.
# NOTE(review): the model receives param21, not the param2 value set in the
# `if s1==...` branch; param2 is only used for Gi0 below — confirm this is intended.
#model1 = DGE_leapfrog(param2=param21,aa=aa1,niter=niter_g1,LR=LR1,NCL=NCL1,cc=cc1,Cdamp=Cdamp1)
model1 = DGE(param2=param21,aa=aa1,niter=niter_g1,LR=LR1,NCL=NCL1,Cdamp=Cdamp1)

# P collects the (semantic, visual) pair returned by evaluate_norm_short after
# every model application.
P = []
start = time.time()
# The range contains the single value s1.
for nDGE in range(s1,s1+1):
    # Saved text and visual embeddings for this iteration index.
    Xi1 = torch.tensor(np.load(OUT_PATH + sgstr  + norstr + MSTR + 'text_d15_2_cc12_it' +str(nDGE) +'.npy'))
    Xi2 = torch.tensor(np.load(OUT_PATH + sgstr  + norstr + MSTR + 'vis_d15_2_cc12_it' +str(nDGE) +'.npy'))
    
#    print( sgstr  + norstr + 'text_d15_2_cc12_it' +str(nDGE) +'.npy')
#    print( sgstr  + norstr + 'vis_d15_2_cc12_it' +str(nDGE) +'.npy')
    
    
    # Per-modality graphs. They are only referenced by the commented-out model
    # call inside the loop below, so the active code never reads them.
    # NOTE(review): the text embedding is paired with param21, which is listed
    # under "VISUAL" in the hyper-parameter cell, and the visual embedding with
    # param22 ("TEXTUAL") — confirm.
    Gi1=simz_fun_sym(cosdist(Xi1),param21)
    Gi2=simz_fun_sym(cosdist(Xi2),param22)
    
    #Xi=a_mix_0 * torch.tensor(Xi1).detach().clone() + (1-a_mix_0) * torch.tensor(Xi2).detach().clone()
    # Joint embedding: both modalities concatenated along axis 1.
    Xi=torch.cat((Xi1.to("cpu"),Xi2.to("cpu")),axis=1)
    # Gi0 stays fixed; Gi starts equal to it and is rescaled after every step.
    Gi0=simz_fun_sym(cosdist(Xi),param2); Gi=Gi0
    Xi.requires_grad=True
    
    for K in range(KK):
#        Xi,CIM1 = model1(Xi,Gi1,Gi2); Gi1=Gi1*CIM1; Gi2=Gi2*CIM1
        # One model step, then multiply the returned matrix into the running graph.
        Xi,CIM = model1(Xi,Gi,Gi0); Gi=Gi*CIM
        
        # The model's forward already detaches and applies norm11; both are
        # applied once more here.
        Xi=Xi.detach().clone()
        Xi = norm11(Xi.T).double().T
        # NOTE(review): Xi is handed to np.save as a torch tensor; presumably
        # this relies on Xi living on the CPU — confirm for CUDA runs.
        np.save(OUT_PATH + sgstr  + norstr + 'postembed_d15_it' +str(nDGE) +'_' +str(K) +'.npy', Xi)
        # Evaluate on the rows selected by reduce_to_short.
        Xis=torch.tensor(reduce_to_short(Xi.to("cpu"))).type(torch.DoubleTensor)
        print(K)
        P.append(evaluate_norm_short(Xis))
        #np.save(OUT_PATH + sgstr  + norstr + 'postembed_d15_it' +str(nDGE) +'_' +str(K) +'.npy', Xi)
    print(str(nDGE) + " --- elapsed time: " +  str(time.time()-start))


0
Semantic similarity is 0.7733588734307394
Visual similarity is   0.6438063715371533
1
Semantic similarity is 0.7732159073590872
Visual similarity is   0.6442061809849327
2
Semantic similarity is 0.773233693382574
Visual similarity is   0.644740371602176
3
Semantic similarity is 0.7730638418991679
Visual similarity is   0.6451734644184914
4
Semantic similarity is 0.7728209793981977
Visual similarity is   0.6455929165525627
5


KeyboardInterrupt: 