# Evaluation Notebook
This notebook provides methods for evaluating Wikisim on different datasets.

# Configs

In [None]:
%%system

# Raise server-wide MySQL buffer/cache limits (SET GLOBAL) using the mysql command-line client.
# Sizes are written as byte expressions: 4*1024*1024*1024 = 4 GiB, 1024*1024*1024 = 1 GiB.
# NOTE(review): the root password is passed in clear text on the command line (-p...);
# consider a client option file or an environment-provided secret instead.
mysql -u root -pemilios -e 'set global key_buffer_size=4*1024*1024*1024;'
mysql -u root -pemilios -e 'set global bulk_insert_buffer_size=1024*1024*1024;'
mysql -u root -pemilios -e 'set global query_cache_size = 4*1024*1024*1024;'
mysql -u root -pemilios -e 'set global query_cache_limit = 4*1024*1024*1024;'
mysql -u root -pemilios -e 'set global tmp_table_size = 4*1024*1024*1024;'

In [None]:
%%writefile config.py 
""" Config file.
"""
#%load_ext autoreload
#%autoreload

import os.path
import sys
import datetime
import logging
from wikipedia import *


# Every location below is derived from the current user's home directory.
home = os.path.expanduser("~")

# Directory holding the similarity datasets (alternative location kept for reference).
dsdir = os.path.join(home, "backup/projects/wikisim/datasets/similarity")
#dsdir = os.path.join(home ,"backup/projects/wikisim/datasets/similarity.orig");

# Working area: the log file and the results tree are placed under it.
workingdir = os.path.join(home, 'backup/tmp/')
baseresdir = os.path.join(workingdir, 'results')
# `path` stays available as a module-level alias of `baseresdir`.
path = baseresdir


# Records at INFO level and above are written to myapp.log in the working directory.
logging.basicConfig(
    filename=os.path.join(workingdir, 'myapp.log'),
    level=logging.INFO,
)

    
def resdir(hitsver, direction=None):
    """Return the results directory for a method/direction pair, creating it if needed.

    The directory is ``<baseresdir>/<graphtype(direction)>/<hitsver>``.

    Args:
        hitsver: name of the innermost directory component.
        direction: link direction passed to ``graphtype``; ``None`` contributes
            an empty path component.

    Returns:
        The directory path as a string.
    """
    target = os.path.join(baseresdir, graphtype(direction), hitsver)
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target

    
def tmpdir(direction, hitsver):
    """Return the path of the ``tmp/`` folder inside the results directory.

    ``resdir(hitsver, direction)`` is called to obtain the parent (which creates
    the parent if it is missing); the ``tmp/`` folder itself is not created here.
    """
    results_root = resdir(hitsver, direction)
    return os.path.join(results_root, 'tmp/')

def graphdir(direction):
    """Return the directory that holds the graph files for ``direction``.

    The path is ``<workingdir>/graphs/<graphtype(direction)>``. Nothing is
    created on disk.

    The previous body referenced ``getworkingdir`` and ``wikisim.graphtypestr``,
    neither of which is defined or imported in this module; it now uses this
    module's own ``workingdir`` and ``graphtype``.
    """
    return os.path.join(workingdir, 'graphs', graphtype(direction))

def initdirs(direction, hitsver):
    """Make sure the results directory and its ``tmp/`` sub-directory exist.

    Args:
        direction: link direction forwarded to ``resdir`` / ``tmpdir``.
        hitsver: method name forwarded to ``resdir`` / ``tmpdir``.
    """
    def _ensure(folder):
        # Create the folder (and missing parents) only when it is absent.
        if not os.path.exists(folder):
            os.makedirs(folder)

    _ensure(resdir(hitsver, direction))
    _ensure(tmpdir(direction, hitsver))
def printflush(*args):
    """Print the positional arguments as a single tuple and flush stdout.

    The output is the tuple's repr, e.g. ``('Processing', 'x.csv')``, which is
    what the original Python 2 ``print`` statement produced. ``print(args)``
    with one parenthesised value prints the same text under Python 2 and 3.
    The variadic parameter no longer shadows the builtin ``str``.
    """
    print(args)
    # Flush so progress messages appear immediately in long-running cells.
    sys.stdout.flush()
    
    
def graphtype(direction):
    """Return the short text label for a link direction.

    ``None`` maps to ``''``; ``DIR_IN``, ``DIR_OUT`` and ``DIR_BOTH`` map to
    ``'in'``, ``'out'`` and ``'both'``. Any other value yields ``None``.
    """
    if direction is None:
        return ''
    # Checked in the same order as before: in, out, both.
    labelled = ((DIR_IN, 'in'), (DIR_OUT, 'out'), (DIR_BOTH, 'both'))
    for known, label in labelled:
        if direction == known:
            return label
    return None

# Evaluating rvspagerank

In [3]:
#%%writefile rvseval.py 
""" Evaluating the method on Semantic Relatedness Datasets."""

#%load_ext autoreload
#%autoreload

import os
import time;
import pandas as pd


#%aimport wikipedia
#%aimport calcsim
from config import *
from calcsim import *
import gensim

import functools
# encodings: 
# ENTITY_TITLE 
# ENTITY_ID 
# ENTITY_ID_STR 
# ENTITY_ID_ID_STR 

#print DISABLE_CACHE
#clearcache()
# Run configuration: link direction and the similarity method to evaluate.
direction = DIR_BOTH
# Only the last uncommented `method` line takes effect; the rest are alternatives.
#method = 'word2vec.300.orig'
#method = 'word2vec.300.ehsan'
#method = 'word2vec.500'
method = 'rvspagerank'

#word2vec_path = os.path.join(home, 'backup/wikipedia/WikipediaClean5Negative300Skip10.Ehsan/WikipediaClean5Negative300Skip10')
#word2vec_path = os.path.join(home, '/users/grad/sajadi/backup/wikipedia/20160305/embed/word2vec.enwiki-20160305-replace_surface.1.0.500.10.5.28.5.5/word2vec.enwiki-20160305-replace_surface.1.0.500.10.5.28.5.5')
#word2vec_path = os.path.join(home, '/users/grad/sajadi/backup/wikipedia/20160305/embed/word2vec.enwiki-20160305-cleantext.1.1.300.10.5.20.5.5/word2vec.enwiki-20160305-cleantext.1.1.300.10.5.20.5.5')

# Create the result/tmp directories and choose the shared result log.
initdirs(direction, method)
resfilename = os.path.join(baseresdir, 'reslog.txt')

dsfiles = (
    'MC_28-edited.csv', 'RG-edited.csv', 'wsim353-edited.csv', 'Kore-edited.csv',
    'MiniMayoSRS-edited.csv', 'MayoSRS-edited.csv',
    'UMNSRS_relatedness-edited.csv', 'UMNSRS_similarity-edited.csv',
)

# dsfiles=('MC_28.orig.lower.csv', 'RG.orig.lower.csv', 'wsim353.orig.lower.csv', 'Kore-edited.csv','MiniMayoSRS.orig.lower.csv', 
#          'MayoSRS.orig.lower.csv', 'UMNSRS_relatedness.orig.lower.csv', 'UMNSRS_similarity.orig.lower.csv')

#dsfiles=('MiniMayoSRS-edited.csv',)

# NOTE(review): word2vec_path is only assigned in the commented-out lines above; unless a
# star import provides it, selecting a word2vec method raises NameError on the next line.
if 'word2vec' in method:
    wmodel = gensim_loadmodel(word2vec_path)

for dsname in dsfiles:
    start = time.time()

    printflush("Processing", dsname)
    dsbase, dsext = os.path.splitext(dsname)
    infilename = os.path.join(dsdir, dsname)
    # Output file keeps the dataset's base name and extension: <base>.out<ext>.
    outfilename = os.path.join(resdir(method, direction), dsbase + '.out' + dsext)
    _, corr = getsim_file(infilename, outfilename, method, direction,
                          entity_encoding=ENTITY_ID)
    # One tab-separated record per dataset: method, dataset, direction, correlation, p-value.
    logres(resfilename, '%s\t%s\t%s\t%s\t%s', method, dsname, graphtype(direction),
           corr.correlation, corr.pvalue)
    print(corr)
    # Elapsed time for this dataset.
    print(str(timeformat(int(time.time() - start))))
    
#close()   

('Processing', 'MC_28-edited.csv')
SpearmanrResult(correlation=0.89031905531366673, pvalue=2.2564285329371919e-10)
0:00:01
('Processing', 'RG-edited.csv')
SpearmanrResult(correlation=0.84421688077108881, pvalue=1.0207243116452706e-18)
0:00:01
('Processing', 'wsim353-edited.csv')
SpearmanrResult(correlation=0.72885743249106383, pvalue=2.9423734033404458e-54)
0:00:20
('Processing', 'Kore-edited.csv')
SpearmanrResult(correlation=-0.036923927641949808, pvalue=0.45041849668761935)
0:00:22
('Processing', 'MiniMayoSRS-edited.csv')
SpearmanrResult(correlation=0.7118456905339049, pvalue=1.4892847283146956e-05)
0:00:01
('Processing', 'MayoSRS-edited.csv')
SpearmanrResult(correlation=0.55551244607537598, pvalue=3.5078737404589433e-09)
0:00:02
('Processing', 'UMNSRS_relatedness-edited.csv')
SpearmanrResult(correlation=0.54885902861047531, pvalue=3.8752695000828581e-45)
0:00:10
('Processing', 'UMNSRS_similarity-edited.csv')
SpearmanrResult(correlation=0.61330572832593555, pvalue=3.3280934837948607e

# Embedding

In [1]:
#%%writefile embeval.py 
"""Testing batch embedding for a given file. """

import os
from scipy import stats
import time;
from config import *

%load_ext autoreload
%autoreload

%aimport calcsim
from calcsim import *



# Write embedding files for the selected datasets using outgoing links.
direction = DIR_OUT
initdirs(direction, 'rvspagerank')
print("starte")
dsfiles = ('MiniMayoSRS-edited.csv', 'MayoSRS-edited.csv')
start = time.time()
for dsname in dsfiles:
    printflush("Processing", dsname)
    dsbase, dsext = os.path.splitext(dsname)
    infilename = os.path.join(dsdir, dsname)
    # Output file keeps the dataset's base name and extension: <base>.emb<ext>.
    outfilename = os.path.join(resdir('rvspagerank', direction), dsbase + '.emb' + dsext)
    getembed_file(infilename, outfilename, direction, cutoff=10)

# Total elapsed time over all datasets.
print(str(timeformat(int(time.time() - start))))
#close()   

starte
('Processing', 'MiniMayoSRS-edited.csv')




('Processing', 'MayoSRS-edited.csv')
0:00:01


# Download Pre-embed

In [None]:
import os
import cPickle as pickle
import json
import MySQLdb

__author__ = "Armin Sajadi"

from json import encoder

#encoder.FLOAT_REPR = lambda o: format(o, '.2f') #Doesn't Work!
home = os.path.expanduser("~")

# Link direction to export; it selects both the source table and the output file name.
#dirstr = 'in'
dirstr = 'out'
emb_fname = os.path.join(home, 'backup/wikipedia/20160305/embed/enwiki-20160305-embeddings.'+dirstr+'.ssv')

if dirstr == 'in':
    tablename = 'pagelinksorderedin'
elif dirstr == 'out':
    tablename = 'pagelinksorderedout'
else:
    # Previously an unknown direction only surfaced later as a NameError on `tablename`.
    raise ValueError("dirstr must be 'in' or 'out', got %r" % (dirstr,))

# NOTE(review): database credentials are hard-coded here; consider reading them from
# the environment or a client option file.
_db = MySQLdb.connect(host="127.0.0.1",port=3307,user='root',passwd="emilios",db="enwiki20160305")
_cursor = _db.cursor()
try:
    _cursor.execute("""SELECT * FROM `{0}`""".format(tablename))
    rows = _cursor.fetchall()

    # Each output line is "<first column>\t<JSON object>".
    with open(emb_fname, 'w') as emb_f:
        for row in rows:
            wid = str(row[0])
            # NOTE(review): pickle.loads can execute arbitrary code; this is only safe
            # as long as the table content is trusted.
            values, index = pickle.loads(row[1])
            emb = json.dumps(dict(zip(index, values)))
            emb_f.write(wid + "\t" + emb + '\n')
    # No explicit emb_f.close(): the with-block has already closed the file.
finally:
    # Release the cursor and the connection even when the export fails.
    _cursor.close()
    _db.close()

print("done")

# Histogram of the dead embeddings

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import numpy as np
from config import *

# `pd` was used below without being imported in this cell, so the cell only ran when an
# earlier cell had already imported pandas into the kernel.
import pandas as pd

dead_fname = os.path.join(home, 'backup/wikipedia/20160305/embed/enwiki-20160305-embeddings.in.dead.ssv')

# Read only column 2 of the headerless file and drop the rows where it is zero.
data = pd.read_table(dead_fname, header=None, usecols=[2])
data = data[data[2] != 0]

# Bin edges are given in units of 10,000 and scaled to absolute values.
bins = [0,3,4,5,6,7,8,9,10,20,30,40,50,100]
bins = [b*10000 for b in bins]
h = np.histogram(data, bins)
# np.histogram returns (counts, edges); the text matches the original print output.
print("bins:  %s" % (h[1],))
print("bined data:  %s" % (h[0],))

plt.hist(data, bins, histtype='bar')
plt.title('size')

plt.tight_layout()
plt.show()


In [None]:
# Column-wise maximum of `data`; shown as the cell's output.
data.max()

# In-Out degree distribution

In [None]:
"""Generating some wiki statistics. """
import operator
import os
from collections import defaultdict
home = os.path.expanduser("~");
from wikipedia import *
# One id per line; surrounding whitespace is stripped and duplicates collapse in the set.
with open(os.path.join(home, "backup/projects/datasets/embed/allids.csv")) as f:
    allids = set(line.strip() for line in f)

# Number of linked pages per numeric id.
# NOTE(review): the variable is called `outcount` but the lookup uses DIR_IN — confirm
# which direction is intended.
outcount = {}
for pageid in allids:  # renamed from `id`, which shadowed the builtin
    if not pageid.isdigit():
        continue
    outcount[pageid] = len(getlinkedpages(pageid, DIR_IN))
#o=w.getlinkedpages_query('None',Wikipedia.DIR_OUT);
#print o;
#w.cursor.execute(o)
#s=w.getlinkedpages_query(29953972,Wikipedia.DIR_OUT);

# The sort was commented out while its result was still printed, which raised NameError.
# (id, count) pairs in ascending order of count.
outcount_sorted = sorted(outcount.items(), key=operator.itemgetter(1))
print(outcount_sorted)
# `close` is not defined in this notebook; presumably it comes from the wikipedia star
# import and releases that module's resources — confirm.
close()

In [None]:
# Scratch check: is the token "id_419077" present in the word2vec vocabulary?
# NOTE(review): `word2vec_model` is not defined in any cell shown here — presumably it
# comes from a star import; confirm. `c` is not used by the cells shown here.
c=1
print "id_419077" in word2vec_model.vocab


In [None]:
# Scratch call for a single word pair; `getsim_word2vec` is not defined in this notebook
# (presumably provided by calcsim — confirm).
getsim_word2vec('boy', 'lad')

In [None]:
# Scratch check: is 'lad' in the vocabulary of `wmodel`? `wmodel` is only assigned by the
# evaluation cell when `method` contains 'word2vec'.
'lad' in wmodel.vocab