# Evaluation Notebook
This notebook provides some methods to evaluate Wikisim on different datasets. 

# Configs

In [None]:
%%system

# Raise several server-wide MySQL buffer/cache variables before the evaluation
# runs. Each command connects as root and issues one `SET GLOBAL` statement.
# NOTE(review): the root password is passed in clear text on the command line
# and is committed with the notebook -- consider moving it to an option file
# or an environment variable.
# NOTE(review): query_cache_* variables presumably only exist on older MySQL
# servers -- confirm against the server version in use.
mysql -u root -pemilios -e 'set global key_buffer_size=4*1024*1024*1024;'
mysql -u root -pemilios -e 'set global bulk_insert_buffer_size=1024*1024*1024;'
mysql -u root -pemilios -e 'set global query_cache_size = 4*1024*1024*1024;'
mysql -u root -pemilios -e 'set global query_cache_limit = 4*1024*1024*1024;'
mysql -u root -pemilios -e 'set global tmp_table_size = 4*1024*1024*1024;'

In [None]:
%%writefile config.py 
""" Config file.
"""
#%load_ext autoreload
#%autoreload

import datetime
import logging
import os.path
import sys
from wikipedia import *


# All locations are derived from the current user's home directory.
home = os.path.expanduser("~")
# Directory the evaluation cells read their dataset files from.
dsdir = os.path.join(home, "backup/projects/wikisim/datasets/")
# Scratch area: holds the log file and the results tree.
workingdir = os.path.join(home, 'backup/tmp/')
baseresdir = os.path.join(workingdir, 'results')
# Module-level alias of baseresdir; kept because `from config import *`
# exports it.
path = baseresdir

# Send INFO-and-above log records to myapp.log inside the working directory.
logging.basicConfig(filename=os.path.join(workingdir, 'myapp.log'), level=logging.INFO)

    
def resdir(hitsver, direction=None):
    """Return the results directory for ``hitsver`` and ``direction``.

    The directory is ``baseresdir/<graphtype(direction)>/<hitsver>`` and is
    created (including missing parents) if it does not exist yet.

    hitsver   -- name of the leaf directory (the cells in this notebook pass
                 the method name, e.g. 'rvspagerank').
    direction -- graph direction forwarded to graphtype(); None by default.
    """
    target = os.path.join(baseresdir, graphtype(direction), hitsver)
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target

    
def tmpdir(direction, hitsver):
    """Return the path of the 'tmp/' folder inside the matching results directory.

    Calls resdir(), so the parent results directory is created as a side
    effect; the 'tmp/' folder itself is not created here.
    """
    parent = resdir(hitsver, direction)
    return os.path.join(parent, 'tmp/')

def graphdir(direction):
    """Return the directory that holds the graphs for ``direction``.

    The path is ``workingdir/graphs/<graphtype(direction)>``. Nothing is
    created on disk.

    The previous implementation referenced ``getworkingdir`` and
    ``wikisim.graphtypestr``, neither of which is defined or imported in this
    module, so every call failed; it now uses the module's own ``workingdir``
    and ``graphtype``.
    NOTE(review): assumes graphtype() is the intended replacement for
    wikisim.graphtypestr -- confirm the directory names match existing data.
    """
    return os.path.join(workingdir, 'graphs', graphtype(direction))

def initdirs(direction, hitsver):
    """Make sure the results directory and its 'tmp/' folder both exist.

    direction -- graph direction, forwarded to resdir()/tmpdir().
    hitsver   -- leaf directory name, forwarded to resdir()/tmpdir().
    """
    def _ensure(dirpath):
        # Create dirpath (and missing parents) unless it is already there.
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

    _ensure(resdir(hitsver, direction))
    _ensure(tmpdir(direction, hitsver))
def printflush(*args):
    """Print the positional arguments as one tuple and flush stdout at once.

    The arguments are emitted as a single tuple, e.g.
    ``('Processing', 'MC_28-edited.csv')``, followed by a newline. Flushing
    makes the message show up immediately during long-running loops.
    """
    # The vararg used to be named `str`, shadowing the builtin. The
    # parenthesised single-argument form prints the same tuple under both the
    # Python 2 print statement and the Python 3 print function.
    print(args)
    sys.stdout.flush()
    
    
def graphtype(direction):
    """Map a graph direction constant to its short directory label.

    Returns '' for None, 'in' / 'out' / 'both' for DIR_IN / DIR_OUT /
    DIR_BOTH, and None for any other value.
    """
    # Identity test is the idiomatic (and __eq__-proof) way to check for None.
    if direction is None:
        return ''
    labels = ((DIR_IN, 'in'), (DIR_OUT, 'out'), (DIR_BOTH, 'both'))
    for known, label in labels:
        if direction == known:
            return label
    # Unrecognised direction: callers get None, exactly as before.
    return None

# Evaluating rvspagerank

In [3]:
#%%writefile rvseval.py 
""" Evaluating the method on Semantic Relatedness Datasets."""

%load_ext autoreload
%autoreload

import os
import time;



%aimport wikipedia
%aimport calcsim
from config import *
from calcsim import *

print(DISABLE_CACHE)
direction = DIR_IN
# Single source of truth for the method name: it selects the results
# directory, is passed to getsim_file() and prefixes every result-log line.
method = 'rvspagerank'
initdirs(direction, method)
resfilename = os.path.join(baseresdir, 'reslog.txt')

# Full list of datasets, kept for reference. Only the subset assigned to
# `dsfiles` is evaluated by the loop below.
all_dsfiles = ('MC_28-edited.csv', 'RG-edited.csv', 'MiniMayoSRS-edited.csv', 'MayoSRS-edited.csv',
               'UMNSRS_relatedness-edited.csv', 'UMNSRS_similarity-edited.csv')
dsfiles = ('MiniMayoSRS-edited.csv', 'MayoSRS-edited.csv')

start = time.time()
for dsname in dsfiles:
    printflush("Processing", dsname)
    dsbase, dsext = os.path.splitext(dsname)
    infilename = os.path.join(dsdir, dsname)
    # Output sits next to the other results: <dataset>.out<ext>
    outfilename = os.path.join(resdir(method, direction), dsbase + '.out' + dsext)
    _, corr = getsim_file(infilename, outfilename, method, direction)
    # Tab-separated log line: method, dataset, graph type, correlation, p-value.
    logres(resfilename, method + '\t%s\t%s\t%s\t%s', dsname, graphtype(direction),
           corr.correlation, corr.pvalue)
    print(corr)

# Total wall-clock time of the evaluation loop.
print(str(timeformat(int(time.time() - start))))
#close()   

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
False
('Processing', 'MiniMayoSRS-edited.csv')
SpearmanrResult(correlation=0.68600374258755725, pvalue=3.9942691909824775e-05)
('Processing', 'MayoSRS-edited.csv')
SpearmanrResult(correlation=0.58118243427136129, pvalue=4.3357832073575697e-10)
0:01:16


# Embedding

In [2]:
"""Testing batch embedding for a given file. """

from scipy import stats
import os
import time;


%aimport calcsim
from calcsim import *



direction=DIR_IN;
initdirs(direction, 'rvspagerank')

dsfiles=('MC_28-edited.csv', 'MiniMayoSRS-edited.csv','MayoSRS-edited.csv')
start = time.time()
for dsname in dsfiles:
    printflush ("Processing",dsname)
    dsbase, dsext = os.path.splitext(dsname);
    infilename = os.path.join(dsdir, dsname)
    outfilename = os.path.join(resdir('rvspagerank', direction), dsbase+ '.emb'+ dsext)
    getembed_file(infilename, outfilename,direction, cutoff=3);
    
print str(timeformat(int(time.time()-start)));
#close()   

NameError: name 'initdirs' is not defined

# In-Out degree distribution

In [None]:
"""Generating some wiki statistics. """
import operator
from collections import defaultdict
# Read every id listed in allids.csv (one per line) into a set.
with open(home+"/backup/projects/datasets/embed/allids.csv") as f:
    allids = set(line.strip() for line in f)

# Page id -> number of linked pages. This dict was never initialised (its
# definition had been commented out), so the loop raised NameError.
outcount = {}
for pageid in allids:
    # Skip anything that is not a purely numeric id.
    if not pageid.isdigit():
        continue
    # NOTE(review): the result is stored in `outcount` but DIR_IN is
    # requested -- confirm which direction is intended.
    outcount[pageid] = len(getlinkedpages(pageid, Wikipedia.DIR_IN))
# Leftover experiments, kept as found:
#o=w.getlinkedpages_query('None',Wikipedia.DIR_OUT);
#print o;
#w.cursor.execute(o)
#s=w.getlinkedpages_query(29953972,Wikipedia.DIR_OUT);

# Ascending by link count.
outcount_sorted = sorted(outcount.items(), key=operator.itemgetter(1))
print(outcount_sorted)

# NOTE(review): `w` is not defined in any visible cell of this notebook --
# confirm where it comes from or replace with the appropriate close call.
w.close()

In [1]:
# Bootstrap cell: enable IPython's autoreload extension so that edits to the
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload



# Register the project modules with autoreload, then pull calcsim's public
# names into the notebook namespace.
%aimport wikipedia
%aimport calcsim
from calcsim import *

# Show the cache switch; presumably provided by the star import above --
# TODO confirm which module defines DISABLE_CACHE.
print DISABLE_CACHE

False
