In [2]:
from rdflib import Graph, RDF, URIRef, Namespace
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
import math
import operator
import datetime
import pandas as pnd

In [2]:
# Load the experiment graph from its Turtle file into an in-memory rdflib Graph.
# Every query in this notebook runs against this module-level `g`.
g = Graph()
g.parse("../graph/experiment_graph.ttl", format="turtle")

<Graph identifier=N3e02622b5f204b41af90d2dafb1bb428 (<class 'rdflib.graph.Graph'>)>

In [3]:
# Prefix table handed to every SPARQL query through `initNs`, so the query
# text can use short prefixes such as `api_network:` and `gr:`.
foaf = Namespace("http://xmlns.com/foaf/0.1/")
ns = {
    "api_network": Namespace("http://deepweb.ut.ee/ontologies/api-network#"),
    "cat": Namespace("http://www.programmableweb.com/category/"),
    "rdf": RDF,
    "gr": Namespace("http://purl.org/goodrelations/v1#"),
    "pw_api": Namespace("http://www.programmableweb.com/api/"),
    "xsd": Namespace("http://www.w3.org/2001/XMLSchema#"),
}

In [4]:
def compose_query(mashup):
    """Derive a recommendation query from a mashup stored in the graph `g`.

    Parameters
    ----------
    mashup : str or URIRef
        IRI of the mashup. It is interpolated verbatim between ``<`` and ``>``
        in the SPARQL text.

    Returns
    -------
    tuple
        ``(k, categories, reg_date, involved_services)`` where

        * ``k`` is the number of ``gr:include`` objects of the mashup,
        * ``categories`` is the list of its ``api_network:tag`` values,
        * ``reg_date`` is the first ``api_network:registrationDate`` value
          converted with ``toPython()``, or ``None`` when the mashup has no
          registration date (this used to raise ``IndexError``),
        * ``involved_services`` is the list of ``gr:include`` objects, returned
          only if the mashup is linked by some predicate to
          ``api_network:Mashup``.
    """
    def select(sparql, variable):
        # Run one SELECT query and collect the bindings of a single variable.
        rows = g.query(sparql, initNs=ns)
        return [binding[variable] for binding in rows.bindings]

    # Tags attached to the mashup: these are the requested categories.
    categories = select("""SELECT ?t WHERE {
                <%s> api_network:tag ?t .}""" % mashup, "?t")

    # Number of included services: the size of the recommendation to produce.
    k = len(select("""SELECT ?t WHERE {
                <%s> gr:include ?t .}""" % mashup, "?t"))

    reg_dates = [d.toPython() for d in select("""SELECT ?d WHERE {
                    <%s> api_network:registrationDate ?d .
                    }""" % mashup, "?d")]
    # A mashup without a registration date yields None instead of crashing.
    reg_date = reg_dates[0] if reg_dates else None

    # Services actually used by the mashup (ground truth for evaluation).
    involved_services = select("""SELECT ?s WHERE {
                <%s> ?p api_network:Mashup .
                <%s> gr:include ?s .}""" % (mashup, mashup), "?s")
    return (k, categories, reg_date, involved_services)

In [36]:
def union_of_two_lists(first_list, second_list):
    """Return `first_list` followed by the items of `second_list` it lacks.

    The items taken from `second_list` keep their first-seen order and appear
    once each. The previous implementation took them from a set, so their
    order was arbitrary and could differ between interpreter runs.

    Parameters
    ----------
    first_list : iterable of hashable items; copied as-is (duplicates kept).
    second_list : iterable of hashable items.

    Returns
    -------
    list
        A new list; neither input is modified.
    """
    seen = set(first_list)
    result = list(first_list)
    for item in second_list:
        if item not in seen:
            seen.add(item)  # also collapses duplicates inside second_list
            result.append(item)
    return result

def score(services, S):
    """Score a set of services against the requested categories `S`.

    Parameters
    ----------
    services : iterable
        Service IRIs (str or URIRef). Each one is interpolated into SPARQL
        queries that run against the module-level graph `g`.
    S : sized collection
        Requested categories; membership is tested with ``in``.

    Returns
    -------
    list
        ``[sc, re, 0.3 * dt]`` where

        * ``re`` is the number of distinct primary categories of `services`
          found in `S` plus 0.3 times the number of distinct secondary
          categories found in `S`,
        * ``sc`` is ``re`` divided by ``len(S)`` (``0.0`` when `S` is empty;
          this used to raise ``ZeroDivisionError``),
        * ``dt`` is the sum of the per-service activation scores.

    Notes
    -----
    Categories are fetched once per call. The previous version queried the
    graph three times for the same data.
    """
    SECONDARY_WEIGHT = 0.3   # weight of a secondary-category match
    ACTIVATION_WEIGHT = 0.3  # weight of the activation-time component
    DECAY_RATE = 4           # b in exp(-b * x); "just an example" in the original
    DAY_STEP = 0.001         # x advances by this much per day
    MAX_DAYS = 3000          # days beyond this horizon score 0

    def select(sparql, variable):
        # Run one SELECT query and collect the bindings of a single variable.
        rows = g.query(sparql, initNs=ns)
        return [binding[variable] for binding in rows.bindings]

    def get_categories(services):
        # Distinct primary and secondary categories over all services.
        primary_categories = []
        secondary_categories = []
        for service in services:
            for category in select("""SELECT ?c WHERE {
                <%s> api_network:primaryCategory ?c .}""" % service, "?c"):
                if category not in primary_categories:
                    primary_categories.append(category)
            for category in select("""SELECT ?c WHERE {
                <%s> api_network:secondaryCategory ?c .}""" % service, "?c"):
                if category not in secondary_categories:
                    secondary_categories.append(category)
        return (primary_categories, secondary_categories)

    def activation_time_mapping_function(n_days):
        # Same values as indexing exp(-4 * x) sampled on arange(0, 3, 0.001),
        # without building the 3000-element array on every call. Days outside
        # the sampled range score 0 (the old lookup raised and was swallowed).
        if not 0 <= n_days < MAX_DAYS:
            return 0
        return np.exp(-DECAY_RATE * (n_days * DAY_STEP))

    def dt_for_single_service(service):
        # Activation score: based on the days between the service's
        # registration and the first mashup that includes it afterwards.
        reg_dates = [d.toPython() for d in select("""SELECT ?d WHERE {
                    <%s> api_network:registrationDate ?d .
                    }""" % service, "?d")]
        if not reg_dates:
            return 0  # no registration date recorded: treat as never activated
        reg_date = reg_dates[0]
        mashup_reg_dates = [d.toPython() for d in select("""SELECT ?d1 WHERE {
                    ?m gr:include <%s> .
                    ?m api_network:registrationDate ?d1 .
                    }""" % service, "?d1")]
        # Ignore mashups registered before the service itself.
        later_dates = [d for d in mashup_reg_dates if d >= reg_date]
        if not later_dates:
            return 0  # service has not been activated
        try:
            n_days = (min(later_dates) - reg_date).days
        except (TypeError, AttributeError):
            # Values that could not be converted to dates cannot be subtracted;
            # keep the original best-effort behaviour and score 0.
            return 0
        return activation_time_mapping_function(n_days)

    def dt(services):
        result = 0
        for service in services:
            result = result + dt_for_single_service(service)
        return result

    primary_categories, secondary_categories = get_categories(services)
    primary_hits = len([c for c in primary_categories if c in S])
    secondary_hits = len([c for c in secondary_categories if c in S])
    relevance = primary_hits + SECONDARY_WEIGHT * secondary_hits
    # Matched categories normalised by the number of requested categories.
    coverage = relevance / len(S) if len(S) else 0.0
    return [coverage, relevance, ACTIVATION_WEIGHT * dt(services)]

In [8]:
# Smoke test: score one service against the tags of one mashup and display
# the sum of the three score components.
mashup = "http://www.programmableweb.com/mashup/8by1"
service = "http://www.programmableweb.com/api/google-maps"
user_query = compose_query(mashup)  # (k, categories, reg_date, involved_services)
sum(score([service], user_query[1]))  # user_query[1] holds the mashup's tags

1.7988023968031974

In [15]:
#select services which provides one or more categories
# TODO add filter date
def get_services_by_category(category, reg_date):
    """Return the distinct services of `category` that some mashup includes.

    `category` is appended to the ``cat:`` prefix in the query, so it must be
    the category's local name, not its full IRI. A service matches when the
    category is its primary or its secondary category.

    `reg_date` is accepted but not used yet (see the TODO above this function).
    """
    sparql = """SELECT DISTINCT ?s WHERE {?s ?p api_network:API .
                ?s api_network:primaryCategory|api_network:secondaryCategory cat:%s .
                ?m ?p api_network:Mashup .
                ?m gr:include ?s .
                }""" % category
    matches = []
    for binding in g.query(sparql, initNs=ns).bindings:
        matches.append(binding["?s"])
    return matches

def candidate_set(query):
    """Collect the candidate services for a query, without duplicates.

    Parameters
    ----------
    query : tuple
        As returned by ``compose_query``: ``query[1]`` holds the requested
        category IRIs and ``query[2]`` the registration date that is passed on
        to ``get_services_by_category``.

    Returns
    -------
    list
        Services offering at least one requested category, in first-seen
        order. The previous version appended set differences, which made the
        order (and so the tie-breaking in ``Greedy``) vary between runs.
    """
    category_prefix = "http://www.programmableweb.com/category/"
    reg_date = query[2]
    candidates = []
    seen = set()
    for category in query[1]:
        # get_services_by_category expects the local name, not the full IRI.
        local_name = category.toPython().replace(category_prefix, "")
        for service in get_services_by_category(local_name, reg_date):
            if service not in seen:
                seen.add(service)
                candidates.append(service)
    return candidates

In [16]:
# Build the recommendation inputs for one example mashup and print them.
mashup = "http://www.programmableweb.com/mashup/packagemapper.com"
user_query = compose_query(mashup)  # (k, categories, reg_date, involved_services)
M = candidate_set(user_query) # list of candidate services
S = user_query[1] # list of requested tags
k = user_query[0] # length of required recommendation set
print(k)
print(S)
print(len(M))

3
[rdflib.term.URIRef('http://www.programmableweb.com/category/mapping'), rdflib.term.URIRef('http://www.programmableweb.com/category/shipping')]
158


In [17]:
#main experiment
def Greedy(S, M, k, e=0.1, score_fn=None):
    """Greedily pick up to `k` services from `M` that maximise the total score.

    At each step every remaining candidate is added in turn to the current
    selection, the extended set is scored, and the best extension is kept.
    Ties go to the candidate that comes first in `M`.

    Parameters
    ----------
    S : collection
        Requested categories, passed through to the scoring function.
    M : iterable
        Candidate services. It is not modified.
    k : int
        Number of services to select.
    e : float, optional
        Unused; kept for backward compatibility.
    score_fn : callable, optional
        ``score_fn(services, S)`` returning an iterable of score components
        that are summed. Defaults to the notebook's ``score``.

    Returns
    -------
    tuple
        ``(I, B)``: the selected services and the summed score of that
        selection. If there are fewer than `k` candidates the selection is
        shorter than `k` (this used to raise ``ValueError`` from ``max([])``).
        With no selection at all the result is ``([], 0)``.
    """
    if score_fn is None:
        score_fn = score  # resolved at call time so later redefinitions are picked up
    I = []  # recommendation set of services
    B = 0   # score of the current recommendation set
    candidates = list(M)
    for _ in range(k):
        if not candidates:
            break  # ran out of candidates before reaching k
        best_set = None
        best_score = None
        for service in candidates:
            trial_set = I + [service]
            trial_score = sum(score_fn(trial_set, S))
            # Strict comparison keeps the first of several equal maxima.
            if best_set is None or trial_score > best_score:
                best_set, best_score = trial_set, trial_score
        I, B = best_set, best_score
        candidates = [x for x in candidates if x not in I]  # drop the chosen service
    return (I, B)
    
            
        

In [34]:
# Run the greedy selection on the example inputs S, M and k.
# NOTE(review): the recorded output shows per-step prints that are commented
# out in the current Greedy, so it presumably comes from an earlier version;
# re-run to refresh.
result = Greedy(S, M, k)

[rdflib.term.URIRef('http://www.programmableweb.com/api/zeemaps')]
2.5
[rdflib.term.URIRef('http://www.programmableweb.com/api/zeemaps'), rdflib.term.URIRef('http://www.programmableweb.com/api/shipstation')]
3.5
[rdflib.term.URIRef('http://www.programmableweb.com/api/zeemaps'), rdflib.term.URIRef('http://www.programmableweb.com/api/shipstation'), rdflib.term.URIRef('http://www.programmableweb.com/api/globexplorer')]
4.49600798934


In [35]:
# Display the (selected services, summed score) tuple returned by Greedy.
result

([rdflib.term.URIRef('http://www.programmableweb.com/api/zeemaps'),
  rdflib.term.URIRef('http://www.programmableweb.com/api/shipstation'),
  rdflib.term.URIRef('http://www.programmableweb.com/api/globexplorer')],
 4.4960079893439913)

In [18]:
def compute_precision(result, query):
    """Share of the recommendation that matches the mashup's real services.

    Parameters
    ----------
    result : tuple
        ``(recommended_services, score)`` as returned by ``Greedy``.
    query : tuple
        As returned by ``compose_query``; ``query[3]`` holds the services the
        mashup actually uses.

    Returns
    -------
    float
        Number of actual services found in the recommendation divided by the
        size of the recommendation. ``0.0`` for an empty recommendation (this
        used to raise ``ZeroDivisionError``).
    """
    recommended = result[0]
    if not recommended:
        return 0.0
    actual_services = query[3]
    hits = [val for val in actual_services if val in recommended]
    return len(hits) / float(len(recommended))
    
def compute_recall(result, query):
    """Share of the mashup's real services that the recommendation recovered.

    Parameters
    ----------
    result : tuple
        ``(recommended_services, score)`` as returned by ``Greedy``.
    query : tuple
        As returned by ``compose_query``; ``query[3]`` holds the services the
        mashup actually uses.

    Returns
    -------
    float
        Number of actual services found in the recommendation divided by the
        number of actual services. ``0.0`` when the mashup has no recorded
        services (this used to raise ``ZeroDivisionError``).
    """
    actual_services = query[3]
    if not actual_services:
        return 0.0
    recommended = result[0]
    hits = [val for val in actual_services if val in recommended]
    return len(hits) / float(len(actual_services))

In [29]:
#lets try to emulate a single experiment
def single_experiment(mashup):
    """Recommend services for one mashup and evaluate the recommendation.

    The query is derived from the mashup itself: its tags are the requested
    categories and its number of included services is the recommendation size.
    Returns ``(precision, recall)`` of the greedy recommendation measured
    against the services the mashup actually includes.
    """
    user_query = compose_query(mashup)
    candidates = candidate_set(user_query)
    wanted_size, requested_tags = user_query[0], user_query[1]
    recommendation = Greedy(requested_tags, candidates, wanted_size)
    return (
        compute_precision(recommendation, user_query),
        compute_recall(recommendation, user_query),
    )

In [20]:
# select mashups for experiment
# One row per (mashup, registration date) pair found in the graph.
rows = g.query("""SELECT ?m ?d WHERE {?m ?p api_network:Mashup .
                    ?m api_network:registrationDate ?d .}""" , initNs=ns)
mashup_regs = DataFrame(
    {
        "Mashup": [t["?m"].toPython() for t in rows.bindings],
        "Registration": [t["?d"].toPython() for t in rows.bindings],
    },
    columns=["Mashup", "Registration"],
)

In [39]:
# Keep only the mashups registered after 2015-01-01 and show how many remain.
mashup_for_experiment = mashup_regs.loc[
    mashup_regs["Registration"] > datetime.date(2015, 1, 1), "Mashup"
]
len(mashup_for_experiment)

16

In [37]:
# Run the experiment for every selected mashup and collect one
# (Mashup, Precision, Recall) row per mashup. The frame is built once from all
# rows: growing it with DataFrame.append inside the loop is quadratic, and
# DataFrame.append no longer exists in pandas 2.x. The comprehension also keeps
# the loop variable from overwriting the global `mashup`.
precision_recall_frame = DataFrame(
    [(mashup,) + tuple(single_experiment(mashup)) for mashup in mashup_for_experiment],
    columns=["Mashup", "Precision", "Recall"],
)

In [40]:
#precision_recall_frame

In [52]:
# Score a hand-picked set of three services against the requested tags S.
# NOTE(review): the recorded output has sc * len(S) != re, which the current
# score() cannot produce — it presumably comes from an earlier version; re-run.
services = ["http://www.programmableweb.com/api/geoplugin", "http://www.programmableweb.com/api/zeemaps", "http://www.programmableweb.com/api/globexplorer"]
score(services, S)

[0.65, 1.1, 0.74022922592493667]

In [53]:
# Score a second hand-picked set of three services against the same tags S,
# for comparison with the previous cell. This rebinds the global `services`.
services = ["http://www.programmableweb.com/api/fedex", "http://www.programmableweb.com/api/google-maps", "http://www.programmableweb.com/api/yahoo-geocoding"]
score(services, S)

[1.0, 0.4, 0.77992822142872187]

# Run the experiment for 65 mashups published after 01-01-2015

In [5]:
# Load precision/recall results from CSV and display them.
# NOTE(review): presumably written by an earlier run of the experiment; the
# step that saves this file is not visible here. The "Unnamed: 0" columns in
# the output look like index columns saved along with the data — confirm.
table = pnd.read_csv("precision_recall_df.csv")
table

Unnamed: 0.1,Unnamed: 0,Mashup,Precision,Recall
0,0,http://www.programmableweb.com/mashup/anything...,1.000000,1.000000
1,1,http://www.programmableweb.com/mashup/moonotes,0.000000,0.000000
2,2,http://www.programmableweb.com/mashup/ifixit-a...,0.000000,0.000000
3,3,http://www.programmableweb.com/mashup/maco-tra...,1.000000,1.000000
4,4,http://www.programmableweb.com/mashup/magento-...,0.000000,0.000000
5,5,http://www.programmableweb.com/mashup/placeprint,0.000000,0.000000
6,6,http://www.programmableweb.com/mashup/idealist...,0.000000,0.000000
7,7,http://www.programmableweb.com/mashup/sap-fico...,0.000000,0.000000
8,8,http://www.programmableweb.com/mashup/weglore-...,0.000000,0.000000
9,9,http://www.programmableweb.com/mashup/equaldex...,0.333333,0.333333


In [4]:
# Show only the mashups for which at least one recommended service was correct.
table[table["Precision"]>0]

Unnamed: 0.1,Unnamed: 0,Mashup,Precision,Recall
0,0,http://www.programmableweb.com/mashup/anything...,1.0,1.0
3,3,http://www.programmableweb.com/mashup/maco-tra...,1.0,1.0
9,9,http://www.programmableweb.com/mashup/equaldex...,0.333333,0.333333
10,10,http://www.programmableweb.com/mashup/99design...,0.5,0.5
18,18,http://www.programmableweb.com/mashup/zilyo-va...,0.2,0.2
22,22,http://www.programmableweb.com/mashup/ibeeking,1.0,1.0
39,39,http://www.programmableweb.com/mashup/windshie...,1.0,1.0
40,40,http://www.programmableweb.com/mashup/intellin...,0.5,0.5
48,48,http://www.programmableweb.com/mashup/stel-order,1.0,1.0
49,49,http://www.programmableweb.com/mashup/mashit,1.0,1.0


In [151]:
# Scratch check: pick the element of `result` with the largest value at index 1.
# NOTE(review): the recorded output suggests `result` was a list of
# (services, score) pairs when this ran, not the tuple returned by Greedy —
# confirm before re-running.
import operator
#max(b_table,key=itemgetter(1))[0]
max(result, key=operator.itemgetter(1))

([rdflib.term.URIRef('http://www.programmableweb.com/api/zeemaps')], 2.25)

In [122]:
# Scratch check: sort a dict's (key, value) pairs by value, ascending.
import operator
stats = {'a':3000, 'b':3000, 'c': 100}
sorted_x = sorted(stats.items(), key=operator.itemgetter(1))  # sort on the value
sorted_x
#b_table[max(b_table, key=b_table.get)]

[('c', 100), ('a', 3000), ('b', 3000)]