In [4]:
from rdflib import Graph, RDF, URIRef, Namespace
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
import experiment as exp


In [2]:
# Load the experiment graph from its Turtle serialisation.
g = Graph()
g.parse(source="../graph/experiment_graph.ttl", format="turtle")

# Namespace objects: FOAF and the local API-network ontology.
foaf = Namespace("http://xmlns.com/foaf/0.1/")
localNS = Namespace("http://deepweb.ut.ee/ontologies/api-network#")

In [3]:
# Prefix -> namespace map; the SPARQL queries in this notebook receive it via ``initNs``.
ns = {
    "api_network": localNS,
    "cat": Namespace("http://www.programmableweb.com/category/"),
    "rdf": RDF,
    "gr": Namespace("http://purl.org/goodrelations/v1#"),
    "pw_api": Namespace("http://www.programmableweb.com/api/"),
}

### Preparing the diffusion matrix

In [49]:
def count_num_of_transitions(cat1, cat2):
    """Count (API, mashup) result rows linking an API category to a mashup tag.

    The query matches every ``?s`` typed ``api_network:API`` whose primary or
    secondary category is ``cat:<cat1>`` and every ``api_network:Mashup``
    ``?m`` that includes ``?s`` and is tagged ``cat:<cat2>``.

    Uses the notebook-level graph ``g`` and prefix map ``ns``.

    Args:
        cat1: API category name (local part appended to the ``cat:`` prefix).
        cat2: Mashup tag name (local part appended to the ``cat:`` prefix).

    Returns:
        Number of rows in the query result.
    """
    # Both names are spliced into the query text as prefixed names.
    sparql = """SELECT ?s ?m WHERE { ?s ?p api_network:API .
                    ?m rdf:type api_network:Mashup .
                    ?m gr:include ?s .
                    ?s api_network:primaryCategory|api_network:secondaryCategory cat:%s. 
                    ?m api_network:tag cat:%s}""" % (cat1, cat2)
    matches = g.query(sparql, initNs=ns)
    return len(matches)
import time

# Time one representative query. perf_counter() is a monotonic,
# high-resolution clock meant for measuring short intervals; time.time()
# follows the wall clock, which can be adjusted while the query is running.
start_time = time.perf_counter()
print(count_num_of_transitions("mapping", "england"))
print("execution time: %s " % (time.perf_counter() - start_time))

168
execution time: 0.08255815505981445 ----


In [11]:
def count_popular_categories():
    """Return the categories that are used as a primary or secondary category.

    Runs a grouped SPARQL query on the notebook-level graph ``g`` (prefixes
    from ``ns``) that counts, per category ``?c``, how many times it appears
    as ``api_network:primaryCategory`` or ``api_network:secondaryCategory``.

    Returns:
        pandas Series named ``"Category"`` holding the ``toPython()`` value of
        every category whose count is greater than zero. No sorting is
        applied; rows keep the order of the query bindings.
    """
    rows = g.query("""SELECT ?c (COUNT(?c) as ?catCount) WHERE  {
                    ?s api_network:primaryCategory|api_network:secondaryCategory ?c .
                    }
                    GROUP BY ?c""", initNs=ns)
    bindings = rows.bindings

    frame = DataFrame()
    frame["Category"] = [binding["?c"].toPython() for binding in bindings]
    frame["Count"] = [binding["?catCount"].toPython() for binding in bindings]

    # Threshold is 0, so every category that occurs at least once is kept.
    used_at_least_once = frame["Count"] > 0
    return frame.loc[used_at_least_once, "Category"]

In [12]:
# Series of category identifiers that occur at least once as a primary or secondary category.
selected = count_popular_categories()

In [17]:
# Strip the ProgrammableWeb category prefix so only the bare category name is left.
selected = [
    category_uri.replace("http://www.programmableweb.com/category/", "")
    for category_uri in selected
]

In [93]:
#run at home to get a complete version
import itertools
result = []
for pair in itertools.product(selected, repeat=2):
    result.append(list(pair)+[count_num_of_transitions(*pair)])

In [96]:
# Long-format table: one row per (API category, mashup category) pair plus its count.
diffusion_frame = DataFrame(
    data=result,
    columns=["API Category", "Mashup Category", "Num of Transactions"],
)

In [97]:
# Reshape to a matrix: rows are API categories, columns are mashup categories,
# cells hold the transition counts. The arguments are passed by keyword
# because DataFrame.pivot no longer accepts them positionally in pandas >= 2.0.
diffusion_frame_for_weights = diffusion_frame.pivot(
    index="API Category",
    columns="Mashup Category",
    values="Num of Transactions",
)

In [165]:
# Disabled export of the matrix built above:
#diffusion_frame_for_weights.to_csv("diffusion_matrix.csv")

# Replace the in-memory matrix with the copy stored on disk (first CSV column
# becomes the index) and cast every column to float.
diffusion_frame_for_weights = pd.read_csv(
    "diffusion_matrix.csv", index_col=0
).astype(float)
#diffusion_frame_for_weights.dtypes

In [166]:
# Normalise every column of the diffusion matrix in place:
#   * an all-zero column (sum == 0) is left untouched;
#   * otherwise the column is divided by its diagonal entry (the cell whose
#     row label equals the column label) when that entry is non-zero;
#   * otherwise it is divided by the column maximum.
# Whole-column division replaces the per-cell Series.iteritems() /
# DataFrame.set_value() loop; both of those methods have been removed from pandas.
for column in diffusion_frame_for_weights:
    column_values = diffusion_frame_for_weights[column]
    diagonal = column_values[column]
    if column_values.sum() == 0:
        continue
    divisor = diagonal if diagonal != 0 else column_values.max()
    diffusion_frame_for_weights[column] = column_values / divisor

In [168]:
#diffusion_frame_for_weights.to_csv("weighted_diffusion_matrix.csv")

In [205]:
# Load the weighted diffusion matrix from disk; the first CSV column becomes the index.
df_weights = pd.read_csv("weighted_diffusion_matrix.csv", index_col=0)

In [207]:
# Spot check of one diagonal cell (row "3d", column "3d").
df_weights.loc["3d", "3d"]

1.0

### Testing SC with the diffusion matrix

In [179]:
# IRI of the mashup that is passed to exp.compose_query below.
mashup = "http://www.programmableweb.com/mashup/semaflickr"

In [180]:
# Build the query for this mashup; later cells read its "categories",
# "services" and "reg_date" entries.
query = exp.compose_query(g, mashup)

In [183]:
# S: categories stored in the query.
S = query["categories"]
# M: candidate set produced by exp.candidate_set for this query.
M = exp.candidate_set(query, g)
# actual: services stored in the query (presumably the ground truth — TODO confirm).
actual = query["services"]
# k: passed to exp.Greedy below (presumably the number of services to pick — TODO confirm).
k=5

In [184]:
# NOTE(review): the meaning of the trailing arguments 1, 0, 1 is defined by
# exp.Greedy and is not visible in this notebook — confirm against the experiment module.
result = exp.Greedy(g, S, M, k ,1, 0, 1)

In [186]:
# First element of the Greedy result; it is passed as the service list to the scoring calls below.
services = result[0]

In [210]:
# Print the exp.sc score next to the diffusion-weighted score for the same services.
# NOTE: sc_with_weights is defined in a cell further down this notebook, so that
# cell has to be executed before this one.
print(exp.sc(services, S, g))
print(sc_with_weights(services, S, g, diffusion_frame_for_weights))

1.3
2.08230054034


In [208]:
def intersections_with_weight(categories, S, diffusion_frame_for_weights):
    """Sum the matrix weights over every (category, requested category) pair.

    The ProgrammableWeb category prefix is removed from both inputs, then for
    each name ``c`` from ``categories`` and each name ``r`` from ``S`` the
    value ``diffusion_frame_for_weights[c][r]`` is added to the total. Equal
    names are not special-cased; their weight also comes from the matrix.

    NOTE(review): the matrix is addressed here as [service category][requested
    category], whereas candidate_set_with_weights addresses it as
    [requested category][other category] — confirm the intended orientation.

    Args:
        categories: Category URIs (strings) of the services.
        S: Requested category URIs (strings).
        diffusion_frame_for_weights: Two-level lookup, first by a name from
            ``categories``, then by a name from ``S``.

    Returns:
        The accumulated weight; ``0`` when there are no pairs.
    """
    prefix = "http://www.programmableweb.com/category/"
    service_names = [uri.replace(prefix, "") for uri in categories]
    requested_names = [uri.replace(prefix, "") for uri in S]

    pairs = (
        (service_name, requested_name)
        for service_name in service_names
        for requested_name in requested_names
    )
    total_weight = 0
    for service_name, requested_name in pairs:
        total_weight = total_weight + diffusion_frame_for_weights[service_name][requested_name]
    return total_weight

In [201]:
def get_categories(services, g):
    """Collect the distinct primary and secondary categories of some services.

    For every service two SPARQL queries are run on ``g`` (prefixes from the
    notebook-level ``ns``): one for ``api_network:primaryCategory`` and one
    for ``api_network:secondaryCategory``.

    Categories are de-duplicated in first-seen order. The earlier set-difference
    approach produced an ordering that depended on set iteration order and
    rebuilt the sets on every iteration.

    Args:
        services: Iterable of service IRIs; each one is spliced into the query
            text between ``<`` and ``>``.
        g: Graph object exposing ``query(text, initNs=...)`` whose result has
            a ``bindings`` list.

    Returns:
        Tuple ``(primary_categories, secondary_categories)``; both are lists
        without duplicates.
    """
    primary_categories = []
    secondary_categories = []
    seen_primary = set()
    seen_secondary = set()

    for service in services:
        rows = g.query("""SELECT ?c WHERE {
            <%s> api_network:primaryCategory ?c .}""" % service, initNs=ns)
        for binding in rows.bindings:
            category = binding["?c"]
            if category not in seen_primary:
                seen_primary.add(category)
                primary_categories.append(category)

        rows2 = g.query("""SELECT ?c WHERE {
            <%s> api_network:secondaryCategory ?c .}""" % service, initNs=ns)
        for binding in rows2.bindings:
            category = binding["?c"]
            if category not in seen_secondary:
                seen_secondary.add(category)
                secondary_categories.append(category)

    return (primary_categories, secondary_categories)

def get_intersections_with_weights(services, S, g, diffusion_frame_for_weights):
    """Compute the weighted overlap of the services' categories with ``S``.

    Looks up the primary and secondary categories of ``services`` with
    ``get_categories`` and scores each group against ``S`` with
    ``intersections_with_weight``.

    Returns:
        Tuple ``(primary_weight, secondary_weight)``.
    """
    primary_categories, secondary_categories = get_categories(services, g)
    return (
        intersections_with_weight(primary_categories, S, diffusion_frame_for_weights),
        intersections_with_weight(secondary_categories, S, diffusion_frame_for_weights),
    )

In [204]:
def sc_with_weights(services, S, g, diffusion_frame_for_weights, secondary_weight=0.3):
    """Score ``services`` against the requested categories using matrix weights.

    The score is ``(primary + secondary_weight * secondary) / len(S)``, where
    ``primary`` and ``secondary`` are the weighted overlaps returned by
    ``get_intersections_with_weights``.

    Args:
        services: Service IRIs to score.
        S: Requested categories; its length is the normaliser.
        g: Graph the service categories are read from.
        diffusion_frame_for_weights: Category-to-category weight matrix.
        secondary_weight: Factor applied to the secondary-category overlap.
            Defaults to 0.3, the value that used to be hard-coded.

    Returns:
        The weighted score.

    Raises:
        ZeroDivisionError: If ``S`` is empty.
    """
    primary_overlap, secondary_overlap = get_intersections_with_weights(
        services, S, g, diffusion_frame_for_weights
    )
    # The weighted overlap is divided by the number of requested categories.
    return (primary_overlap + secondary_weight * secondary_overlap) / len(S)

In [267]:
import datetime

# Fetch every ?m related to api_network:Mashup together with its registration date.
rows = g.query("""SELECT ?m ?d WHERE {?m ?p api_network:Mashup .
                  ?m api_network:registrationDate ?d .}""", initNs=ns)

mashup_regs = DataFrame()
mashup_regs["Mashup"] = [binding["?m"].toPython() for binding in rows.bindings]
mashup_regs["Registration"] = [binding["?d"].toPython() for binding in rows.bindings]

# Keep only the mashups registered after 13 June 2015.
mashup_for_experiment = mashup_regs.loc[
    mashup_regs["Registration"] > datetime.date(2015, 6, 13), "Mashup"
]
len(mashup_for_experiment)

10

In [214]:
def get_services_by_category(category, reg_date, g):
    """Return the distinct services of a category that some mashup includes.

    The query selects every ``?s`` that is related to ``api_network:API``,
    has ``cat:<category>`` as primary or secondary category, and is the object
    of ``gr:include`` for some ``?m`` related to ``api_network:Mashup``. Both
    type patterns share the variable ``?p``, so they must use the same
    predicate.

    Args:
        category: Category name (local part appended to the ``cat:`` prefix).
        reg_date: Not used by this function.
        g: Graph to query; prefixes come from the notebook-level ``ns``.

    Returns:
        List with the ``?s`` binding of every result row.
    """
    sparql = """SELECT DISTINCT ?s WHERE {?s ?p api_network:API .
                ?s api_network:primaryCategory|api_network:secondaryCategory cat:%s .
                ?m ?p api_network:Mashup .
                ?m gr:include ?s .
                }""" % category
    services = []
    for binding in g.query(sparql, initNs=ns).bindings:
        services.append(binding["?s"])
    return services

In [249]:
def candidate_set_with_weights(query, g, diffusion_frame_for_weights):
    """Build the candidate services reachable through positive matrix weights.

    For each requested category (prefix stripped) the matrix column of that
    name is scanned, and every row label with a value greater than zero is
    collected. The services of all collected categories, as returned by
    ``get_services_by_category``, are merged without duplicates.

    Args:
        query: Mapping with a ``"categories"`` list of category URIs and a
            ``"reg_date"`` entry that is forwarded to ``get_services_by_category``.
        g: Graph forwarded to ``get_services_by_category``.
        diffusion_frame_for_weights: Weight matrix indexed by column name,
            then by row label.

    Returns:
        List of distinct services. The order is not specified, because the
        categories and the newly found services pass through sets.
    """
    prefix = "http://www.programmableweb.com/category/"
    requested_names = [uri.replace(prefix, "") for uri in query["categories"]]

    involved_cats = []
    for requested in requested_names:
        for other in diffusion_frame_for_weights[requested].index:
            if diffusion_frame_for_weights[requested][other] > 0:
                involved_cats.append(other)
    involved_cats = list(set(involved_cats))

    set_of_services = []
    for category in involved_cats:
        found = get_services_by_category(category, query["reg_date"], g)
        # Append only the services that have not been collected yet.
        set_of_services.extend(set(found) - set(set_of_services))
    return set_of_services

In [216]:
# Size of the plain candidate set. candidate_set lives in the experiment module
# (imported as exp); the bare name is never defined in this notebook.
len(exp.candidate_set(query, g))

224

In [250]:
# Size of the candidate set obtained through positive diffusion weights.
len(candidate_set_with_weights(query, g, diffusion_frame_for_weights))

1490

In [256]:
def candidate_set_all_activated(g):
    """Return every distinct service that is included in at least one mashup.

    The query selects each ``?s`` related to ``api_network:API`` that is the
    object of ``gr:include`` for some ``?m`` related to ``api_network:Mashup``.
    Both type patterns share the variable ``?p``.

    Args:
        g: Graph to query; prefixes come from the notebook-level ``ns``.

    Returns:
        List with the ``?s`` binding of every result row.
    """
    sparql = """SELECT DISTINCT ?s WHERE {?s ?p api_network:API .
                    ?m ?p api_network:Mashup .
                    ?m gr:include ?s .
                    }"""
    activated_services = []
    for binding in g.query(sparql, initNs=ns).bindings:
        activated_services.append(binding["?s"])
    return activated_services

In [257]:
# All distinct services that appear in at least one mashup.
activated = candidate_set_all_activated(g)

In [258]:
# Number of such services, for comparison with the candidate-set sizes above.
len(activated)

1491

In [240]:
# Ad-hoc repeat of the first step of candidate_set_with_weights: for every
# requested category, collect the row labels whose weight in that category's
# column is greater than zero. Duplicates are kept on purpose.
involved_cats = []
for q in [q.replace("http://www.programmableweb.com/category/", "") for q in query["categories"]]:
    involved_cats.extend(
        d
        for d in diffusion_frame_for_weights[q].index
        if diffusion_frame_for_weights[q][d] > 0
    )

In [245]:
# Number of collected categories with repeats, then the number of distinct ones.
print(len(involved_cats))
print(len(set(involved_cats)))

360
233
