In [29]:
import sqlite3
import pandas as pd
import math
import numpy as np
from scipy.interpolate import interp1d

In [40]:
con = sqlite3.connect("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc1.db")
df = pd.read_sql("SELECT * from data", con)

myProperties = [
"nmi",
"randMeasure",
"minCommunitySize",
"maxCommunitySize",
"avgCommunitySize",
"numCommunities",
"modularity",

"ccGlobal",
"ccAvgLocal",
"cc_spearman_p",
"cc_spearman_rho",
"cc_ks_d",
"cc_ks_p",
"cc_perDegree_ks_d",
"cc_perDegree_ks_p",
"cc_normalizedAbsDiff",
"cc_relRankError",

"diameter",

"dd_spearman_rho",
"dd_spearman_p",
"dd_powerLawFit",
"dd_relRankError",
"dd_ks_d",
"dd_ks_p",
"dd_distCoefficient",
"dd_normalizedAbsDiff",

"pagerank_spearman_rho",
"pagerank_spearman_p",
"pagerank_relRankError",

"wcc_nmi",
"wcc_count",
"wcc_sizes_ks",
"wcc_sizes_p",

"edgeRatio",
"nodeRatio",
"numEdges",
"numNodes"
"parameter",
"rt_backbone",
"rt_attribute",
"evalExpr"
]

In [41]:
#Prepare empty dataframe
myColumns = ["property, algorithm, graph, targetEdgeRatio"].append(myProperties)
df = pd.DataFrame(index=[], columns=myColumns)

#Expand dataframe
c = con.cursor()
indexMap = {}
currentIndex = 1
c.execute("SELECT algorithm, graph, targetEdgeRatio FROM data GROUP BY algorithm, graph, targetEdgeRatio")
data = c.fetchall()
for row in data:
    cAlgorithm = row[0]
    cGraph = row[1]
    cTER = row[2]
    df.loc[currentIndex, "algorithm"] = cAlgorithm
    df.loc[currentIndex, "graph"] = cGraph
    df.loc[currentIndex, "targetEdgeRatio"] = cTER
    indexMap[(cAlgorithm, cGraph, cTER)] = currentIndex
    currentIndex = currentIndex + 1

In [42]:
#Fill dataframe
c.execute("SELECT algorithm, graph, targetEdgeRatio, property, value FROM data")
data = c.fetchall()
currentRow = 0
for row in data:
    if currentRow % 10000 == 0:
        print("Processing row ", currentRow, "...")
    cAlgorithm = row[0]
    cGraph = row[1]
    cTER = row[2]
    cProperty = row[3]
    cValue = row[4]
    index = indexMap[(cAlgorithm, cGraph, cTER)]
    df.loc[index, cProperty] = cValue
    currentRow = currentRow + 1

Processing row  0 ...
Processing row  10000 ...
Processing row  20000 ...
Processing row  30000 ...
Processing row  40000 ...
Processing row  50000 ...
Processing row  60000 ...
Processing row  70000 ...
Processing row  80000 ...
Processing row  90000 ...
Processing row  100000 ...
Processing row  110000 ...
Processing row  120000 ...
Processing row  130000 ...
Processing row  140000 ...
Processing row  150000 ...
Processing row  160000 ...
Processing row  170000 ...
Processing row  180000 ...
Processing row  190000 ...
Processing row  200000 ...
Processing row  210000 ...
Processing row  220000 ...
Processing row  230000 ...
Processing row  240000 ...
Processing row  250000 ...
Processing row  260000 ...
Processing row  270000 ...
Processing row  280000 ...
Processing row  290000 ...
Processing row  300000 ...
Processing row  310000 ...
Processing row  320000 ...
Processing row  330000 ...
Processing row  340000 ...
Processing row  350000 ...
Processing row  360000 ...
Processing row 

In [43]:
df.to_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc1_pandas.csv")

###### Averaging...

In [56]:
df = pd.read_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc1_pandas.csv")

In [61]:
ters = [0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
algorithms = ["Original", "Simmelian Parametric", "Local Similarity", "Simmelian Multiscale", "Random", "Local Degree", "ForestFire", "Degree Multiscale max", "Multiscale"]
graphGroupAll = ["Karate","USAviation","KitEmail","LFR-1000","PGP","BTER","ErdosRenyi","Jazz","HepTh","HepPh","Epinions","AS","eu-2005","in-2004","test.fiber.small","fb-American75","fb-Caltech36","fb-Haverford76","fb-Santa74","fb-UC33","fb-UMass92","fb-WashU32","fb-_Amherst41","fb-Carnegie49","fb-Howard90","fb-Northeastern19","fb-Simmons81","fb-UC61","fb-UNC28","fb-Wellesley22","fb-Auburn71","fb-Colgate88","fb-Indiana69","fb-Northwestern25","fb-Smith60","fb-UC64","fb-UPenn7","fb-Wesleyan43","fb-Baylor93","fb-Columbia2","fb-JMU79","fb-NYU9","fb-Stanford3","fb-UCF52","fb-USC35","fb-William77","fb-BC17","fb-Cornell5","fb-Lehigh96","fb-Oberlin44","fb-Swarthmore42","fb-UChicago30","fb-USF51","fb-Williams40","fb-Berkeley13","fb-Dartmouth6","fb-Maine59","fb-Oklahoma97","fb-Syracuse56","fb-UCLA26","fb-USFCA72","fb-Wisconsin87","fb-Bingham82","fb-Duke14","fb-Maryland58","fb-Temple83","fb-UConn91","fb-Yale4","fb-Bowdoin47","fb-Emory27","fb-Mich67","fb-Pepperdine86","fb-Tennessee95","fb-UCSB37","fb-Vanderbilt48","fb-Brandeis99","fb-FSU53","fb-Michigan23","fb-Princeton12","fb-Texas80","fb-UCSC68","fb-Vassar85","fb-Brown11","fb-Georgetown15","fb-Middlebury45","fb-Reed98","fb-Texas84","fb-UCSD34","fb-Vermont70","fb-BU10","fb-GWU54","fb-Mississippi66","fb-Rice31","fb-Trinity100","fb-UF21","fb-Villanova62","fb-Bucknell39","fb-Hamilton46","fb-MIT8","fb-Rochester38","fb-Tufts18","fb-UGA50","fb-Virginia63","fb-Cal65","fb-Harvard1","fb-MSU24","fb-Rutgers89","fb-Tulane29","fb-UIllinois20","fb-Wake73"]
graphGroupFacebook = ["fb-American75","fb-Auburn71","fb-BC17","fb-BU10","fb-Baylor93","fb-Berkeley13","fb-Bingham82","fb-Bowdoin47","fb-Brandeis99","fb-Brown11","fb-Bucknell39","fb-Cal65","fb-Caltech36","fb-Carnegie49","fb-Colgate88","fb-Columbia2","fb-Cornell5","fb-Dartmouth6","fb-Duke14","fb-Emory27","fb-FSU53","fb-GWU54","fb-Georgetown15","fb-Hamilton46","fb-Harvard1","fb-Haverford76","fb-Howard90","fb-Indiana69","fb-JMU79","fb-Lehigh96","fb-MIT8","fb-MSU24","fb-Maine59","fb-Maryland58","fb-Mich67","fb-Michigan23","fb-Middlebury45","fb-Mississippi66","fb-NYU9","fb-Northeastern19","fb-Northwestern25","fb-Oberlin44","fb-Oklahoma97","fb-Pepperdine86","fb-Princeton12","fb-Reed98","fb-Rice31","fb-Rochester38","fb-Rutgers89","fb-Santa74","fb-Simmons81","fb-Smith60","fb-Stanford3","fb-Swarthmore42","fb-Syracuse56","fb-Temple83","fb-Tennessee95","fb-Texas80","fb-Texas84","fb-Trinity100","fb-Tufts18","fb-Tulane29","fb-UC33","fb-UC61","fb-UC64","fb-UCF52","fb-UCLA26","fb-UCSB37","fb-UCSC68","fb-UCSD34","fb-UChicago30","fb-UConn91","fb-UF21","fb-UGA50","fb-UIllinois20","fb-UMass92","fb-UNC28","fb-UPenn7","fb-USC35","fb-USF51","fb-USFCA72","fb-Vanderbilt48","fb-Vassar85","fb-Vermont70","fb-Villanova62","fb-Virginia63","fb-Wake73","fb-WashU32","fb-Wellesley22","fb-Wesleyan43","fb-William77", "fb-Williams40", "fb-Wisconsin87", "fb-Yale4", "fb-_Amherst41"]

In [62]:
averageableProperties = [
#community
"randMeasure", "nmi", "ccAvgLocal", "modularity", "numCommunities",
#degree distribution
"dd_spearman_rho", "dd_ks_d",
#clustering coefficient
"ccGlobal", "ccAvgLocal", "cc_ks_d", "cc_perDegree_ks_d", "cc_spearman_rho", "cc_normalizedAbsDiff",
#weakly connected components
"wcc_nmi", "wcc_sizes_ks",
#pagerank
"pagerank_spearman_rho",
#diameter
"diameter"]

In [63]:
def getInterpolation(prop, algorithm, graph):
    df2 = df[(df.graph==graph) & (df.algorithm==algorithm)].sort(["edgeRatio"])
    edgeRatios = df2['edgeRatio']
    values = df2[prop]
    
    if algorithm == 'Original':
        return lambda x: values
    
    if len(edgeRatios) == 0 or len(values) == 0:
        return lambda x: exec('raise(ValueError("Missing data"))')
    
    if len(edgeRatios) < 2 or len(values) < 2:
        raise ValueError("arrays must have at least 2 entries.", prop,algorithm,graph)
    
    return interp1d(edgeRatios, values, kind='linear')

In [64]:
def aggregate(graphsToAggregate):
    rows = {}
    for algorithm in algorithms:        
        for prop in averageableProperties:
            functions_ratioToPropertyValue = []
            for graph in graphsToAggregate:
                functions_ratioToPropertyValue.append(getInterpolation(prop, algorithm, graph))
            
            for ratio in ters:
                myValues = []
                for interpFunction in functions_ratioToPropertyValue:
                    try:
                        interpolatedValue = float(interpFunction(ratio))
                        myValues.append(interpolatedValue)
                    except ValueError:
                        pass 
                
                if len(myValues) > 0 and (algorithm != 'Original' or ratio==1.0):
                    avgValue = np.average(myValues)
                    if not (algorithm, ratio) in rows:
                        rows[(algorithm, ratio)] = {}
                    rows[(algorithm, ratio)][prop] = avgValue
    return rows

In [65]:
def getRowDictionaries(aggregationResult, groupName):
    dictionaries = []
    for (algorithm, ratio) in aggregationResult:
        myDict = aggregationResult[(algorithm, ratio)]
        myDict['algorithm'] = algorithm
        myDict['targetEdgeRatio'] = ratio
        myDict['edgeRatio'] = ratio
        myDict['graph'] = groupName
        dictionaries.append(myDict)
    return dictionaries

In [66]:
aggregationResult = aggregate(graphGroupFacebook)
rowDictionaries = getRowDictionaries(aggregationResult, 'fb')
df = pd.concat([df, pd.DataFrame(rowDictionaries)])

In [67]:
aggregationResult = aggregate(graphGroupAll)
rowDictionaries = getRowDictionaries(aggregationResult, 'all')
df = pd.concat([df, pd.DataFrame(rowDictionaries)])

In [68]:
df.to_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc3_pandas.csv")

## Calulate a normalized distance measure for the number of communities

In [53]:
df[(df.graph=='fb-Caltech36') & (df.algorithm=='Local Similarity')]

Unnamed: 0.1,Unnamed: 0,algorithm,avgCommunitySize,ccAvgLocal,ccGlobal,cc_ks_d,cc_ks_p,cc_normalizedAbsDiff,cc_perDegree_ks_d,cc_perDegree_ks_p,...,pagerank_spearman_rho,parameter,randMeasure,rt_attribute,rt_backbone,targetEdgeRatio,wcc_count,wcc_nmi,wcc_sizes_ks,wcc_sizes_p
4970,4971,Local Similarity,5.870229,0.0,0.0,0.945384,9.477945e-303,0.409294,0.546185,0.003579557,...,0.999438,0.875,0.454431,0.008105,0.000457,0.01,131,0.973453,0.299618,0.802806
4971,4972,Local Similarity,5.870229,0.0,0.0,0.945384,9.477945e-303,0.409294,0.546185,0.003579557,...,0.999438,0.875,0.454431,0.008105,0.000405,0.02,131,0.973453,0.299618,0.802806
4972,4973,Local Similarity,9.26506,0.130515,0.102731,0.820546,3.536902e-228,0.403924,0.534137,0.0005491998,...,0.999762,0.827148,0.337956,0.008105,0.000398,0.05,68,0.934091,0.25,0.941413
4973,4974,Local Similarity,26.517241,0.300623,0.223667,0.452536,1.0671600000000001e-69,0.274691,0.362906,0.006881429,...,0.999994,0.666504,0.206892,0.008105,0.000429,0.1,15,0.756569,0.25,0.970132
4974,4975,Local Similarity,33.434783,0.370932,0.292588,0.261378,1.553174e-23,0.231769,0.334429,0.002095297,...,0.999997,0.563477,0.121698,0.008105,0.000435,0.15,11,0.64805,0.25,0.978087
4975,4976,Local Similarity,40.473684,0.410512,0.328999,0.140442,4.256921e-07,0.195919,0.350181,0.0002367442,...,0.999999,0.486328,0.071446,0.008105,0.000455,0.2,8,0.589689,0.25,0.985745
4976,4977,Local Similarity,42.722222,0.459112,0.420026,0.083225,0.009079743,0.170327,0.578956,4.547945e-13,...,0.999999,0.378515,0.067543,0.008105,0.000452,0.3,5,0.127702,0.25,0.994331
4977,4978,Local Similarity,51.266667,0.480452,0.443433,0.143043,2.39717e-07,0.153268,0.649757,4.032382e-18,...,0.999999,0.299683,0.050732,0.008105,0.000457,0.4,5,0.127702,0.25,0.994331
4978,4979,Local Similarity,54.928571,0.497319,0.423927,0.178153,3.672473e-11,0.152851,0.617097,3.652167e-19,...,0.999999,0.234192,0.047791,0.008105,0.000464,0.5,5,0.127702,0.25,0.994331
4979,4980,Local Similarity,54.928571,0.504148,0.421326,0.188557,1.881491e-12,0.141209,0.483637,9.683161e-15,...,0.999999,0.178345,0.036924,0.008105,0.000461,0.6,5,0.127702,0.25,0.994331


In [59]:
for graph in graphGroupAll:
    originalCommunityCount = float(df[(df.graph=='fb-Caltech36') & (df.algorithm=='Original')]["numCommunities"])
    
    df['numCommunitiesSim'] = df['numCommunities'].map(lambda communityCount: math.exp(-(communityCount/originalCommunityCount)))

In [60]:
df.to_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc2_pandas.csv")