In [2]:
import sqlite3
import pandas as pd
import math
import numpy as np
from scipy.interpolate import interp1d

In [3]:
# Open the SQLite database that holds the raw evaluation results (table `data`).
con = sqlite3.connect("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc1.db")
# Load the complete `data` table.
# NOTE(review): `df` is reassigned by the "Prepare empty dataframe" step below
# before it is read anywhere, so this load looks redundant - confirm before removing.
df = pd.read_sql("SELECT * from data", con)

# Names of the properties that become columns of the wide result table.
# Every entry needs its trailing comma: adjacent string literals are silently
# concatenated by Python, which previously merged "numNodes" and "parameter"
# into the single bogus name "numNodesparameter".
myProperties = [
    # community
    "nmi",
    "randMeasure",
    "minCommunitySize",
    "maxCommunitySize",
    "avgCommunitySize",
    "numCommunities",
    "modularity",

    # clustering coefficient
    "ccGlobal",
    "ccAvgLocal",
    "cc_spearman_p",
    "cc_spearman_rho",
    "cc_ks_d",
    "cc_ks_p",
    "cc_perDegree_ks_d",
    "cc_perDegree_ks_p",
    "cc_normalizedAbsDiff",
    "cc_relRankError",

    # diameter
    "diameter",

    # degree distribution
    "dd_spearman_rho",
    "dd_spearman_p",
    "dd_powerLawFit",
    "dd_relRankError",
    "dd_ks_d",
    "dd_ks_p",
    "dd_distCoefficient",
    "dd_normalizedAbsDiff",

    # pagerank
    "pagerank_spearman_rho",
    "pagerank_spearman_p",
    "pagerank_relRankError",

    # weakly connected components
    "wcc_nmi",
    "wcc_count",
    "wcc_sizes_ks",
    "wcc_sizes_p",

    # remaining properties
    "edgeRatio",
    "nodeRatio",
    "numEdges",
    "numNodes",
    "parameter",
    "rt_backbone",
    "rt_attribute",
    "evalExpr",
]

In [4]:
#Prepare empty dataframe
# One column per key field plus one per known property.
# - list.append() returns None, so the column list is built with `+` instead;
#   the previous code passed columns=None and the frame started without columns.
# - The key fields are separate names (not one comma-joined string).
# - The long-format "property" field is intentionally not a column: the fill
#   step below turns each property name into a column of its own.
myColumns = ["algorithm", "graph", "targetEdgeRatio"] + myProperties
df = pd.DataFrame(index=[], columns=myColumns)

#Expand dataframe
# Create one row per distinct (algorithm, graph, targetEdgeRatio) combination.
# Row labels are handed out consecutively starting at 1, and indexMap records
# which label belongs to which combination so the fill step can find it again.
c = con.cursor()
indexMap = {}
currentIndex = 1
c.execute("SELECT algorithm, graph, targetEdgeRatio FROM data GROUP BY algorithm, graph, targetEdgeRatio")
data = c.fetchall()
for cAlgorithm, cGraph, cTER in data:
    keyFields = (("algorithm", cAlgorithm), ("graph", cGraph), ("targetEdgeRatio", cTER))
    for column, cell in keyFields:
        # Assigning to a not-yet-existing label appends the row.
        df.loc[currentIndex, column] = cell
    indexMap[(cAlgorithm, cGraph, cTER)] = currentIndex
    currentIndex += 1

In [42]:
#Fill dataframe
# Walk over the long-format table (one record per property value) and write
# each value into the cell addressed by (row of its key combination, property).
c.execute("SELECT algorithm, graph, targetEdgeRatio, property, value FROM data")
data = c.fetchall()
currentRow = 0
for cAlgorithm, cGraph, cTER, cProperty, cValue in data:
    # Progress message every 10000 records.
    if not currentRow % 10000:
        print("Processing row ", currentRow, "...")
    index = indexMap[(cAlgorithm, cGraph, cTER)]
    df.loc[index, cProperty] = cValue
    currentRow += 1

Processing row  0 ...
Processing row  10000 ...
Processing row  20000 ...
Processing row  30000 ...
Processing row  40000 ...
Processing row  50000 ...
Processing row  60000 ...
Processing row  70000 ...
Processing row  80000 ...
Processing row  90000 ...
Processing row  100000 ...
Processing row  110000 ...
Processing row  120000 ...
Processing row  130000 ...
Processing row  140000 ...
Processing row  150000 ...
Processing row  160000 ...
Processing row  170000 ...
Processing row  180000 ...
Processing row  190000 ...
Processing row  200000 ...
Processing row  210000 ...
Processing row  220000 ...
Processing row  230000 ...
Processing row  240000 ...
Processing row  250000 ...
Processing row  260000 ...
Processing row  270000 ...
Processing row  280000 ...
Processing row  290000 ...
Processing row  300000 ...
Processing row  310000 ...
Processing row  320000 ...
Processing row  330000 ...
Processing row  340000 ...
Processing row  350000 ...
Processing row  360000 ...
Processing row 

In [43]:
# Write the wide table (one column per property) to CSV; the averaging
# section below reloads `df` from this same file.
df.to_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc1_pandas.csv")

###### Averaging...

In [3]:
# Reload the wide table written by the to_csv call above.
# NOTE(review): the file was written with the default index, and no index_col
# is given here, so the saved index presumably comes back as an extra unnamed
# column - confirm whether index_col=0 is wanted.
df = pd.read_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc1_pandas.csv")

In [8]:
# Edge ratios at which aggregate() evaluates every interpolated property.
ters = [0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Algorithm names aggregate() iterates over. "Original" is treated specially:
# getInterpolation() returns a constant for it and aggregate() only keeps its
# value at ratio 1.0.
algorithms = ["Original", "Simmelian Parametric", "Local Similarity", "Simmelian Multiscale", "Random", "Local Degree", "ForestFire", "Degree Multiscale max", "Multiscale"]
# Graph names forming the group aggregated under the name 'all'.
graphGroupAll = ["Karate","USAviation","KitEmail","LFR-1000","PGP","BTER","ErdosRenyi","Jazz","HepTh","HepPh","Epinions","AS","eu-2005","in-2004","test.fiber.small","fb-American75","fb-Caltech36","fb-Haverford76","fb-Santa74","fb-UC33","fb-UMass92","fb-WashU32","fb-_Amherst41","fb-Carnegie49","fb-Howard90","fb-Northeastern19","fb-Simmons81","fb-UC61","fb-UNC28","fb-Wellesley22","fb-Auburn71","fb-Colgate88","fb-Indiana69","fb-Northwestern25","fb-Smith60","fb-UC64","fb-UPenn7","fb-Wesleyan43","fb-Baylor93","fb-Columbia2","fb-JMU79","fb-NYU9","fb-Stanford3","fb-UCF52","fb-USC35","fb-William77","fb-BC17","fb-Cornell5","fb-Lehigh96","fb-Oberlin44","fb-Swarthmore42","fb-UChicago30","fb-USF51","fb-Williams40","fb-Berkeley13","fb-Dartmouth6","fb-Maine59","fb-Oklahoma97","fb-Syracuse56","fb-UCLA26","fb-USFCA72","fb-Wisconsin87","fb-Bingham82","fb-Duke14","fb-Maryland58","fb-Temple83","fb-UConn91","fb-Yale4","fb-Bowdoin47","fb-Emory27","fb-Mich67","fb-Pepperdine86","fb-Tennessee95","fb-UCSB37","fb-Vanderbilt48","fb-Brandeis99","fb-FSU53","fb-Michigan23","fb-Princeton12","fb-Texas80","fb-UCSC68","fb-Vassar85","fb-Brown11","fb-Georgetown15","fb-Middlebury45","fb-Reed98","fb-Texas84","fb-UCSD34","fb-Vermont70","fb-BU10","fb-GWU54","fb-Mississippi66","fb-Rice31","fb-Trinity100","fb-UF21","fb-Villanova62","fb-Bucknell39","fb-Hamilton46","fb-MIT8","fb-Rochester38","fb-Tufts18","fb-UGA50","fb-Virginia63","fb-Cal65","fb-Harvard1","fb-MSU24","fb-Rutgers89","fb-Tulane29","fb-UIllinois20","fb-Wake73"]
# Graph names (all prefixed "fb-") forming the group aggregated under the name 'fb'.
graphGroupFacebook = ["fb-American75","fb-Auburn71","fb-BC17","fb-BU10","fb-Baylor93","fb-Berkeley13","fb-Bingham82","fb-Bowdoin47","fb-Brandeis99","fb-Brown11","fb-Bucknell39","fb-Cal65","fb-Caltech36","fb-Carnegie49","fb-Colgate88","fb-Columbia2","fb-Cornell5","fb-Dartmouth6","fb-Duke14","fb-Emory27","fb-FSU53","fb-GWU54","fb-Georgetown15","fb-Hamilton46","fb-Harvard1","fb-Haverford76","fb-Howard90","fb-Indiana69","fb-JMU79","fb-Lehigh96","fb-MIT8","fb-MSU24","fb-Maine59","fb-Maryland58","fb-Mich67","fb-Michigan23","fb-Middlebury45","fb-Mississippi66","fb-NYU9","fb-Northeastern19","fb-Northwestern25","fb-Oberlin44","fb-Oklahoma97","fb-Pepperdine86","fb-Princeton12","fb-Reed98","fb-Rice31","fb-Rochester38","fb-Rutgers89","fb-Santa74","fb-Simmons81","fb-Smith60","fb-Stanford3","fb-Swarthmore42","fb-Syracuse56","fb-Temple83","fb-Tennessee95","fb-Texas80","fb-Texas84","fb-Trinity100","fb-Tufts18","fb-Tulane29","fb-UC33","fb-UC61","fb-UC64","fb-UCF52","fb-UCLA26","fb-UCSB37","fb-UCSC68","fb-UCSD34","fb-UChicago30","fb-UConn91","fb-UF21","fb-UGA50","fb-UIllinois20","fb-UMass92","fb-UNC28","fb-UPenn7","fb-USC35","fb-USF51","fb-USFCA72","fb-Vanderbilt48","fb-Vassar85","fb-Vermont70","fb-Villanova62","fb-Virginia63","fb-Wake73","fb-WashU32","fb-Wellesley22","fb-Wesleyan43","fb-William77", "fb-Williams40", "fb-Wisconsin87", "fb-Yale4", "fb-_Amherst41"]

In [9]:
# Properties that aggregate() averages across the graphs of a group.
# Each name is listed exactly once ("ccAvgLocal" used to appear both under
# community and under clustering coefficient, so it was aggregated twice).
# NOTE: the "numCommunitiesSim" column is only created by the
# "normalized distance measure" cell further down; that cell has to be run
# before aggregating, otherwise the column lookup in getInterpolation() fails.
averageableProperties = [
#community
"randMeasure", "nmi", "modularity", "numCommunities", "numCommunitiesSim",
#degree distribution
"dd_spearman_rho", "dd_ks_d", "dd_relRankError",
#clustering coefficient
"ccGlobal", "ccAvgLocal", "cc_ks_d", "cc_perDegree_ks_d", "cc_spearman_rho", "cc_normalizedAbsDiff",
#weakly connected components
"wcc_nmi", "wcc_sizes_ks",
#pagerank
"pagerank_spearman_rho",
#diameter
"diameter"]

In [10]:
def getInterpolation(prop, algorithm, graph):
    """Return a function mapping an edge ratio to the value of `prop`.

    The rows of the notebook-level frame `df` for the given `graph` and
    `algorithm` are selected and ordered by their `edgeRatio` column.

    Parameters:
        prop: name of the column of `df` to interpolate.
        algorithm: value to match in the `algorithm` column.
        graph: value to match in the `graph` column.

    Returns:
        A callable taking an edge ratio:
        - no matching rows: a callable that raises ValueError("Missing data")
          when invoked, so callers that skip on ValueError simply ignore it;
        - algorithm 'Original': a callable returning the single stored value
          as a scalar, whatever ratio is asked for;
        - otherwise: a linear scipy `interp1d` over (edgeRatio, value). With
          scipy's default settings it raises ValueError for ratios outside
          the range of the available data.

    Raises:
        ValueError: if a non-'Original' selection has exactly one row (linear
            interpolation needs two), or if 'Original' has more than one row.
    """
    # DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
    selection = df[(df.graph == graph) & (df.algorithm == algorithm)].sort_values("edgeRatio")
    edgeRatios = selection['edgeRatio']
    values = selection[prop]

    def missingData(x):
        # Deferred failure: only raise once somebody actually asks for a value.
        raise ValueError("Missing data")

    # Checked before the 'Original' branch so that a graph without an
    # 'Original' row is skipped like any other missing data instead of
    # failing later with a TypeError on float(<empty Series>).
    if len(edgeRatios) == 0 or len(values) == 0:
        return missingData

    if algorithm == 'Original':
        if len(values) > 1:
            raise ValueError("expected exactly one row for 'Original'.", prop, algorithm, graph)
        # Hand out a scalar rather than the one-element Series.
        originalValue = values.iloc[0]
        return lambda x: originalValue

    if len(edgeRatios) < 2 or len(values) < 2:
        raise ValueError("arrays must have at least 2 entries.", prop,algorithm,graph)

    return interp1d(edgeRatios, values, kind='linear')

In [11]:
def aggregate(graphsToAggregate):
    """Average every averageable property over a group of graphs.

    For each algorithm in `algorithms` and each property in
    `averageableProperties`, one interpolation function per graph is obtained
    from getInterpolation(). Each function is evaluated at every ratio in
    `ters`; evaluations that raise ValueError are left out, and the remaining
    values are averaged with np.average.

    Parameters:
        graphsToAggregate: iterable of graph names forming the group.

    Returns:
        dict mapping (algorithm, ratio) to a dict {property: average}.
        Entries exist only where at least one graph produced a value; for
        'Original' only ratio 1.0 is recorded.
    """
    rows = {}
    for algorithm in algorithms:
        isOriginal = algorithm == 'Original'
        for prop in averageableProperties:
            interpolators = [
                getInterpolation(prop, algorithm, graph)
                for graph in graphsToAggregate
            ]

            for ratio in ters:
                samples = []
                for interpolate in interpolators:
                    try:
                        samples.append(float(interpolate(ratio)))
                    except ValueError:
                        # No usable value for this graph at this ratio.
                        continue

                # Nothing to average, or an 'Original' value away from ratio 1.0.
                if not samples or (isOriginal and ratio != 1.0):
                    continue

                average = np.average(samples)
                rows.setdefault((algorithm, ratio), {})[prop] = average
    return rows

In [12]:
def getRowDictionaries(aggregationResult, groupName):
    """Turn an aggregate() result into row dictionaries for a DataFrame.

    Parameters:
        aggregationResult: dict mapping (algorithm, ratio) to a dict of
            {property: value}, as returned by aggregate().
        groupName: value stored under 'graph' in every produced row.

    Returns:
        A list with one new dict per (algorithm, ratio) entry, holding the
        property values plus 'algorithm', 'targetEdgeRatio' and 'edgeRatio'
        (both set to the ratio) and 'graph'.

    The dictionaries inside `aggregationResult` are copied, not modified, so
    the same result can be converted again (e.g. under another group name)
    without changing rows that were returned earlier.
    """
    dictionaries = []
    for (algorithm, ratio), propertyValues in aggregationResult.items():
        rowDict = dict(propertyValues)  # shallow copy keeps the input untouched
        rowDict['algorithm'] = algorithm
        rowDict['targetEdgeRatio'] = ratio
        rowDict['edgeRatio'] = ratio
        rowDict['graph'] = groupName
        dictionaries.append(rowDict)
    return dictionaries

In [14]:
# Average the properties over the graphs in graphGroupFacebook ...
aggregationResult = aggregate(graphGroupFacebook)
# ... turn the result into rows labelled with graph == 'fb' ...
rowDictionaries = getRowDictionaries(aggregationResult, 'fb')
# ... and append them to the table. Re-running this cell appends the rows again.
df = pd.concat([df, pd.DataFrame(rowDictionaries)])

In [15]:
# Average the properties over the graphs in graphGroupAll ...
aggregationResult = aggregate(graphGroupAll)
# ... turn the result into rows labelled with graph == 'all' ...
rowDictionaries = getRowDictionaries(aggregationResult, 'all')
# ... and append them to the table. Re-running this cell appends the rows again.
df = pd.concat([df, pd.DataFrame(rowDictionaries)])

In [16]:
# Write the current `df` (per-graph rows plus any aggregated rows appended
# above) to the rc3 CSV file.
df.to_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc3_pandas.csv")

## Calculate a normalized distance measure for the number of communities

In [7]:
# Show numCommunitiesSim for one graph/algorithm pair as a spot check.
# The column is assigned by the loop in the cell below, so that cell must
# have been run first.
df.loc[(df.graph == 'fb-Caltech36') & (df.algorithm == 'Local Similarity'), "numCommunitiesSim"]

4970    0.000007
4971    0.000007
4972    0.000529
4973    0.071621
4974    0.123575
4975    0.177769
4976    0.194687
4977    0.255729
4978    0.280067
4979    0.280067
4980    0.306721
4981    0.306721
4982    0.306721
4983    0.367879
Name: numCommunitiesSim, dtype: float64

In [13]:
# numCommunitiesSim = (community count of the graph's 'Original' row) /
#                     (community count of the row), computed per graph.
# Previously the baseline was always read from 'fb-Caltech36' and the whole
# column was overwritten on every iteration, so every graph ended up
# normalized by Caltech's community count.
for graph in graphGroupAll:
    graphRows = df.graph == graph
    originalCounts = df.loc[graphRows & (df.algorithm == 'Original'), "numCommunities"]

    if len(originalCounts) == 0:
        # No 'Original' row for this graph in df: there is no baseline, so
        # its rows are left as they are (NaN if the column is newly created).
        continue
    if len(originalCounts) > 1:
        raise ValueError("expected exactly one 'Original' row.", graph)

    originalCommunityCount = float(originalCounts.iloc[0])

    # Only touch the rows of this graph; rows of other graphs (and aggregated
    # rows such as 'fb' / 'all') keep their values.
    df.loc[graphRows, 'numCommunitiesSim'] = originalCommunityCount / df.loc[graphRows, 'numCommunities']

In [60]:
# Write the current `df`, including the numCommunitiesSim column assigned
# above, to the rc2 CSV file.
# NOTE(review): the export in the averaging section writes to the ..._rc3_...
# file instead - confirm which of the two files downstream steps expect.
df.to_csv("/home/gerd/workspace/NetworKit-glindner/scripts/SparsificationEvaluation/output/backbones_paper_rc2_pandas.csv")