TODOS:

1) Think about new features
2) Setup experiments on salinas
3) 10-fold cross validation
4) Compute "compatibility" between regions for future superpixel clustering

In [1]:
import csv
import numpy as np
import scipy
import networkx as nx
import math
import sys
import os
import subprocess
import scipy.stats
import scipy.optimize
import operator
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

sys.path.append("../Python_code") # go to parent dir
from canvas_vis import * 
from analytics_combined import *
from generate_proj_to_remove import *
from project_data_analysis import *
from user_embedding import *
from segmentation import *
from evaluation import *
import pickle

In [2]:
#Determine the size of the canvas that is being looked at
min_x = 0
max_x = 1002
min_y = 0
max_y = 1002


In [3]:
projects_to_remove = get_list_of_removed_proj(output_filename = "../data/proj_to_remove.txt")

input_file= "../data/sorted_tile_placements_proj.csv"
js_filename = "../data/atlas_complete.json"

names, descriptions = read_picture_names_and_descriptions(js_filename)

In [4]:

#TODO: How far two vertices should be to be connected (1-4)?
G, ups = create_graph(input_file, projects_to_remove, 4, min_x, max_x, min_y, max_y)

print("num edges = ", G.n_edges)

num edges =  525656443


In [5]:
pfile = open('graph.pkl', 'wb')
pickle.dump(G, pfile)
pfile.close()

pfile = open('ups.pkl', 'wb')
pickle.dump(ups, pfile)
pfile.close()

In [4]:
pfile = open('graph.pkl', 'rb')
G = pickle.load(pfile)
pfile.close()

pfile = open('ups.pkl', 'rb')
ups = pickle.load(pfile)
pfile.close()

In [5]:
#Defining 7 edge features and computing the information they require
#Adding a new feature without changing the rest of the code should
#be easy.
#TODO: Are there other features that would improve the segmentation?
#TODO: How many dimensions we need?

def different_color(i, j, ups, data=None):
    if ups[i][4] == ups[j][4]:
        return 0
    else:
        return 1
        
def distance_space(i, j, ups, data=None):
    xi = ups[i][2]
    yi = ups[i][3]
    xj = ups[j][2]
    yj = ups[j][3]
    
    return np.sqrt(pow(xi-xj,2)+pow(yi-yj,2))

def distance_time(i, j, ups, data=None):
    time_i = ups[i][0]
    time_j = ups[j][0]
    
    return np.sqrt(pow(time_i-time_j,2))

def distance_duration(i, j, ups, durations):
    return dist_duration(durations[i], durations[j])

def distance_color(i, j, ups, conflicts):
    color_i = ups[i][4]
    color_j = ups[j][4]
    
    if color_i == color_j:
        return 0
    else:
        max_up = len(ups)
        dist = 0
        
        conf_i = []
        if conflicts[i][0] <= max_up:
            conf_i.append(ups[conflicts[i][0]][4])
            
        if conflicts[i][1] <= max_up:
            conf_i.append(ups[conflicts[i][1]][4])
        
        conf_j = []
        if conflicts[j][0] <= max_up:
            conf_j.append(ups[conflicts[j][0]][4])
            
        if conflicts[j][1] <= max_up:
            conf_j.append(ups[conflicts[j][1]][4])
        
        if color_i in conf_j:
            dist = dist + 1
            
        if color_j in conf_i:
            dist = dist + 1
        
        return dist
    
def distance_user_embedding(i, j, ups, data):
    user_i = ups[i][1]
    user_j = ups[j][1]
    user_i_id = data['index'][user_i]
    user_j_id = data['index'][user_j]
    
    return np.linalg.norm(data['emb'][user_i_id]-data['emb'][user_j_id])

def distance_user_colors(i, j, ups, data):
    user_i = ups[i][1]
    user_j = ups[j][1]
    user_i_id = data['index'][user_i]
    user_j_id = data['index'][user_j]
    
    return (1.-data['emb'][user_i_id].todense() * data['emb'][user_j_id].todense().T)[0,0]

In [8]:
conflicts = compute_update_conflicts(ups)
durations = compute_update_durations(ups)
user_color, user_index_color = compute_user_color(ups)

#TODO: We are currently using 40 dimensions, we might need more
# We also need to understand whether these other parameters matter.
ndim=40
threshold=10
total_samples=200
n_negatives=5
n_iterations=10
user_index, emb = embed_users(G, ups, ndim, threshold, total_samples, n_negatives, n_iterations)

features = [{'name': "different_color", 'func': different_color, 'data': None}, 
    {'name': "distance_space",  'func': distance_space, 'data': None}, 
    {'name': "distance_time", 'func': distance_time, 'data': None}, 
    {'name': "distance_duration", 'func': distance_duration, 'data': durations}, 
    {'name': "distance_color", 'func': distance_color, 'data': conflicts},
    {'name': "distance_user_embedding", 'func': distance_user_embedding, 'data': {'index': user_index, 'emb': emb}},
    {'name': "distance_user_colors", 'func': distance_user_colors, 'data': {'index': user_index_color, 'emb': user_color}}]

python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 10 -s 200
reddit_place_project
signet
signet.tar.gz

avg pos =  0.3015506132026709 , n =  99078239
avg neg =  0.5931958323826662 , n =  8817972


In [9]:
pfile = open('features.pkl', 'wb')
pickle.dump(features, pfile)
pfile.close()

In [6]:
pfile = open('features.pkl', 'rb')
features = pickle.load(pfile)
pfile.close()

In [11]:
emb.shape

(1155655, 40)

In [15]:
avg_distances_pos_neg(emb)

avg pos =  0.7396222798030327 , n =  2041118
avg neg =  1.6404971017330785 , n =  157029


(0.7396222798030327, 1.6404971017330785)

In [None]:
print("test")

python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 5 -s 100
avg pos =  1.1807355951313214 , n =  99078239
avg neg =  1.3348554803394002 , n =  8817972

python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 80 -t 5 -s 100
avg pos =  1.1750889637369661 , n =  99078239
avg neg =  1.331160714429541 , n =  8817972

python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 5 -s 150
avg pos =  0.5868004540189559 , n =  99078239
avg neg =  0.8949797397524855 , n =  8817972

os.system("python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 5 -s 200")

avg pos =  0.30873725836758875 , n =  99078239
avg neg =  0.60138161298838 , n =  8817972

os.system("python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 5 -s 300")

avg pos =  0.28452876787163384 , n =  99078239
avg neg =  0.57960637526801 , n =  8817972

os.system("python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 80 -t 5 -s 300")

avg pos =  0.25392424705396205 , n =  99078239
avg neg =  0.5389355038583881 , n =  8817972

os.system("python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 100 -t 5 -s 300")

avg pos =  0.246320305975872 , n =  99078239
avg neg =  0.529323314020317 , n =  8817972

In [None]:
metric_vals = validate_best_model(evaluate, ups, G, features, input_file, projects_to_remove,'recall', min_x, min_y, max_x, max_y, load_models = True, load_segmentation = True)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=25,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=1, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_sample

In [8]:
print(metric_vals)
print("AVG:",(sum(metric_vals)/len(metric_vals)))

NameError: name 'metric_vals' is not defined

In [None]:
def validate_best_model_wrapper(parameters):

    vertex_lengths = parameters.get("v_lengths")
    projects_to_remove = parameters.get("projects_to_remove")
    input_filename = parameters.get("input_filename")
    min_x = parameters.get("min_x")
    max_x = parameters.get("max_x")
    min_y = parameters.get("min_y")
    max_y = parameters.get("max_y")

    n_dims = parameters.get("n_dims")
    thresholds = parameters.get("thresholds")
    total_samples = parameters.get("total_samples")
    n_negatives = parameters.get("n_negatives")
    n_iterations = parameters.get("n_iterations")

    kappa_vals = parameters["kappas"]
    validation_metrics = parameters("validation_metrics")

    best_score = -1
    best_parameters = None

    for v in vertex_lengths:
        G, ups = create_graph(input_file, projects_to_remove,
                              v, min_x, max_x, min_y, max_y)

        for ndim in n_dims:
            for threshold in thresholds:
                for total_sample in total_samples:
                    for n_negative in n_negatives:
                        for n_iteration in n_iterations:
                            user_index, emb = embed_users(G, ups, ndim, threshold, total_samples, n_negatives, n_iterations)

                            features = [{'name': "different_color", 'func': different_color, 'data': None},
                                        {'name': "distance_space",
                                            'func': distance_space, 'data': None},
                                        {'name': "distance_time",
                                            'func': distance_time, 'data': None},
                                        {'name': "distance_duration",
                                         'func': distance_duration, 'data': durations},
                                        {'name': "distance_color",
                                         'func': distance_color, 'data': conflicts},
                                        {'name': "distance_user_embedding", 'func': distance_user_embedding, 'data': {
                                            'index': user_index, 'emb': emb}},
                                        {'name': "distance_user_colors", 'func': distance_user_colors, 'data': {'index': user_index_color, 'emb': user_color}}]



                            

                            for kappa in kappa_vals:
                                for metric in validation_metrics:
                                    metric_vals = validate_best_model(evaluate, ups, G, features, input_file, projects_to_remove, metric, min_x, min_y, max_x, max_y)

                                    avg_score = sum(metric_vals) / len(metric_vals)
                                    if avg_score > best_score:
                                        best_score = avg_score
                                        best_parameters = { 
                                            "v_length": v, 
                                            "ndim" : ndim, 
                                            "threshold": threshold, 
                                            "total_sample":total_sample, 
                                            "n_negative" : n_negative,
                                            "n_iteration" : n_iteration,
                                            "kappa" : kappa,
                                            "metric" : metric,
                                            "avg_score" : avg_scores
                                            }

    return best_parameters

In [None]:
validation_parameters = {
    "v_lengths" : [1,2,3,4],
    "projects_to_remove" : projects_to_remove,
    "input_filename" : input_filename,
    "min_x" : min_x,
    "max_x" : max_x,
    "min_y" : min_y,
    "max_y" : max_y,
    "n_dims" : [40],
    "thresholds" : [10],
    "total_samples" : [200],
    "n_negatives" : [5],
    "n_iterations" : [10],
    "kappas" : [0.25],
    "validation_metrics" : ['recall']
}
best_model = validate_best_model_wrapper(validation_parameters)


In [7]:
# from sklearn.ensemble import GradientBoostingRegressor

# A,b = build_feat_label_data(G, ups, features)
# model = GradientBoostingRegressor(random_state=1, n_estimators=25).fit(A, b)

In [8]:
pfile = open('model.pkl', 'wb')
pickle.dump(model, pfile)
pfile.close()

In [7]:
pfile = open('model.pkl', 'rb')
model = pickle.load(pfile)
pfile.close()

In [13]:
#Feature statistics

def feature_weight_statistics(G, ups, model, features, projects=None, regions=None):
    '''
    '''
    statistics = {}
    
    region_check = {}
    
    if regions != None:
        for r in regions:
            for u in r:
                region_check[u] = 1
    
    for f in range(len(features)):
        #avg, min, max weights
        sum_weights_in = 0
        min_weight_in = 1e10
        max_weight_in = 0
    
        sum_weights_out = 0
        min_weight_out = 1e10
        max_weight_out = 0
        n_in = 0
        n_out = 0
    
        with open(G.unique_edges_file_name, 'r') as file_in:
            reader = csv.reader(file_in)
    
            for r in reader:
                u = int(r[0])
                v = int(r[1])
                proj_u = ups[u][5]
                proj_v = ups[v][5]
                pixel_u = int(ups[u][6])
                pixel_v = int(ups[v][6])
                
                w = features[f]['func'](u, v, ups, features[f]['data'])
                
                if pixel_u == 1 and pixel_v == 1 and regions is None or (u in region_check and v in region_check):
                    if projects is None or (proj_u in projects and proj_v in projects):
                        if proj_u == proj_v:
                            sum_weights_in = sum_weights_in + w
                            n_in = n_in + 1
            
                            if w < min_weight_in:
                                min_weight_in = w
                
                            if w > max_weight_in:
                                max_weight_in = w
                        else:
                            sum_weights_out = sum_weights_out + w
                            n_out = n_out + 1
            
                            if w < min_weight_out:
                                min_weight_out = w
                
                            if w > max_weight_out:
                                max_weight_out = w                
                
        statistics[features[f]['name']] = {}
        
        statistics[features[f]['name']]['inside avg'] = sum_weights_in / n_in
        statistics[features[f]['name']]['inside min'] = min_weight_in
        statistics[features[f]['name']]['inside max'] = max_weight_in
        statistics[features[f]['name']]['inside count'] =  n_in
    
        statistics[features[f]['name']]['outside avg'] = sum_weights_out / n_out
        statistics[features[f]['name']]['outside min'] = min_weight_out
        statistics[features[f]['name']]['outside max'] = max_weight_out
        statistics[features[f]['name']]['outside count'] =  n_out
    
        statistics[features[f]['name']]['all avg'] = (sum_weights_in+sum_weights_out) / (n_in + n_out)
        statistics[features[f]['name']]['all min'] = min(min_weight_in, min_weight_out)
        statistics[features[f]['name']]['all max'] = max(max_weight_in, max_weight_out)
        statistics[features[f]['name']]['all count'] =  n_in + n_out
        
        
    sum_weights_in = 0
    min_weight_in = 1e10
    max_weight_in = 0
    
    sum_weights_out = 0
    min_weight_out = 1e10
    max_weight_out = 0
    n_in = 0
    n_out = 0
    
    with open(G.unique_edges_file_name, 'r') as file_in:
        reader = csv.reader(file_in)
    
        for r in reader:
            u = int(r[0])
            v = int(r[1])
            proj_u = ups[u][5]
            proj_v = ups[v][5]
            pixel_u = int(ups[u][6])
            pixel_v = int(ups[v][6])
                
            w = compute_weight(int(u), int(v), ups, model, features)
                
            if pixel_u == 1 and pixel_v == 1 and regions is None or (u in region_check and v in region_check):
                if projects is None or (proj_u in projects and proj_v in projects):
                    if proj_u == proj_v:
                        sum_weights_in = sum_weights_in + w
                        n_in = n_in + 1
            
                        if w < min_weight_in:
                            min_weight_in = w
                
                        if w > max_weight_in:
                            max_weight_in = w
                    else:
                        sum_weights_out = sum_weights_out + w
                        n_out = n_out + 1
            
                        if w < min_weight_out:
                            min_weight_out = w
                
                        if w > max_weight_out:
                            max_weight_out = w                
                
    statistics['weight'] = {}
        
    statistics['weight']['inside avg'] = sum_weights_in / n_in
    statistics['weight']['inside min'] = min_weight_in
    statistics['weight']['inside max'] = max_weight_in
    statistics['weight']['inside count'] =  n_in
    
    statistics['weight']['outside avg'] = sum_weights_out / n_out
    statistics['weight']['outside min'] = min_weight_out
    statistics['weight']['outside max'] = max_weight_out
    statistics['weight']['outside count'] =  n_out
    
    statistics['weight']['all avg'] = (sum_weights_in+sum_weights_out) / (n_in + n_out)
    statistics['weight']['all min'] = min(min_weight_in, min_weight_out)
    statistics['weight']['all max'] = max(max_weight_in, max_weight_out)
    statistics['weight']['all count'] =  n_in + n_out
    
    return statistics

In [14]:
feature_weight_statistics(G, ups, model, features, projects=None, regions=None)

{'different_color': {'all avg': 0.5506835397020043,
  'all count': 2948768,
  'all max': 1,
  'all min': 0,
  'inside avg': 0.49820972518183027,
  'inside count': 2435101,
  'inside max': 1,
  'inside min': 0,
  'outside avg': 0.7994420509785523,
  'outside count': 513667,
  'outside max': 1,
  'outside min': 0},
 'distance_color': {'all avg': 0.20388277409413016,
  'all count': 2948768,
  'all max': 2,
  'all min': 0,
  'inside avg': 0.1897297073098816,
  'inside count': 2435101,
  'inside max': 2,
  'inside min': 0,
  'outside avg': 0.27097711163068683,
  'outside count': 513667,
  'outside max': 2,
  'outside min': 0},
 'distance_duration': {'all avg': 0.4178737626877978,
  'all count': 2948768,
  'all max': 1.0,
  'all min': 0.0,
  'inside avg': 0.4081432360968296,
  'inside count': 2435101,
  'inside max': 1.0,
  'inside min': 0.0,
  'outside avg': 0.4640025095845095,
  'outside count': 513667,
  'outside max': 1.0,
  'outside min': 0.0},
 'distance_space': {'all avg': 3.444402414

In [5]:
#Feature importances

for f in range(len(features)):
    print("feature: ", features[f]['name'], model.feature_importances_[f])

feature:  different_color 0.6011497753202893
feature:  distance_space 0.17206012211539523
feature:  distance_time 0.0865242392379522
feature:  distance_duration 0.00021950883112781502
feature:  distance_color 0.00747805169393991
feature:  distance_user_embedding 0.0964976420028267
feature:  distance_user_colors 0.03607066079846882


In [None]:
compute_edge_weights_multithread(G, ups, model, features, 10)

In [21]:
import pickle

def compute_weight_wrapper(param):
    '''
        Simple wrapper for the compute_weight function
    '''    
    #Loading pickled features
    pfile = open('features.pkl', 'rb')
    features = pickle.load(pfile)
    pfile.close()

    return compute_weight(param[0], param[1], param[2], features)

def compute_weight_multithread(edge_buffer, ups, model, n_threads):
    '''
        Computes weights for set of edges in edge_buffer using multithreading
    '''

    #Dividing the work
    edges_per_thread = int(len(edge_buffer) / n_threads)

    edge_parts = []
    for t in range(n_threads):
        edge_parts.append([])

    e = 0
    for e in range(len(edge_buffer)):
        t = e % n_threads
        edge_parts[t].append(edge_buffer[e])

    futures = []

    #Multithreading
    with concurrent.futures.ProcessPoolExecutor(max_workers=n_threads) as executor:
        for t in range(n_threads):
            fut = executor.submit(compute_weight_wrapper, (edge_parts[t], ups, model))
            futures.append(fut)

    W = np.zeros(len(edge_buffer))

    for t in range(n_threads):
        fut = futures[t]
        res = fut.result()
        for e in range(res.shape[0]):
            W[e*n_threads+t] = res[e]

    return W

def compute_edge_weights_multithread(G, ups, model, features, n_threads):
    '''
        Computes weights for edges in the graph using multithreading.
    '''

    if os.path.exists(G.edges_file_name):
        os.remove(G.edges_file_name)
    
    #Pickling feature data
    pfile = open('features.pkl', 'wb')
    pickle.dump(features, pfile)
    pfile.close()
        
    edge_buffer = []

    with open(G.unique_edges_file_name, 'r') as file_in:
        reader = csv.reader(file_in)

        for r in reader:
            u = r[0]
            v = r[1]
            lb = r[2]
            type_edge = int(r[3])

            if type_edge > 0:
                edge_buffer.append((int(u), int(v), lb, type_edge))

                if len(edge_buffer) >= G.buffer_size:

                    W = compute_weight_multithread(edge_buffer, ups, model, n_threads)

                    for e in range(len(edge_buffer)):
                        u = edge_buffer[e][0]
                        v = edge_buffer[e][1]
                        lb = edge_buffer[e][2]
                        type_edge = edge_buffer[e][3]
                        w = W[e]

                        G.set_weight(u, v, lb, type_edge, w)

                    edge_buffer = []

    if len(edge_buffer) > 0:
        W = compute_weight_multithread(edge_buffer, ups, model, features, n_threads)

        for e in range(len(edge_buffer)):
            u = edge_buffer[e][0]
            v = edge_buffer[e][1]
            lb = edge_buffer[e][2]
            type_edge = edge_buffer[e][3]
            w = W[e]

            G.set_weight(u, v, lb, type_edge, w)

    G.flush_weights()

In [22]:
compute_edge_weights_multithread(G, ups, model, features, 5)

Running process:  45252
Running process:  45259
Running process:  45269
Running process:  45273
Running process:  45276
Running process:  4083
Running process:  4085
Running process:  4086
Running process:  4088
Running process:  4089
Running process:  8106
Running process:  8109
Running process:  8111
Running process:  8114
Running process:  8116
Running process:  12167
Running process:  12170
Running process:  12173
Running process:  12175
Running process:  12178
Running process:  16390
Running process:  16393
Running process:  16395
Running process:  16397
Running process:  16401
Running process:  20770
Running process:  20773
Running process:  20776
Running process:  20778
Running process:  20781
Running process:  25013
Running process:  25015
Running process:  25018
Running process:  25021
Running process:  25024
Running process:  29737
Running process:  29740
Running process:  29742
Running process:  29745
Running process:  29747
Running process:  34488
Running process:  34491
Ru

TypeError: compute_weight_multithread() takes 4 positional arguments but 5 were given

In [23]:
G.sort_edges()

In [24]:
comp_assign = region_segmentation(G, ups, .25)
regions, sizes = extract_regions(comp_assign)

print("num regions = ", len(regions), " max size region = ", np.max(sizes))

num regions =  1699614  max size region =  17405


In [None]:
# Find the percentage that each ground truth image overlaps with the region
locations = store_locations("../data/atlas_complete.json")
ground_truth = create_ground_truth(input_file,0, sys.maxsize, min_x, max_x, min_y, max_y)

In [None]:
overlap_statistics = compute_overlap_area(locations, regions[0], ups, ground_truth)

In [2]:
G

NameError: name 'G' is not defined

In [1]:
#Keeping only final pixels from the regions

pixel_assign = assign_pixels(comp_assign, ups)
pixel_regions, pixel_sizes = extract_regions(pixel_assign)

NameError: name 'assign_pixels' is not defined

In [69]:
draw_top_regions(pixel_regions, pixel_sizes, 10)

In [68]:
draw_region(ups, extract_region(comp_assign, 1017523), "../plots/region.svg")

In [41]:
js_filename = "../data/atlas_complete.json"

names, descriptions = read_picture_names_and_descriptions(js_filename)


def region_statistics(G, region, ups):
    '''
        Prints a few region statistics.
    '''
    #avg, min, max weights
    sum_weights_in = 0
    min_weight_in = 1e10
    max_weight_in = 0
    
    sum_weights_out = 0
    min_weight_out = 1e10
    max_weight_out = 0
    n_in = 0
    n_out = 0
    
    n_edges = 0
    region_check = {}
    
    for u in region:
        region_check[u] = 1
    
    with open(G.sorted_edges_file_name, 'r') as file_in:
        reader = csv.reader(file_in)
    
        for r in reader:
            u = int(r[0])
            v = int(r[1])
            w = float(r[4])
            
            if u in region_check and v in region_check:
                n_edges = n_edges + 1
                sum_weights_in = sum_weights_in + w
                n_in = n_in + 1
            
                if w < min_weight_in:
                    min_weight_in = w
                
                if w > max_weight_in:
                    max_weight_in = w
                    
            elif u in region_check or v in region_check:
                sum_weights_out = sum_weights_out + w
                n_out = n_out + 1
            
                if w < min_weight_out:
                    min_weight_out = w
                
                if w > max_weight_out:
                    max_weight_out = w
    
    color_dist = np.zeros(16)
    n_pixel = 0
    n_pixel_color = 0
    min_time = sys.maxsize
    max_time = 0
    projs = {}
    for u in region:
        if ups[u][6] == 1:
            n_pixel = n_pixel + 1
            if ups[u][5] not in projs:
                projs[ups[u][5]] = 0
            
            projs[ups[u][5]] = projs[ups[u][5]] + 1
        if ups[u][7] == 1:
            n_pixel_color = n_pixel_color + 1
            
        if ups[u][0] > max_time:
            max_time = ups[u][0]
            
        if ups[u][0] < min_time:
            min_time = ups[u][0]
        
        color_dist[int(ups[u][4])] = color_dist[int(ups[u][4])] + 1
    
    print("num updates = ", len(region))
    print("num_edges = ", n_edges)
    print("num pixels = ", n_pixel, " (", 100 * n_pixel / len(region), "%)")
    print("num_pixel_colors = ", n_pixel_color, " (", 100 * n_pixel / len(region), "%)")
    
    print()
    
    sorted_projs = sorted(projs.items(), key=operator.itemgetter(1), reverse=True)
    
    for i in range(len(sorted_projs)):
        print("proj ", names[int(sorted_projs[i][0])], " #final updates = ", sorted_projs[i][1], " (",
              100 * sorted_projs[i][1] / n_pixel, "% of final, ", 100 * sorted_projs[i][1] / len(region), " of total)")
    
    print()
    
    print("colors: ", list(color_dist))
    
    print()
    
    print("duration: ", min_time, " - ", max_time, " (", (max_time-min_time)/ 1000, " seconds)")
        
    print()
    
    print("avg weight inside = ",  sum_weights_in / n_in)
    print ("min weight inside = ", min_weight_in)
    print("max weight inside = ", max_weight_in)
    print("#edges inside = ", n_in)
    
    print()
    
    print("avg weight outside = ",  sum_weights_out / n_out)
    print ("min weight outside = ", min_weight_out)
    print("max weight outside = ", max_weight_out)
    print("#edges outside = ", n_out)

In [77]:
region_statistics(G, extract_region(comp_assign, 1017523), ups)

num updates =  3497
num_edges =  34238
num pixels =  333  ( 9.522447812410638 %)
num_pixel_colors =  1647  ( 9.522447812410638 %)

proj  Rickroll QR code  #final updates =  161  ( 48.348348348348345 % of final,  4.603946239633972  of total)
proj  Meat Boy  #final updates =  145  ( 43.54354354354354 % of final,  4.146411209608235  of total)
proj  RuneScape disconnected message  #final updates =  27  ( 8.108108108108109 % of final,  0.77209036316843  of total)

colors:  [651.0, 3.0, 0.0, 2624.0, 5.0, 183.0, 3.0, 10.0, 9.0, 0.0, 5.0, 0.0, 0.0, 4.0, 0.0, 0.0]

duration:  1491075060000  -  1491238585000  ( 163525.0  seconds)

avg weight inside =  0.07799586747581512
min weight inside =  0.0101650354992
max weight inside =  0.609298714302
#edges inside =  34238

avg weight outside =  0.1759484575053703
min weight outside =  0.012011047842
max weight outside =  0.623526170427
#edges outside =  195866


In [75]:
def project_statistics(G, comp_assign, ups, proj):
    '''
        Prints a few region statistics.
    '''
    #avg, min, max weights
    sum_weights_in = 0
    min_weight_in = 1e10
    max_weight_in = 0
    
    sum_weights_out = 0
    min_weight_out = 1e10
    max_weight_out = 0
    n_in = 0
    n_out = 0
    
    n_edges = 0
    regions = {}
    G_proj = nx.Graph()
    
    with open(G.sorted_edges_file_name, 'r') as file_in:
        reader = csv.reader(file_in)
    
        for r in reader:
            u = int(r[0])
            v = int(r[1])
            w = float(r[4])
            
            if ups[u][6] == 1 and ups[v][6] == 1:
                if ups[u][5] == proj and ups[v][5] == proj:
                    G_proj.add_edge(u,v)
                    n_edges = n_edges + 1
                    sum_weights_in = sum_weights_in + w
                    n_in = n_in + 1

                    if w < min_weight_in:
                        min_weight_in = w

                    if w > max_weight_in:
                        max_weight_in = w

            elif (ups[u][6] == 1 and ups[u][5] == proj) or (ups[u][6] == 1 and ups[v][5] == proj):
                sum_weights_out = sum_weights_out + w
                n_out = n_out + 1

                if w < min_weight_out:
                    min_weight_out = w

                if w > max_weight_out:
                    max_weight_out = w
                    
    size_proj = 0
    for u in range(len(ups)):
        if ups[u][6] == 1 and ups[u][5] == proj:
            size_proj = size_proj + 1
            if comp_assign[u] not in regions:
                regions[comp_assign[u]] = 0
                
            regions[comp_assign[u]] = regions[comp_assign[u]] + 1
            
    region_sizes = {}
    
    for r in regions:
        region_sizes[r] = 0
        
    for u in range(len(ups)):
        if comp_assign[u] in regions and ups[u][6] == 1:
            region_sizes[comp_assign[u]] = region_sizes[comp_assign[u]] + 1
    
    sorted_regions = sorted(regions.items(), key=operator.itemgetter(1), reverse=True)
    
    for i in range(len(sorted_regions)):
        print("region ", sorted_regions[i][0], " #updates = ", sorted_regions[i][1], " (",
              100 * sorted_regions[i][1] / size_proj, "% of project, ", 
              100 * sorted_regions[i][1] / region_sizes[sorted_regions[i][0]], "% of region)")
    
    print()
    
    print("avg weight inside = ",  sum_weights_in / n_in)
    print ("min weight inside = ", min_weight_in)
    print("max weight inside = ", max_weight_in)
    print("#edges inside = ", n_in)
    
    print()
    
    print("avg weight outside = ",  sum_weights_out / n_out)
    print ("min weight outside = ", min_weight_out)
    print("max weight outside = ", max_weight_out)
    print("#edges outside = ", n_out)
    
    print("Graph connected : ", nx.is_connected(G_proj))
    print("Largest connected component: ", 100 * max(nx.connected_component_subgraphs(G_proj), key=len).number_of_nodes() / size_proj, "%")

In [76]:
 project_statistics(G, comp_assign, ups, '339')

region  1017523  #updates =  145  ( 33.25688073394495 % of project,  43.54354354354354 % of region)
region  889685  #updates =  142  ( 32.56880733944954 % of project,  94.03973509933775 % of region)
region  992676  #updates =  68  ( 15.596330275229358 % of project,  31.48148148148148 % of region)
region  1080977  #updates =  48  ( 11.009174311926605 % of project,  97.95918367346938 % of region)
region  1020554  #updates =  7  ( 1.6055045871559632 % of project,  100.0 % of region)
region  875891  #updates =  4  ( 0.9174311926605505 % of project,  100.0 % of region)
region  1022053  #updates =  3  ( 0.6880733944954128 % of project,  100.0 % of region)
region  960835  #updates =  2  ( 0.45871559633027525 % of project,  100.0 % of region)
region  1046798  #updates =  2  ( 0.45871559633027525 % of project,  100.0 % of region)
region  1029455  #updates =  2  ( 0.45871559633027525 % of project,  100.0 % of region)
region  1090120  #updates =  2  ( 0.45871559633027525 % of project,  100.0 % of

In [None]:
def weight_statistics(G, ups, projs):
    '''
        Computes weight statistics for the weights within 
        and across projects.
    '''
    
    sum_in_final = 0
    sum_out_final = 0
    n_in_final = 0
    n_out_final = 0
    
    
    sum_in_color = 0
    sum_out_color = 0
    n_in_color = 0
    n_out_color = 0
    
    with open(G.sorted_edges_file_name, 'r') as file_in:
        reader = csv.reader(file_in)
    
        for r in reader:
            u = int(r[0])
            v = int(r[1])
            w = float(r[4])
            type_edge = int(r[3])
            proj_u = ups[u][5]
            proj_v = ups[v][5]
            
            if type_edge > 0:
                if projs is None or (proj_u in projs and proj_v in projs):
                    if ups[u][6] == 1 and ups[v][6] == 1:
                        if ups[u][5] == ups[v][5]:
                            sum_in_final = sum_in_final + w
                            n_in_final = n_in_final + 1
                        else:
                            sum_out_final = sum_out_final + w
                            n_out_final = n_out_final + 1

                    if ups[u][7] == 1 and ups[v][7] == 1:
                        if ups[u][5] == ups[v][5]:
                            sum_in_color = sum_in_color + w
                            n_in_color = n_in_color + 1
                        else:
                            sum_out_color = sum_out_color + w
                            n_out_color = n_out_color + 1
                            
                    
                    
    print("avg weight inside projects (pixel) = ", sum_in_final / n_in_final)
    print("avg weight outside projects (pixel) = ", sum_out_final / n_out_final)
    print("avg weight (pixel) = ", (sum_in_final + sum_out_final) / (n_in_final + n_out_final) )
    
    print()
    
    print("avg weight inside projects (color) = ", sum_in_color / n_in_color)
    print("avg weight outside projects (color) = ", sum_out_color / n_out_color)
    print("avg weight (color) = ", (sum_in_color + sum_out_color) / (n_in_color + n_out_color) )
    

In [None]:
weight_statistics(G, ups, ['339', '241'])

In [None]:
G_proj, node_colors, edge_colors = proj_graph(G, ups, ['339', '241'], True)

In [None]:
#Computing the training and test MSE of the regression method,
#Something similar can be done in the cross validation.

train_perc = 0.5
n_samples = A.shape[0]
n_samples_train = int(n_samples * train_perc)

shuff = np.arange(n_samples)
np.random.shuffle(shuff)

A = A[shuff]
b = b[shuff]

m = learn_model(A[0:n_samples_train,:],b[0:n_samples_train], 'gb')

print("training MSE = ", ((b[:n_samples_train]- m.predict(A[0:n_samples_train,:]))**2).mean(axis=0) )

print("test MSE = ", ((b[n_samples_train:]- m.predict(A[n_samples_train:,:]))**2).mean(axis=0) )