In [1]:
import csv
import numpy as np
import scipy
import networkx as nx
import math
import sys
import os
import subprocess
import scipy.stats
import scipy.optimize
import operator
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

sys.path.append("../Python_code") # go to parent dir
from canvas_vis import * 
from analytics_combined import *
from generate_proj_to_remove import *
from project_data_analysis import *
from user_embedding import *
from segmentation import *
from evaluation import *
import pickle

Using TensorFlow backend.


In [2]:
#Determine the size of the canvas that is being looked at
min_x = 0
max_x = 1002
min_y = 0
max_y = 1002

In [3]:
projects_to_remove = get_list_of_removed_proj(output_filename = "../data/proj_to_remove.txt")

input_file= "../data/sorted_tile_placements_proj.csv"
js_filename = "../data/atlas_complete.json"

names, descriptions = read_picture_names_and_descriptions(js_filename)

In [None]:
ground_truth = create_ground_truth(input_file, min_x=min_x, max_x=max_x, min_y=min_y, max_y=max_y, projects_to_remove=projects_to_remove)

In [4]:
# Features

def different_color(i, j, ups, data=None):
    if ups[i][4] == ups[j][4]:
        return 0
    else:
        return 1
        
def distance_space(i, j, ups, data=None):
    xi = ups[i][2]
    yi = ups[i][3]
    xj = ups[j][2]
    yj = ups[j][3]
    
    return np.sqrt(pow(xi-xj,2)+pow(yi-yj,2))

def distance_time(i, j, ups, data=None):
    time_i = ups[i][0]
    time_j = ups[j][0]
    
    return np.sqrt(pow(time_i-time_j,2))

def distance_duration(i, j, ups, durations):
    return dist_duration(durations[i], durations[j])

def distance_color(i, j, ups, conflicts):
    color_i = ups[i][4]
    color_j = ups[j][4]
    
    if color_i == color_j:
        return 0
    else:
        max_up = len(ups)
        dist = 0
        
        conf_i = []
        if conflicts[i][0] <= max_up:
            conf_i.append(ups[conflicts[i][0]][4])
            
        if conflicts[i][1] <= max_up:
            conf_i.append(ups[conflicts[i][1]][4])
        
        conf_j = []
        if conflicts[j][0] <= max_up:
            conf_j.append(ups[conflicts[j][0]][4])
            
        if conflicts[j][1] <= max_up:
            conf_j.append(ups[conflicts[j][1]][4])
        
        if color_i in conf_j:
            dist = dist + 1
            
        if color_j in conf_i:
            dist = dist + 1
        
        return dist
    
def distance_user_embedding(i, j, ups, data):
    user_i = ups[i][1]
    user_j = ups[j][1]
    user_i_id = data['index'][user_i]
    user_j_id = data['index'][user_j]
    
    return np.linalg.norm(data['emb'][user_i_id]-data['emb'][user_j_id])

def distance_user_colors(i, j, ups, data):
    user_i = ups[i][1]
    user_j = ups[j][1]
    user_i_id = data['index'][user_i]
    user_j_id = data['index'][user_j]
    
    return (1.-data['emb'][user_i_id].todense() * data['emb'][user_j_id].todense().T)[0,0]

NOTE: ALL file_prefix strings must NOT contain any underscore '\_' characters

In [5]:
def create_graph_and_updatelist(vertex_connectivity, file_prefix):
    '''
        Either create the Graph and list of updates with the given *vertex_distance* 
        or load them from the pickle file if the already exist
    '''
    G = None
    ups = []
    if not os.path.exists(file_prefix + 'graph.pkl') or not os.path.exists(file_prefix + 'ups.pkl'):
        G, ups = create_graph(input_file, projects_to_remove, vertex_connectivity, min_x, max_x, min_y, max_y, file_prefix=file_prefix)

        print("num edges = ", G.n_edges)
        pfile = open(file_prefix + 'graph.pkl', 'wb')
        pickle.dump(G, pfile)
        pfile.close()

        pfile = open(file_prefix + 'ups.pkl', 'wb')
        pickle.dump(ups, pfile)
        pfile.close()
    else:
        pfile = open(file_prefix + 'graph.pkl', 'rb')
        G = pickle.load(pfile)
        pfile.close()

        pfile = open(file_prefix + 'ups.pkl', 'rb')
        ups = pickle.load(pfile)
        pfile.close()
    
    return G, ups

In [6]:
def create_features(G, ups, file_prefix):
    features = []
    features_filename = file_prefix + 'features.pkl'
    if not os.path.exists(features_filename):

        conflicts = compute_update_conflicts(ups)
        durations = compute_update_durations(ups)
        user_color, user_index_color = compute_user_color(ups)

        #TODO: We are currently using 40 dimensions, we might need more
        # We also need to understand whether these other parameters matter.
        ndim=40
        threshold=10
        total_samples=200
        n_negatives=5
        n_iterations=10
        user_index, emb = embed_users(G, ups, ndim, threshold, total_samples, n_negatives, n_iterations)

        features = [{'name': "different_color", 'func': different_color, 'data': None}, 
            {'name': "distance_space",  'func': distance_space, 'data': None}, 
            {'name': "distance_time", 'func': distance_time, 'data': None}, 
            {'name': "distance_duration", 'func': distance_duration, 'data': durations}, 
            {'name': "distance_color", 'func': distance_color, 'data': conflicts},
            {'name': "distance_user_embedding", 'func': distance_user_embedding, 'data': {'index': user_index, 'emb': emb}},
            {'name': "distance_user_colors", 'func': distance_user_colors, 'data': {'index': user_index_color, 'emb': user_color}}]

        pfile = open(features_filename, 'wb')
        pickle.dump(features, pfile)
        pfile.close()
    else:
        pfile = open(features_filename, 'rb')
        features = pickle.load(pfile)
        pfile.close()
    
    return features

In [7]:
def evaluate_vertex_distance(min = 1, max = 4):
    for vertex_connectivity in range(min, max+1):
        file_prefix = "dist" + str(vertex_connectivity)
        
        G, ups = create_graph_and_updatelist(vertex_connectivity, file_prefix)
        features = create_features(G, ups, file_prefix)
        
        metric_vals = validate_best_model(evaluate, ups, G, features, input_file, projects_to_remove,'recall', ground_truth, min_x, min_y, max_x, max_y, file_prefix=file_prefix, compute_edge_weights = False, load_segmentation = True, load_models = True)
        print(metric_vals)
        print("AVG for vertex distance " , distance,":",(sum(metric_vals)/len(metric_vals)))

In [None]:
evaluate_vertex_distance()

num edges =  67201881
python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 10 -s 200
reddit_place_project
signet
signet.tar.gz

avg pos =  0.9041201576605967 , n =  18622078
avg neg =  1.363618305312599 , n =  9455122
0.23026315789473684
0.29850746268656714
0.30303030303030304
0.27979274611398963
0.4539877300613497
0.23026315789473684
0.29850746268656714
0.30303030303030304
0.27979274611398963
0.4539877300613497
0.2608695652173913
0.2184873949579832
0.30718954248366015
0.3104693140794224
0.21428571428571427
[0.23026315789473684, 0.29850746268656714, 0.30303030303030304, 0.27979274611398963, 0.4539877300613497, 0.23026315789473684, 0.29850746268656714, 0.30303030303030304, 0.27979274611398963, 0.4539877300613497, 0.2608695652173913, 0.2184873949579832, 0.30718954248366015, 0.3104693140794224, 0.21428571428571427]
AVG for vertex distance  <module 'scipy.spatial.distance' from '/cs/student/danielshu/.local/lib/python3.6/site-packages/scipy/spatial/distance.py'> 

In [None]:
def evaluate_kappas(kappas = [0.2,0.5,0.8,1.0,1.3,1.5,1.9]):
    '''
        Evaluate the precision and recall given different values of kappa
        
        This code assumes that evaluate_vertex_distance() has already been called at least once, which
        will create and pickle the different models
    '''
    for kappa in kappas:
        # For this test, we will fix vertex connectivity at 1
        file_prefix = "dist1"
        
        G, ups = create_graph_and_updatelist(1, file_prefix)
        features = create_features(G, ups, file_prefix)
        
        metric_vals = validate_best_model(evaluate, ups, G, features, input_file, projects_to_remove,'recall', ground_truth, min_x, min_y, max_x, max_y, file_prefix=file_prefix, kappa = kappa, compute_edge_weights = False, load_segmentation = True, load_models = True)
        print(metric_vals)
        print("AVG for kappa values " , kappa,":",(sum(metric_vals)/len(metric_vals)))
        

In [None]:
evaluate_kappas()

In [None]:
def evaluate_modeltypes():
    ''' 
        Evaluate which modeltype is better
            modeltype = 0 will use a sklearn.ensemble.GradientBoostingRegressor
            modeltype = 1 will create a keras.models.Sequential neural network
            
    '''
    modeltypes = [0,1]
    for modeltype in modeltypes:
        # For this test, we will fix vertex connectivity at 1 and fix kappa at 0.3
        kappa = 0.3
        file_prefix = "dist1"
        
        G, ups = create_graph_and_updatelist(1, file_prefix)
        features = create_features(G, ups, file_prefix)
        metric_vals = validate_best_model(evaluate, ups, G, features, input_file, projects_to_remove,'recall', ground_truth, min_x, min_y, max_x, max_y, file_prefix=file_prefix, kappa = kappa, compute_edge_weights = True, load_segmentation = True, load_models = True, modeltype = modeltype)
        print(metric_vals)
        print("AVG for kappa values " , kappa,":",(sum(metric_vals)/len(metric_vals)))
        
        

In [None]:
evaluate_modeltypes()