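Determine whether to use scikit-learn's GradientBoostingRegressor or a neural network as the model that computes the edge weights.
Evaluate which of the two models yields better results (precision and recall of the resulting segmentation).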

In [1]:
import csv
import numpy as np
import scipy
import networkx as nx
import math
import sys
import os
import subprocess
import scipy.stats
import scipy.optimize
import operator
from sklearn.manifold import TSNE
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

sys.path.append("../Python_code") # go to parent dir
from canvas_vis import * 
from analytics_combined import *
from generate_proj_to_remove import *
from project_data_analysis import *
from user_embedding import *
from segmentation import *
from evaluation import *
from nonlinear_regressor import *
import pickle

Using TensorFlow backend.


In [2]:
# Bounds of the canvas region that is analysed in this notebook
min_x, max_x = 0, 10
min_y, max_y = 0, 10


In [3]:
# Input files: the tile placement CSV and the atlas JSON
input_file = "../data/sorted_tile_placements_proj.csv"
js_filename = "../data/atlas_complete.json"

# Projects to remove; this list is passed to create_graph and
# create_ground_truth further down.
projects_to_remove = get_list_of_removed_proj(
    output_filename="../data/proj_to_remove.txt"
)

# Picture names and descriptions read from the atlas file
names, descriptions = read_picture_names_and_descriptions(js_filename)

In [4]:

#TODO: How far apart may two vertices be and still be connected (1-4)?
G, ups = create_graph(
    input_file,
    projects_to_remove,
    4,  # presumably the connection distance the TODO refers to -- confirm against create_graph
    min_x,
    max_x,
    min_y,
    max_y,
    file_prefix="comparison",
)

print("num edges = ", G.n_edges)

num edges =  326217


In [5]:
# Cache the graph and the updates on disk so they can be reloaded later.
# The `with` blocks close each file even if pickle.dump raises, which the
# previous open()/close() pairs did not guarantee.
with open('graph_10x10.pkl', 'wb') as pfile:
    pickle.dump(G, pfile)

with open('ups_10x10.pkl', 'wb') as pfile:
    pickle.dump(ups, pfile)

In [6]:
# Reload the cached graph and updates.
# The `with` blocks close each file even if pickle.load raises.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open('graph_10x10.pkl', 'rb') as pfile:
    G = pickle.load(pfile)

with open('ups_10x10.pkl', 'rb') as pfile:
    ups = pickle.load(pfile)

In [7]:
# Define the 7 edge features; the data they require is computed separately.
# Adding a new feature should be easy and should not require changes to
# the rest of the code.
# TODO: Are there other features that would improve the segmentation?
# TODO: How many dimensions do we need?

def different_color(i, j, ups, data=None):
    """Binary color-mismatch feature for the pair of updates (i, j).

    Compares field 4 (the color) of ups[i] and ups[j].

    Returns:
        0 if the two colors compare equal, 1 otherwise.

    `data` is not used by this feature.
    """
    same_color = ups[i][4] == ups[j][4]
    return 0 if same_color else 1
        
def distance_space(i, j, ups, data=None):
    """Euclidean distance between the positions of updates i and j.

    Fields 2 and 3 of an update are read as its x and y coordinates.
    `data` is not used by this feature.
    """
    dx = ups[i][2] - ups[j][2]
    dy = ups[i][3] - ups[j][3]
    return np.sqrt(dx ** 2 + dy ** 2)

def distance_time(i, j, ups, data=None):
    """Absolute time difference between updates i and j.

    Field 0 of an update is read as its time. The result is the square root
    of the squared difference, so it is non-negative and comes back as a
    NumPy float. `data` is not used by this feature.
    """
    delta = ups[i][0] - ups[j][0]
    return np.sqrt(delta ** 2)

def distance_duration(i, j, ups, durations):
    """Distance between the durations of updates i and j.

    Looks up durations[i] and durations[j] and delegates the comparison to
    dist_duration. `ups` is not used by this feature.
    """
    duration_i, duration_j = durations[i], durations[j]
    return dist_duration(duration_i, duration_j)

def distance_color(i, j, ups, conflicts):
    """Color distance between updates i and j based on their conflicting updates.

    Args:
        i, j: indices into `ups`.
        ups: sequence of updates; field 4 of an update is its color.
        conflicts: conflicts[k][0] and conflicts[k][1] are indices into `ups`
            of the updates that update k conflicts with. An entry that is not
            smaller than len(ups) is treated as "no conflict" and skipped.

    Returns:
        0 if both updates have the same color. Otherwise the number (0, 1 or 2)
        of these statements that hold: the color of i occurs among the colors
        of j's conflicting updates; the color of j occurs among the colors of
        i's conflicting updates.
    """
    color_i = ups[i][4]
    color_j = ups[j][4]

    if color_i == color_j:
        return 0

    n_ups = len(ups)

    def conflict_colors(k):
        # Valid indices are 0 .. n_ups - 1. The earlier `<= len(ups)` check let
        # an index equal to len(ups) through, which raised IndexError.
        first, second = conflicts[k][0], conflicts[k][1]
        return [ups[c][4] for c in (first, second) if c < n_ups]

    conf_i = conflict_colors(i)
    conf_j = conflict_colors(j)

    return int(color_i in conf_j) + int(color_j in conf_i)
    
def distance_user_embedding(i, j, ups, data):
    """Norm of the difference between the embeddings of the users behind updates i and j.

    Field 1 of an update is its user. data['index'] maps a user to a row
    of data['emb'], and the result is np.linalg.norm of the difference between
    the two rows.
    """
    index = data['index']
    row_i, row_j = index[ups[i][1]], index[ups[j][1]]
    emb = data['emb']
    return np.linalg.norm(emb[row_i] - emb[row_j])

def distance_user_colors(i, j, ups, data):
    """One minus the product of the color rows of the users behind updates i and j.

    Field 1 of an update is its user. data['index'] maps a user to a row
    of data['emb']. Each row is densified with .todense() and the first row is
    multiplied by the transpose of the second.
    # presumably the rows are normalised per-user color profiles stored in a
    # SciPy sparse matrix, making this a cosine-style distance -- confirm
    # against compute_user_color
    """
    index = data['index']
    row_i, row_j = index[ups[i][1]], index[ups[j][1]]
    dense_i = data['emb'][row_i].todense()
    dense_j = data['emb'][row_j].todense()
    return (1. - dense_i * dense_j.T)[0, 0]

In [8]:
# Side information that some of the feature functions need
conflicts = compute_update_conflicts(ups)
durations = compute_update_durations(ups)
user_color, user_index_color = compute_user_color(ups)

# Settings passed to embed_users.
#TODO: 40 dimensions are used at the moment; more might be needed.
# It is also still unclear whether the other parameters matter.
ndim = 40
threshold = 10
total_samples = 200
n_negatives = 5
n_iterations = 10
user_index, emb = embed_users(
    G, ups, ndim, threshold, total_samples, n_negatives, n_iterations
)

# Data bundles for the two user-based features
embedding_data = {'index': user_index, 'emb': emb}
color_data = {'index': user_index_color, 'emb': user_color}

# One entry per edge feature: its name, the function computing it, and the
# extra data for that function (presumably passed as its fourth argument --
# confirm in build_feat_label_data).
features = [
    {'name': "different_color", 'func': different_color, 'data': None},
    {'name': "distance_space", 'func': distance_space, 'data': None},
    {'name': "distance_time", 'func': distance_time, 'data': None},
    {'name': "distance_duration", 'func': distance_duration, 'data': durations},
    {'name': "distance_color", 'func': distance_color, 'data': conflicts},
    {'name': "distance_user_embedding", 'func': distance_user_embedding, 'data': embedding_data},
    {'name': "distance_user_colors", 'func': distance_user_colors, 'data': color_data},
]

python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 10 -s 200
reddit_place_project
signet

avg pos =  0.5944419427703248 , n =  110532
avg neg =  1.7963627534854916 , n =  21269


In [9]:
# Cache the feature table on disk.
# The `with` block closes the file even if pickle.dump raises, which the
# previous open()/close() pair did not guarantee.
with open('features_10x10.pkl', 'wb') as pfile:
    pickle.dump(features, pfile)

In [10]:
# Reload the cached feature table.
# The `with` block closes the file even if pickle.load raises.
# The entries hold references to the feature functions; pickle stores functions
# by name, so those functions must already be defined in the kernel.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('features_10x10.pkl', 'rb') as pfile:
    features = pickle.load(pfile)

In [11]:
# Read the locations from the atlas file. js_filename already holds the atlas
# path, so it is reused here instead of repeating the string literal.
locations = store_locations(js_filename)

# All edges that belong to the validation fold need to be excluded
# NOTE(review): nothing in this cell filters edges; presumably that happens
# inside build_feat_label_data or is still to be done -- confirm.
# A and b are the inputs and targets used to fit both regressors.
A, b = build_feat_label_data(G.unique_edges_file_name, ups, features)

In [12]:
# Gradient-boosted regressor fitted on A and b. random_state is fixed so the
# fit is repeatable; fit() returns the fitted estimator.
model_gboost = GradientBoostingRegressor(
    n_estimators=25,
    random_state=1,
).fit(A, b)

In [13]:
# Build the neural-network regressor from the same A and b that the
# gradient-boosting model was fitted on.
# NOTE(review): the recorded training log shows the loss jumping across many
# orders of magnitude between epochs while "accuracy" stays near 0.78.
# Presumably the inputs are unscaled and accuracy is not a meaningful metric
# for a regressor -- confirm in nonlinear_regressor.
model_nn = createNonlinearRegressionNeuralNet(A, b)

  model.fit(A, b, nb_epoch=256, batch_size=2, verbose=2)


Epoch 1/256
 - 84s - loss: 1781518780.1014 - accuracy: 0.5606
Epoch 2/256
 - 85s - loss: 10162.5228 - accuracy: 0.7775
Epoch 3/256
 - 86s - loss: 1608.7840 - accuracy: 0.7776
Epoch 4/256
 - 88s - loss: 8.6692 - accuracy: 0.7775
Epoch 5/256
 - 88s - loss: 361918.3903 - accuracy: 0.7775
Epoch 6/256
 - 88s - loss: 708.3751 - accuracy: 0.7774
Epoch 7/256
 - 87s - loss: 14361.6544 - accuracy: 0.7775
Epoch 8/256
 - 86s - loss: 805.8546 - accuracy: 0.7775
Epoch 9/256
 - 85s - loss: 4.5093 - accuracy: 0.7777
Epoch 10/256
 - 86s - loss: 31305.7205 - accuracy: 0.7776
Epoch 11/256
 - 86s - loss: 182.0996 - accuracy: 0.7775
Epoch 12/256
 - 86s - loss: 80.6600 - accuracy: 0.7777
Epoch 13/256
 - 85s - loss: 240245.0437 - accuracy: 0.7776
Epoch 14/256
 - 85s - loss: 10972.3147 - accuracy: 0.7776
Epoch 15/256
 - 85s - loss: 903.2094 - accuracy: 0.7778
Epoch 16/256
 - 86s - loss: 6864819.8077 - accuracy: 0.7776
Epoch 17/256
 - 86s - loss: 52494.1808 - accuracy: 0.7777
Epoch 18/256
 - 86s - loss: 120873

Epoch 147/256
 - 101s - loss: 0.1780 - accuracy: 0.7779
Epoch 148/256
 - 81s - loss: 211.4207 - accuracy: 0.7779
Epoch 149/256
 - 103s - loss: 0.1730 - accuracy: 0.7779
Epoch 150/256
 - 83s - loss: 0.1728 - accuracy: 0.7779
Epoch 151/256
 - 84s - loss: 99.9511 - accuracy: 0.7779
Epoch 152/256
 - 82s - loss: 11.7073 - accuracy: 0.7780
Epoch 153/256
 - 82s - loss: 0.1728 - accuracy: 0.7778
Epoch 154/256
 - 85s - loss: 0.1728 - accuracy: 0.7779
Epoch 155/256
 - 85s - loss: 0.1727 - accuracy: 0.7779
Epoch 156/256
 - 86s - loss: 0.1728 - accuracy: 0.7779
Epoch 157/256
 - 86s - loss: 0.1728 - accuracy: 0.7779
Epoch 158/256
 - 85s - loss: 0.1728 - accuracy: 0.7779
Epoch 159/256
 - 86s - loss: 0.1728 - accuracy: 0.7779
Epoch 160/256
 - 85s - loss: 0.1728 - accuracy: 0.7778
Epoch 161/256
 - 85s - loss: 0.1728 - accuracy: 0.7779
Epoch 162/256
 - 85s - loss: 0.1728 - accuracy: 0.7779
Epoch 163/256
 - 85s - loss: 0.1728 - accuracy: 0.7779
Epoch 164/256
 - 85s - loss: 0.1728 - accuracy: 0.7779
Epoc

In [17]:
# Segmentation parameter handed to region_segmentation
kappa = 0.25

# Weight the edges of G with the gradient-boosting model, then sort them.
# The trailing 5 is presumably the number of worker threads -- confirm.
compute_edge_weights_multithread(G, ups, model_gboost, features, 5)
G.sort_edges()

# Segment the graph into regions and compare them with the ground truth.
comp_assign = region_segmentation(G, ups, kappa)
regions, sizes = extract_regions(comp_assign)
ground_truth = create_ground_truth(
    input_file,
    min_x=min_x,
    max_x=max_x,
    min_y=min_y,
    max_y=max_y,
    projects_to_remove=projects_to_remove,
)
(
    num_correct_counter,
    num_assignments_made,
    precision,
    recall,
    region_assignments,
) = evaluate(locations, regions, ups, ground_truth, threshold=0.3, draw=False)

# NOTE(review): the recorded run printed recall 1.0 with precision ~0.0002,
# which looks degenerate -- worth checking before comparing the models.
print("Recall:", recall)
print("Precision:", precision)

Recall: 1.0
Precision: 0.0002232142857142857


In [None]:
# Same evaluation as for the gradient-boosting model, but with the edge
# weights recomputed from the neural network. kappa must already be defined.
# The trailing 5 is presumably the number of worker threads -- confirm.
compute_edge_weights_multithread(G, ups, model_nn, features, 5)
G.sort_edges()

# Segment the graph into regions and compare them with the ground truth.
comp_assign = region_segmentation(G, ups, kappa)
regions, sizes = extract_regions(comp_assign)
ground_truth = create_ground_truth(
    input_file,
    min_x=min_x,
    max_x=max_x,
    min_y=min_y,
    max_y=max_y,
    projects_to_remove=projects_to_remove,
)
(
    num_correct_counter,
    num_assignments_made,
    precision,
    recall,
    region_assignments,
) = evaluate(locations, regions, ups, ground_truth, threshold=0.3, draw=False)

print("Recall:", recall)
print("Precision:", precision)

In [None]:
# Re-print the metrics currently held in the kernel, i.e. those produced by
# whichever evaluation cell ran last.
print("Recall:", recall)
print("Precision:", precision)