Determine whether to use the GradientBoostingRegressor from sklearn or a neural network.
Evaluate which model gets better results.

In [1]:
import csv
import numpy as np
import scipy
import networkx as nx
import math
import sys
import os
import subprocess
import scipy.stats
import scipy.optimize
import operator
from sklearn.manifold import TSNE
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

sys.path.append("../Python_code") # go to parent dir
from canvas_vis import * 
from analytics_combined import *
from generate_proj_to_remove import *
from project_data_analysis import *
from user_embedding import *
from segmentation import *
from evaluation import *
from nonlinear_regressor import *
import pickle

Using TensorFlow backend.


In [2]:
# Bounds of the part of the canvas that is being looked at.
min_x, max_x = 450, 550
min_y, max_y = 450, 550


In [3]:
# Relative paths of the input data files.
input_file = "../data/sorted_tile_placements_proj.csv"
js_filename = "../data/atlas_complete.json"

# List of projects to remove; it is passed on to graph and ground-truth creation.
projects_to_remove = get_list_of_removed_proj(
    output_filename="../data/proj_to_remove.txt"
)

# Names and descriptions read from the atlas JSON file.
names, descriptions = read_picture_names_and_descriptions(js_filename)

In [None]:
# Prefix for the pickle files this notebook writes and reads
# (graph, ups, features); also passed to compute_edge_weights_multithread.
file_prefix = "10x10"

In [4]:

# Build the graph G and the update list ups for the selected canvas window.
# TODO: How far apart may two vertices be and still be connected (1-4)?
# The third argument (4) is presumably that distance -- confirm against create_graph.
# NOTE(review): file_prefix is hard-coded to "comparison" here, while the
# notebook-level file_prefix is "10x10" -- confirm that this is intentional.
G, ups = create_graph(input_file, projects_to_remove, 4, min_x, max_x, min_y, max_y, file_prefix="comparison")

print("num edges = ", G.n_edges)

num edges =  12274043


In [5]:
# Save G and ups to pickle files named after file_prefix.
# The with-statement closes each file even if pickle.dump raises.
with open(file_prefix + 'graph.pkl', 'wb') as pfile:
    pickle.dump(G, pfile)

with open(file_prefix + 'ups.pkl', 'wb') as pfile:
    pickle.dump(ups, pfile)

In [6]:
# Reload G and ups from the pickle files named after file_prefix.
# The with-statement closes each file even if pickle.load raises.
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by a trusted run of this notebook.
with open(file_prefix + 'graph.pkl', 'rb') as pfile:
    G = pickle.load(pfile)

with open(file_prefix + 'ups.pkl', 'rb') as pfile:
    ups = pickle.load(pfile)

In [7]:
#Defining 7 edge features and computing the information they require
#Adding a new feature without changing the rest of the code should
#be easy.
#TODO: Are there other features that would improve the segmentation?
#TODO: How many dimensions do we need?

def different_color(i, j, ups, data=None):
    """Return 0 if updates i and j have equal colour (field 4), otherwise 1.

    ``data`` is not used; it is accepted so that this function has the same
    signature as the other feature functions.
    """
    return 0 if ups[i][4] == ups[j][4] else 1
        
def distance_space(i, j, ups, data=None):
    """Return the Euclidean distance between updates i and j.

    The x coordinate is read from field 2 and the y coordinate from field 3
    of each update. ``data`` is not used.
    """
    dx = ups[i][2] - ups[j][2]
    dy = ups[i][3] - ups[j][3]

    return np.sqrt(dx ** 2 + dy ** 2)

def distance_time(i, j, ups, data=None):
    """Return the absolute difference between the times of updates i and j.

    The time is read from field 0 of each update; the result is computed as
    the square root of the squared difference. ``data`` is not used.
    """
    delta = ups[i][0] - ups[j][0]

    return np.sqrt(delta ** 2)

def distance_duration(i, j, ups, durations):
    """Return dist_duration applied to the durations of updates i and j.

    ``durations`` is indexed by update index. ``ups`` is not used; it is
    accepted so that this function has the same signature as the other
    feature functions.
    """
    duration_i, duration_j = durations[i], durations[j]
    return dist_duration(duration_i, duration_j)

def distance_color(i, j, ups, conflicts):
    """Return a colour-conflict distance (0, 1 or 2) between updates i and j.

    The result is 0 when both updates have the same colour (field 4).
    Otherwise it counts how many of the following hold:

    * the colour of i appears among the colours of j's conflicting updates;
    * the colour of j appears among the colours of i's conflicting updates.

    ``conflicts[k]`` supplies two indices into ``ups`` (positions 0 and 1).
    An index that is not smaller than ``len(ups)`` does not refer to an
    existing update and is skipped.
    NOTE(review): such an index is presumably a "no conflict" marker --
    confirm against compute_update_conflicts.
    """
    color_i = ups[i][4]
    color_j = ups[j][4]

    if color_i == color_j:
        return 0

    n_ups = len(ups)

    def conflict_colors(k):
        # Strict `<`: index n_ups itself is already out of range for ups,
        # so accepting it (as `<=` did) raised IndexError on the lookup.
        candidates = (conflicts[k][0], conflicts[k][1])
        return [ups[c][4] for c in candidates if c < n_ups]

    conf_i = conflict_colors(i)
    conf_j = conflict_colors(j)

    dist = 0
    if color_i in conf_j:
        dist += 1
    if color_j in conf_i:
        dist += 1

    return dist
    
def distance_user_embedding(i, j, ups, data):
    """Return the norm of the difference between two users' embeddings.

    The user of each update is read from field 1, mapped to a row id through
    ``data['index']`` and looked up in ``data['emb']``.
    """
    index = data['index']
    row_i = index[ups[i][1]]
    row_j = index[ups[j][1]]

    emb = data['emb']
    return np.linalg.norm(emb[row_i] - emb[row_j])

def distance_user_colors(i, j, ups, data):
    """Return one minus the product of two users' rows of ``data['emb']``.

    The user of each update is read from field 1 and mapped to a row id
    through ``data['index']``. Each row is densified with ``todense()``
    before the product of row i with the transpose of row j is taken.
    """
    index = data['index']
    row_id_i = index[ups[i][1]]
    row_id_j = index[ups[j][1]]

    dense_i = data['emb'][row_id_i].todense()
    dense_j = data['emb'][row_id_j].todense()

    product = dense_i * dense_j.T
    return (1. - product)[0, 0]

In [None]:
# Data computed from ups that the feature functions rely on.
conflicts = compute_update_conflicts(ups)
durations = compute_update_durations(ups)
user_color, user_index_color = compute_user_color(ups)

# TODO: We are currently using 40 dimensions, we might need more.
# We also need to understand whether these other parameters matter.
ndim = 40
threshold = 10
total_samples = 200
n_negatives = 5
n_iterations = 10
user_index, emb = embed_users(
    G, ups, ndim, threshold, total_samples, n_negatives, n_iterations
)

# One entry per edge feature: its name, its function and the extra data
# for that function (presumably passed as the function's last argument).
features = [
    {'name': "different_color", 'func': different_color, 'data': None},
    {'name': "distance_space", 'func': distance_space, 'data': None},
    {'name': "distance_time", 'func': distance_time, 'data': None},
    {'name': "distance_duration", 'func': distance_duration, 'data': durations},
    {'name': "distance_color", 'func': distance_color, 'data': conflicts},
    {
        'name': "distance_user_embedding",
        'func': distance_user_embedding,
        'data': {'index': user_index, 'emb': emb},
    },
    {
        'name': "distance_user_colors",
        'func': distance_user_colors,
        'data': {'index': user_index_color, 'emb': user_color},
    },
]

python ../../signet/signet.py -l signet_id.txt -i signet.txt -o signet -d 40 -t 10 -s 200
reddit_place_project
signet



In [None]:
# Save the feature definitions to a pickle file named after file_prefix.
# The with-statement closes the file even if pickle.dump raises.
with open(file_prefix + 'features.pkl', 'wb') as pfile:
    pickle.dump(features, pfile)

In [None]:
# Reload the feature definitions from the pickle file named after file_prefix.
# The with-statement closes the file even if pickle.load raises.
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by a trusted run of this notebook.
with open(file_prefix + 'features.pkl', 'rb') as pfile:
    features = pickle.load(pfile)

In [None]:
locations = store_locations("../data/atlas_complete.json")
folds = create_folds(min_x, min_y, max_x, max_y)

# List of dictionaries containing min_x, max_x, min_y, max_y for each fold
fold_boundaries = [get_fold_border(fold) for fold in folds]

# All edges that belong to the validation fold need to be excluded:
# the training data leaves out fold 0, the validation data leaves out
# folds 1 to 9.
A_train, b_train = build_feat_label_data(
    G.unique_edges_file_name,
    ups,
    features,
    fold_boundaries=fold_boundaries,
    excluded_folds=[0],
)
A_validate, b_validate = build_feat_label_data(
    G.unique_edges_file_name,
    ups,
    features,
    fold_boundaries=fold_boundaries,
    excluded_folds=[1, 2, 3, 4, 5, 6, 7, 8, 9],
)

In [None]:
# Gradient boosting regressor with 25 estimators and a fixed random state,
# fitted on the training data. fit() returns the estimator itself.
model_gboost = GradientBoostingRegressor(random_state=1, n_estimators=25)
model_gboost.fit(A_train, b_train)

In [None]:
# Print the shapes of the training and validation data, one per line.
print(
    *(arr.shape for arr in (A_train, b_train, A_validate, b_validate)),
    sep="\n"
)

In [None]:
# Print the number of folds, then the fold boundaries themselves.
print(len(fold_boundaries), fold_boundaries, sep="\n")

In [None]:
# Create the neural-network regressor; it is given both the training data
# and the validation data.
model_nn = createNonlinearRegressionNeuralNet(A_train, b_train, A_validate, b_validate)

In [None]:
# Ground truth for the canvas window, restricted to the boundaries of fold 0.
ground_truth = create_ground_truth(
    input_file,
    min_x=min_x,
    max_x=max_x,
    min_y=min_y,
    max_y=max_y,
    projects_to_remove=projects_to_remove,
    partial_canvas_boundaries=fold_boundaries[0],
)

In [None]:
kappa = 0.25

# Compute edge weights with the gradient boosting model, then sort the edges.
compute_edge_weights_multithread(
    G, ups, model_gboost, features, 5, file_prefix=file_prefix
)
G.sort_edges()

# Segment into regions and evaluate them against the ground truth.
comp_assign = region_segmentation(G, ups, kappa)
regions, sizes = extract_regions(comp_assign)
(
    num_correct_counter,
    num_assignments_made,
    precision,
    recall,
    region_assignments,
) = evaluate(locations, regions, ups, ground_truth, threshold=0.3, draw=False)

print("Recall:", recall)
print("Precision:", precision)

In [None]:
# Compute edge weights with the neural-network model, then sort the edges.
compute_edge_weights(G, ups, model_nn, features)
G.sort_edges()

# Segment into regions and evaluate them against the ground truth.
comp_assign = region_segmentation(G, ups, kappa)
regions, sizes = extract_regions(comp_assign)
(
    num_correct_counter,
    num_assignments_made,
    precision,
    recall,
    region_assignments,
) = evaluate(locations, regions, ups, ground_truth, threshold=0.3, draw=False)

print("Recall:", recall)
print("Precision:", precision)

In [None]:
# Print recall and precision again; the values are whatever the most
# recently executed evaluation cell assigned.
print("Recall:", recall)
print("Precision:", precision)