# Inspect the trained embedding

In [77]:
from gzip import open as gopen

import gensim.models.poincare as poincare
import numpy as np
import pandas as pd
from bokeh import colors
from bokeh.io import output_file, output_notebook, show
from bokeh.models import HoverTool
from bokeh.plotting import figure
from bokeh.transform import linear_cmap
from bokeh.util.hex import hexbin

## Load trained model

In [79]:
# Restore the previously trained Poincaré embedding from a file in the working directory.
# NOTE(review): the ".pkl" extension suggests a pickle-based format — only load
# files that come from a trusted source.
model = poincare.PoincareModel.load("amazon_purchases_embedding.pkl")

## Load ground truth product communities

In [91]:
# Gzipped ground-truth file: each parsed line is a tab-separated list of node ids.
communities_truth = "../input/amazon_purchases/com-amazon.top5000.cmty.txt.gz"

communities = []

with gopen(communities_truth, mode='rb') as gin:
    for line_no, raw_line in enumerate(gin):
        text = raw_line.decode('ascii')
        # Only lines that start with a digit are parsed as a community.
        if text[0].isnumeric():
            communities.append([int(field) for field in text.split("\t")])
        # Echo the first 11 lines (indices 0-10) as a quick look at the file format.
        if line_no <= 10:
            print(text.rstrip("\n"))

# Largest communities first.
communities.sort(key=len, reverse=True)

164985	225214	232761
105089	282815	331787
167906	288975	421279	442612	451312
69880	231953	518997	523128
135499	160294	304770
112813	112814	112821	112823
112813	112814	112821	112823
199372	399560	447268	471226	522928
179001	391697	412528
21166	207188	405926	531532	540207
118948	191846	209822	455700	482725


### Retrieve the representation of the ground truth communities

More details on the communities definition can be found [here](https://arxiv.org/abs/1205.6233).

In [92]:
# One stacked array per community, one row per member node.
# Node ids are integers in `communities`; they are converted to strings for the lookup.
communities_xy = [
    np.vstack([model.kv.get_vector(str(node_id)) for node_id in members])
    for members in communities
]




## Let's now start visualizing the embedding

In [93]:
# initialize bokeh output so that figures are displayed in the notebook
output_notebook()

### We start by visualizing the embedded graph

By construction, the embedding space is the Poincaré ball model of a hyperbolic space, i.e. the open unit disk (in 2 dimensions for this example), equipped with the metric tensor $ g_x = \left ( \frac {2} { 1 - \|x\|^2 } \right )^2 \cdot g_E $, where $g_E$ is the Euclidean metric tensor.

The model should put the top elements of the graph hierarchy in the centre of the circle, while more peripheral points should be pushed towards the outer region of the circle.


In [94]:
# Density view of the whole embedding: all node coordinates binned into hexagons of size 0.01.
p = figure(
    title="Amazon purchases network 2D embedding",
    # tools="wheel_zoom,pan,reset",
    match_aspect=True,
    background_fill_color='#440154',
)
p.grid.visible = False

embedding_xy = model.kv.vectors
bins = p.hexbin(
    embedding_xy[:, 0],
    embedding_xy[:, 1],
    0.01,
    hover_color="pink",
    hover_alpha=0.8,
)

show(p)

And indeed, this is the structure that we observe!

The representation seems pretty crowded with only 2 dimensions.

### Embedding of the ground truth communities

To assess the quality of the embedding, let's investigate how the model is organizing the ground truth communities of products.


In [95]:
p = figure(
    title="Amazon purchases network 2D embedding",
    # tools="wheel_zoom,pan,reset",
    match_aspect=True,
    background_fill_color='#440154',
)
p.grid.visible = False

# Background layer: hexagonal density of every embedded node.
embedding_xy = model.kv.vectors
bins = p.hexbin(
    embedding_xy[:, 0],
    embedding_xy[:, 1],
    0.01,
    hover_color="pink",
    hover_alpha=0.8,
)

# Use the list of named colours to give each community its own colour.
# zip() stops at the shorter sequence, so at most len(points_colors)
# communities are overlaid on the density plot.
points_colors = colors.named.__all__

for colour_name, member_xy in zip(points_colors, communities_xy):
    p.scatter(member_xy[:, 0], member_xy[:, 1], color=colour_name, alpha=0.5)

show(p)

Elements of the communities indeed end up in nearby regions of the space!

### Shape of the communities

From the plot above, we can notice that the further away a community is from the centre of the circle, the narrower its shape. This is related to the hyperbolic structure of the space.

Let's investigate the shape of the communities in some more detail. We will look in particular at the following metrics:
   * position of the community, in particular its radial distance
   * geometrical size of the community
   * aspect of the community (i.e. round vs. elongated)

In [96]:
# we use the ratio between the largest and the smallest independent components variances
# as a measure of the aspect ratio

from sklearn.decomposition import PCA

def aspect_ratio(community):
    """Measure how elongated a community is in the embedding space.

    A PCA with as many components as the first member has coordinates is
    fitted on the community, and the smallest explained-variance ratio is
    divided by the largest one.

    Parameters
    ----------
    community : sequence of coordinate vectors, one entry per member node
        Coordinates of the community members.

    Returns
    -------
    float
        Smallest / largest explained-variance ratio. Values near 1 indicate a
        round community, values near 0 an elongated one.
    """
    n_dims = community[0].shape[0]
    fitted = PCA(n_components=n_dims).fit(community)
    variance_share = fitted.explained_variance_ratio_
    return variance_share.min() / variance_share.max()
    



In [97]:
# Centroid of each community: plain mean of the members' coordinates, every node weighted equally.
centroids = np.vstack([member_xy.mean(axis=0) for member_xy in communities_xy])

# Diameter of each community: the largest (hyperbolic) distance between two of its elements.
diameters = []
for community in communities:
    keys = [str(node_id) for node_id in community]
    # For each member, only distances to itself and to the members after it are
    # requested, so every unordered pair is evaluated once.
    pairwise = [
        model.kv.distances(key, keys[position:])
        for position, key in enumerate(keys)
    ]
    diameters.append(np.hstack(pairwise).max())

diameters = np.array(diameters)

# One aspect ratio per community, in the same order as communities_xy.
aspect_ratios = np.array(list(map(aspect_ratio, communities_xy)))


In [98]:
# Histogram of the community diameters, using 200 equal-width bins.
p = figure(title="Distribution of true communities diameters",
           tools="wheel_zoom,pan,reset,hover,save",
           x_axis_label="community diameter",
           y_axis_label="number of communities",
           match_aspect=False)

hist, edges = np.histogram(diameters, bins=200)

# One rectangle per bin: consecutive edges give the left/right sides, the count gives the height.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:])

show(p)

The typical diameter of a community is between 0.1 and 0.5.

In [99]:
# Hexbin of community aspect ratio against the squared Euclidean norm of its centroid.
# The title previously duplicated the diameter histogram's title, which did not
# describe this plot.
p = figure(title="Aspect ratio of true communities vs. squared centroid radius",
           tools="box_zoom,pan,reset,hover,save",
           x_axis_label="squared distance of the centroid from the origin",
           y_axis_label="aspect ratio (smallest / largest variance share)",
           match_aspect=False)

squared_radius = (centroids**2).sum(axis=1)
p.hexbin(squared_radius, aspect_ratios, 0.025)

show(p)

Communities that are further from the origin tend to be more elongated.

In [100]:
# Collect the per-community metrics in one table, one row per community.
# The dict must be passed as the data argument: given as `columns=` it would only
# supply column labels, so none of the values would be stored. "arat" must hold the
# `aspect_ratios` array, not the `aspect_ratio` function.
df_comm = pd.DataFrame({
    "x": centroids[:, 0],
    "y": centroids[:, 1],
    "diam": diameters,
    "arat": aspect_ratios,
})

df_comm.to_csv("embedded_communities.csv")