In [1]:
from utils import ColorRefinement,utils
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
# Load the molecule table through the project helper. The preview below shows
# the columns `Drug`, `target` and `Graph`.
df = utils.load_graph_repersentations()
df.head()

Unnamed: 0,Drug,target,Graph
0,CCN1C(=O)/C(=C2\SC(=S)N(CCCOC)C2=O)c2ccccc21,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,COc1ccccc1CNC(=O)Cn1nnc(-c2ccncc2)n1,1,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
4,CCC(c1nnnn1CC1CCCO1)N(CCN1CCOCC1)Cc1cc2cc(C)cc...,0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [6]:
graphs = df['Graph']

# Sanity-check the refinement on the first molecule.
# `.iloc[0]` picks the first graph by position, so the cell does not depend on
# the index happening to contain the label 0.
# The row label is derived from the round count (3 -> 'color_2'), the same
# `color_{N-1}` convention that get_N_hop_emmbedding reads, instead of being
# hard-coded separately from it.
n_rounds, n_buckets = 3, 1000
ColorRefinement.embedd_graph_with_color_refinement(graphs.iloc[0], n_rounds, n_buckets).loc[f'color_{n_rounds - 1}', :]

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: color_2, Length: 1000, dtype: object

# Embed the molecules using Color Refinement
 
 
I implemented a color refinement algorithm in `utils/ColorRefinement.py`.
 
This technique takes an arbitrarily sized graph with arbitrary node attributes and encodes each node's neighborhood and attributes as a color, such that when two nodes have the same color their local neighborhoods are identical, up to a false-positive (collision) rate of roughly 1/(number of possible colors).
 



In [48]:
def get_N_hop_emmbedding(df, N: int, num_buckets):
    """Build one bag-of-colors embedding row per graph in ``df``.

    For every entry of ``df['Graph']`` the color-refinement helper is called
    with ``N`` and ``num_buckets``, and the ``color_{N-1}`` row of its result
    is stored as that graph's embedding.

    Args:
        df: DataFrame with a ``Graph`` column. Its index is reused, unchanged,
            as the index of the returned frame.
        N: Forwarded to the refinement helper; the row labelled
            ``color_{N-1}`` of the helper's result is extracted.
        num_buckets: Forwarded to the refinement helper; also the number of
            embedding columns, labelled ``0 .. num_buckets - 1``.

    Returns:
        DataFrame indexed like ``df`` with one column per bucket.
    """
    embedding_df = pd.DataFrame(index=df.index, columns=list(range(num_buckets)))
    final_round = f'color_{N-1}'

    # Write rows by position rather than by the enumerate counter used as a
    # label: with a non-default index (filtered, shuffled or split frames)
    # `.loc[counter]` would append new rows and leave the real ones empty.
    for position, graph in enumerate(df['Graph']):
        refinement = ColorRefinement.embedd_graph_with_color_refinement(graph, N, num_buckets)
        bag_of_colors = refinement.loc[final_round, :]
        # Align on the bucket labels explicitly, since positional assignment
        # of a raw array does no label alignment of its own.
        embedding_df.iloc[position] = bag_of_colors.reindex(embedding_df.columns).to_numpy()

    return embedding_df


In [37]:
# Write one embedding CSV per neighborhood depth (1 to 6 hops). Every depth
# uses the same number of color buckets, so that value is named once here.
NUM_BUCKETS = 5000

for n_hops in range(1, 7):
    embeddings = get_N_hop_emmbedding(df, n_hops, NUM_BUCKETS)
    embeddings.to_csv(f'data/color_refined_molecules_{n_hops}_hop.csv')

In [44]:
# Reload the 3-hop embeddings written earlier; the first CSV column holds the row index.
df_hop_3 = pd.read_csv('data/color_refined_molecules_3_hop.csv', index_col=0)

In [45]:
# Preview the first rows: one row per molecule, one column per color bucket.
df_hop_3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Total count of each color over all rows, ten largest totals first.
hop_3_color_totals = df_hop_3.sum(axis=0)
hop_3_color_totals.sort_values(ascending=False).head(10)

584     14127
133      4008
1610     2677
1607     2634
3066     2631
2562     2413
3264     1940
3607     1788
3044     1713
4252     1677
dtype: int64

This matches what I would expect: many of the 3-hop neighborhoods are very common, and there are fewer very uncommon ones.

I expect that there will be more diversity in neighborhoods as you increase the number of hops.

In [54]:
# Load the 4-hop embeddings and print the ten colors with the largest totals
# summed over all rows.
df_hop_4 = pd.read_csv('data/color_refined_molecules_4_hop.csv', index_col=0)
hop_4_color_totals = df_hop_4.sum(axis=0)
print(hop_4_color_totals.sort_values(ascending=False).head(10))

4336    3802
2729    2087
4122    1401
1281    1300
758     1293
38      1151
1658     982
1435     975
1718     950
4796     865
dtype: int64


In [53]:
# Load the 5-hop embeddings and print the ten colors with the largest totals
# summed over all rows.
df_hop_5 = pd.read_csv('data/color_refined_molecules_5_hop.csv', index_col=0)
hop_5_color_totals = df_hop_5.sum(axis=0)
print(hop_5_color_totals.sort_values(ascending=False).head(10))

1523    1039
236      823
785      773
3442     734
4696     634
1862     573
820      551
2188     539
1549     517
2543     500
dtype: int64


In [55]:
# Load the 6-hop embeddings and print the ten colors with the largest totals
# summed over all rows.
df_hop_6 = pd.read_csv('data/color_refined_molecules_6_hop.csv', index_col=0)
hop_6_color_totals = df_hop_6.sum(axis=0)
print(hop_6_color_totals.sort_values(ascending=False).head(10))

972     757
1003    368
2416    331
1117    327
2143    302
4133    280
4287    265
4881    264
1629    264
4589    261
dtype: int64


Each color, such as the color `972`, is a 6-hop neighborhood; this one is found a total of 757 times throughout the data.
 
As you increase the number of hops, the sample space of possible neighborhoods gets more complex and accordingly the neighborhoods get more diverse.
 
There is a trade-off between a higher number of hops, which captures more complexity, and a lower number of hops, which leaves more examples of each neighborhood to train the model on.
 
Without testing, I expect the optimal number of hops to be between 3 and 7.
 
It is also possible that I could combine the results of each iteration.
 
This maps every graph into an int vector of size 5000.
 
By appending the vectors to each other I could increase the number of features arbitrarily.