In [1]:
import sys
import os
  
# # getting the name of the directory
# # where the this file is present.
current = os.path.dirname(os.path.realpath("DatasetAnalysis.ipynb"))
  
# # Getting the parent directory name
# # where the current directory is present.
parent = os.path.dirname(current)
  
# # adding the parent directory to 
# # the sys.path.
sys.path.append(parent)
# # print(parent)

In [58]:
import numpy as np
import matplotlib.pyplot as plt
from graphsage.utils2 import load_cora, load_ppi, load_wikics, custom_load_pubmed

In [4]:
np.random.seed(1)
rand_indices = np.random.permutation(2708)
rand_indices

array([ 996, 1140, 1145, ..., 1096,  235, 1061])

In [5]:
data_dic = load_cora({})

In [6]:
data_dic.keys()

dict_keys(['feat_data', 'labels', 'adj_lists', 'train_mask', 'test_mask', 'val_mask', 'distances', 'cluster_labels', 'freq', 'dist_in_graph', 'centralityev', 'centralitybtw', 'centralityh', 'centralityd', 'lsh_neighbour_list'])

In [10]:
data_dic['feat_data'].shape

(2708, 1433)

In [11]:
data_dic['labels'].shape

(2708, 1)

In [5]:
def get_centroid(feat_data):
    return feat_data.mean(axis=0)
def get_mean_distance_from_centroid(points,distance_type='euclidean'):
    distance_ = cdist(points,get_centroid(points).reshape(1,-1),metric=distance_type).mean(axis=0)[0]
    return distance_
def get_distance_to_other_centroids(points,centroids_mod,distance_type='euclidean',measure='nearest'):
    dist_mat = cdist(points,centroids_mod,metric=distance_type)
    if measure == 'nearest':
        interdist = dist_mat.min(axis=1).mean()
    elif measure == 'mean':
        interdist = dist_mat.mean(axis=1).mean()
    else:
        print('specify correct measure')
        return 0
    return interdist

In [17]:
sample_data = np.random.random((10,20))
get_centroid(sample_data).shape

(20,)

In [34]:
from scipy.spatial.distance import cdist
cdist(sample_data,get_centroid(sample_data).reshape(1,-1)).mean(axis=0)

array([1.21868423])

In [25]:
from scipy.spatial import distance
distance.euclidean(sample_data[5],get_centroid(sample_data))

1.1162463070332478

In [33]:
get_mean_distance_from_centroid(sample_data)

array([1.21868423])

In [35]:
sample1=np.random.random((10,20))
sample2=np.random.random((10,20))
centroid1,centroid2 = get_centroid(sample1),get_centroid(sample2)

In [51]:
cdist(sample1,np.array([centroid1,centroid2]),metric='euclidean').mean(axis=1).mean()

1.2709469765996286

In [39]:
cdist(sample1,centroid2.reshape(1,-1))

array([[1.50771416],
       [1.22161472],
       [1.39401183],
       [1.31493144],
       [1.44248919],
       [1.19471073],
       [1.44638972],
       [1.31527031],
       [1.02692004],
       [1.42214245]])

In [100]:
get_distance_to_other_centroids(sample2,np.array([centroid1]),measure='nearest')

1.3647179032374468

In [90]:
a=[1,2,3]
a.pop(1)
print(a)
np.zeros(10)

[1, 3]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [60]:
sorted(list(np.unique(data_dic['labels'])))

[0, 1, 2, 3, 4, 5, 6]

In [6]:
def get_ratio(list1,list2):
    temp=[]
    for i in range(len(list1)):
        temp.append(list1[i]/list2[i])
    return temp
#get_ratio(mean_distance_owncentroid,mean_distance_othercentroid)

In [7]:
def give_distance_statistics(feat_data, labels, measure='nearest',distance_type='euclidean'):
    feat_data = np.array(data_dic['feat_data'])
    labels = np.array(data_dic['labels'])
    mean_distance_owncentroid = []
    mean_distance_othercentroid = []
    centroids = []
    for i in sorted(list(np.unique(labels))):
        centroids.append(get_centroid(feat_data[(labels == i).reshape(-1,)]).tolist())
    for i in sorted(list(np.unique(labels))):
        temp = centroids.copy()
        temp.pop(i)
        mean_distance_owncentroid.append(get_mean_distance_from_centroid(feat_data[(labels == i).reshape(-1,)],distance_type=distance_type))
        mean_distance_othercentroid.append(get_distance_to_other_centroids(feat_data[(labels == i).reshape(-1,)],temp,distance_type=distance_type,measure=measure))
        if i == 8:
            print(mean_distance_othercentroid)
    return get_ratio(mean_distance_owncentroid,mean_distance_othercentroid)

In [42]:
def avg(mylist):
    return sum(mylist)/len(mylist)
def label_rate(adj_list,labels):
    same_label_count=[]
    diff_label_count=[]
    same_label_count_ld = []
    diff_label_count_ld = []
    for node, neighbours in adj_list.items():
        current_node_label = labels[node]
        count1,count2=0,0
        for neigh in list(neighbours):
            if current_node_label == labels[neigh]:
                count1+=1
            else:
                count2+=1
        if len(neighbours) == 0:
            same_label_count.append(0)
            diff_label_count.append(0)
        else:
            if len(neighbours) <= 5:
                same_label_count_ld.append(count1/len(neighbours))
                diff_label_count_ld.append(count2/len(neighbours))
            same_label_count.append(count1/len(neighbours))
            diff_label_count.append(count2/len(neighbours))
    #print(same_label_count,diff_label_count)
    return avg(same_label_count),avg(diff_label_count),avg(same_label_count_ld),avg(diff_label_count_ld) 

In [36]:
labels = [1,1,2,0,0]
adj_list = {0:[1,2],1:[0],2:[0,3],3:[2,4],4:[3]}
label_rate(adj_list,labels)

(0.6, 0.4, 0.6, 0.4)

In [75]:
feat_data[(labels == 0).reshape(-1,)].shape

(818, 1433)

In [76]:
(labels == 0).reshape(-1,).sum()

818

In [102]:
mean_distance_owncentroid

[3.9880151440213134,
 4.108184312386862,
 4.071730872232723,
 4.060509463212485,
 4.042021659329825,
 3.9244409164180585,
 4.067456678249878]

In [103]:
mean_distance_othercentroid

[4.035900361897457,
 4.166184735701382,
 4.146997789248885,
 4.11969111078311,
 4.087108511146307,
 4.015141303057532,
 4.1391663917048485]

### Cora

In [19]:
data_dic_cora = load_cora(lsh_helper=lsh_helper, augment_khop=True)

creating lsh
features copy shape  (2708, 1433)
done


In [37]:
print(label_rate(data_dic_cora['lsh_neighbour_list'],data_dic_cora['labels']))
print(label_rate(data_dic_cora['adj_lists'],data_dic_cora['labels']))

(0.22046566903472348, 0.7706717017186011, 0.20948835433371535, 0.7905116456662847)
(0.8251578275927925, 0.17484217240720734, 0.8308962607303944, 0.16910373926960548)


In [113]:
print(give_distance_statistics(data_dic['feat_data'],data_dic['labels']))

0
1
2
3
4
5
6
[0.9881351833340032, 0.9860782881715504, 0.9818502635300912, 0.9856344454039965, 0.988968520974297, 0.9774104122885028, 0.9826753247710261]


In [114]:
print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='mean'))

0
1
2
3
4
5
6
[0.9692983317758183, 0.9683800747977145, 0.9633841548617899, 0.9663594993491936, 0.972334162186959, 0.9601267087329962, 0.967542561617903]


In [115]:
print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='mean',distance_type='cosine'))

0
1
2
3
4
5
6
[0.8937097653366343, 0.8704860321659481, 0.848904939208603, 0.8798365281305003, 0.8898270789309762, 0.8487707097839141, 0.8703702645561137]


In [116]:
print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='nearest',distance_type='cosine'))

0
1
2
3
4
5
6
[0.9536853496383274, 0.9380107860359165, 0.9244451449036671, 0.9387265683905869, 0.9550732841329769, 0.9075683141726592, 0.923312263347089]


## WikiCS

In [9]:
lsh_helper = {'n_vectors': 16, 'search_radius': 2, 'num_lsh_neighbours': 10,'atleast': False, 'includeNeighbourhood':False}
data_dic = load_wikics(lsh_helper=lsh_helper, augment_khop=True)

Loading WikiCS
loading dataframe of
Loading normal random walks
loading khop neighbours
creating lsh
features copy shape  (11701, 300)
done


In [11]:
data_dic['lsh_neighbour_list']

{0: [2765, 5587, 1740, 10090, 6156, 164, 4977, 8916, 2268, 1217],
 1: [3283, 11605, 6797, 3496, 5856, 8985, 6984, 11103, 5004, 4248],
 2: [7983, 8069, 394, 1031, 2399, 7497, 11184, 45, 2877, 9835],
 3: [10286, 4149, 10518, 10453, 7331, 10459, 7168, 5242, 10294, 1562],
 4: [3088, 7527, 1381, 9793, 2837, 4256, 4785, 8095, 2646, 7723],
 5: [5146, 2934, 10924, 5906, 2639, 6735, 8523, 7570, 10147, 5181],
 6: [2162, 4041, 11588, 1015, 6592, 9989, 2437, 3088, 1381, 8849],
 7: [7306, 10267, 10280, 2002, 7715, 9817, 11505, 7946, 2043, 3652],
 8: [971, 9087, 11583, 3072, 3116, 10132, 5665, 507, 11152, 2865],
 9: [3277, 1552, 345, 443, 9714, 8524, 7732, 1349, 10703, 477],
 10: [3085, 9149, 2582, 6144, 2522, 7164, 7839, 7599, 9798, 6798],
 11: [10915, 2409, 11345, 5254, 7037, 4863, 3807, 10279, 7736, 1688],
 12: [6536, 7386, 2505, 10395, 1019, 10083, 7671, 8587, 3733, 1051],
 13: [537, 2027, 7804, 10378, 5179, 10036, 1511, 11584, 3914, 10221],
 14: [4084, 9868, 6541, 6364, 6964, 3150, 3770, 9485, 

In [43]:
print(label_rate(data_dic['lsh_neighbour_list'],data_dic['labels']))
print(label_rate(data_dic['adj_lists'],data_dic['labels']))

(0.6589251630914668, 0.34073298578468414, 0.39, 0.61)
(0.6496985022582711, 0.3122705602150209, 0.7182020357540007, 0.2817979642460016)


In [126]:
print(give_distance_statistics(data_dic['feat_data'],data_dic['labels']))

print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='mean'))

print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='mean',distance_type='cosine'))

print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='nearest',distance_type='cosine'))

[1.283969808161322, 1.282474237865921, 1.1821322283371603, 1.3267944629062163, 1.2518883415848734, 1.2694322883897027, 1.2282115901627089, 1.1882865666530333, 1.226803136828503]
[0.8828396447395558, 0.9988908937437361, 1.0130244160135158, 0.9377665672085207, 0.9919740757645626, 0.9192803341002034, 0.8780083788814119, 0.9702896024873485, 1.000099628138771, 0.9634512160207751]
[1.5272010746458093, 1.4913604673648104, 1.4061397249453949, 1.5348433766507712, 1.4727224957939533, 1.4385205816738629, 1.4424372584743985, 1.410394836939244, 1.4464607065447967]
[0.7422332711207326, 0.858982027281967, 0.8516428268239928, 0.8106517627784886, 0.8432279564892518, 0.8112251941717733, 0.7476096869146847, 0.8174888833973342, 0.8482258490605364, 0.8203066469827547]
[0.16402085037121006, 0.14064909445721105, 0.13626012863638032, 0.17594823406835172, 0.14340024727573722, 0.13600415767826182, 0.13110873601035836, 0.1325272108756037, nan]
[0.5510849901947928, 0.7314336042691885, 0.7216309161667189, 0.678406

## Pubmed

In [53]:
def custom_load_pubmed(lsh_helper, random_walk=False, root_folder='/home/thummala/graphsage-pytorch/datasets/pubmed-data',
                teleport_khop=False, augment_khop=False, dfactor=2, use_centroid=False, teleport=0.2, planetoid = True, load_embeds = False):
    num_nodes = 19717
    num_feats = 500
    feat_data = np.zeros((num_nodes, num_feats))
    labels = np.empty((num_nodes, 1), dtype=np.int64)
    node_map = {}
    degrees = []
    train_mask = []
    test_mask = []
    val_mask = []
    lsh_neighbourlist_dic = {}
    if planetoid:
        adj_lists, feat_data, labels, train_mask, val_mask, test_mask = load_data('pubmed')
    else:
        with open(root_folder + "/Pubmed-Diabetes.NODE.paper.tab") as fp:
            fp.readline()
            feat_map = {entry.split(":")[1]: i - 1 for i, entry in enumerate(fp.readline().split("\t"))}
            for i, line in enumerate(fp):
                info = line.split("\t")
                node_map[info[0]] = i
                labels[i] = int(info[1].split("=")[1]) - 1
                for word_info in info[2:-1]:
                    word_info = word_info.split("=")
                    feat_data[i][feat_map[word_info[0]]] = float(word_info[1])
        adj_lists = defaultdict(set)
        with open(root_folder + "/Pubmed-Diabetes.DIRECTED.cites.tab") as fp:
            fp.readline()
            fp.readline()
            for line in fp:
                info = line.strip().split("\t")
                paper1 = node_map[info[1].split(":")[1]]
                paper2 = node_map[info[-1].split(":")[1]]
                adj_lists[paper1].add(paper2)
                adj_lists[paper2].add(paper1)
    if load_embeds:
        try:
            feat_data = np.load('/home/thummala/graphsage-pytorch/res/pubmed/gswithemebeds/allnodeembeddings.npy')
            print('loading gs embeddings as node features ', feat_data.shape)
        except:
            print("Unable to load emmbeddings...working with input feature data")
    if teleport_khop or augment_khop:
        print('creating lsh')
        lsh_cand_dic = return_lsh_candidates(np.array(feat_data), n_vectors=lsh_helper['n_vectors'],
                                             num_lsh_neighbours=lsh_helper['num_lsh_neighbours'],
                                             atleast=lsh_helper['atleast'], search_radius=lsh_helper['search_radius'])
        print('done')
    else:
        lsh_cand_dic = {}

    for key, value in adj_lists.items():
        node = int(key)
        if augment_khop:
            if np.all((feat_data[node] == 0)):
                lsh_neighbourlist_dic[node] = []
            else:
                lsh_neighbourlist_dic[node] = list(lsh_cand_dic[node].keys())

    if teleport_khop:
        adj_lists = Do_Teleport_Khop(adj_list=adj_lists, lsh_cand_dic=lsh_cand_dic, feats=feat_data, dfactor=dfactor)

    for i in range(num_nodes):
        degrees.append(len(list(adj_lists[i])))

    data_loader_dic = {'feat_data': feat_data, 'labels': labels, 'adj_lists': adj_lists, 'train_mask': train_mask,
                       'test_mask': test_mask, 'val_mask': val_mask, 'distances': [],
                       'cluster_labels': [], 'freq': {}, 'dist_in_graph': {},
                       'centralityev': [], 'centralitybtw': [], 'centralityh': [],
                       'centralityd': degrees, 'lsh_neighbour_list': lsh_neighbourlist_dic}

    return data_loader_dic

In [59]:
data_dic2 = custom_load_pubmed(lsh_helper={})

loading gs embeddings as node features  (19717, 128)


In [120]:
print(give_distance_statistics(data_dic['feat_data'],data_dic['labels']))

print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='mean'))

print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='mean',distance_type='cosine'))

print(give_distance_statistics(data_dic['feat_data'],data_dic['labels'],measure='nearest',distance_type='cosine'))

0
1
2
[0.9665461569992628, 0.9908422105892141, 0.9781508772120524]
0
1
2
[0.9605801211430621, 0.97169988168909, 0.9582733954288206]
0
1
2
[0.828178198162361, 0.8966486642092393, 0.868239573981817]
0
1
2
[0.8522521899657841, 0.9572069469908958, 0.9153486606916252]


In [57]:
data_dic['adj_lists']

defaultdict(list,
            {0: [14442, 1378, 1544, 6092, 7636],
             1: [10199, 8359, 2943],
             2: [11485, 15572, 10471],
             3: [8249],
             4: [14044],
             5: [1312, 12968],
             6: [17284,
              8661,
              3150,
              18614,
              7296,
              2216,
              8981,
              13656,
              6572,
              3509,
              13655,
              12098,
              7691,
              9232,
              7335,
              4464,
              2128,
              16720,
              767,
              6697,
              18121,
              10265],
             7: [15577,
              14688,
              17955,
              18425,
              6242,
              2343,
              17354,
              14754,
              5564,
              2019,
              19376,
              1568,
              14843,
              1588,
              4058,
              1

In [65]:
data_dic2['adj_lists']

defaultdict(set,
            {10816: {101,
              313,
              318,
              472,
              1862,
              2110,
              5607,
              5870,
              6169,
              6940,
              8153,
              10017,
              10645,
              11050,
              11684,
              12307,
              12499,
              15078,
              15348,
              15773,
              17021,
              17562,
              18059,
              18459,
              19450},
             18459: {8101, 10816},
             8069: {412,
              1682,
              2485,
              3530,
              3699,
              3834,
              4030,
              4078,
              5927,
              6421,
              7718,
              11420,
              12522,
              13841,
              14085,
              14272,
              14852,
              14863,
              15413,
              16829,
              16

## Planetoid

In [70]:
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys


def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

def load_data(dataset_str):
    """
    Loads input data from gcn/data directory
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.
    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    dir_path = "/home/thummala/graphsage-pytorch/datasets/planetoid_data/"
    for i in range(len(names)):
        with open(dir_path+"ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file(dir_path+"ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    features = features.toarray()
    #adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    adj=graph
    for node, neighbours in adj.items():
        adj[node] = set(neighbours)
    
    labels = np.vstack((ally, ty)).argmax(axis=1)
    labels[test_idx_reorder] = labels[test_idx_range]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

#     y_train = np.zeros(labels.shape)
#     y_val = np.zeros(labels.shape)
#     y_test = np.zeros(labels.shape)
#     y_train[train_mask, :] = labels[train_mask, :]
#     y_val[val_mask, :] = labels[val_mask, :]
#     y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, labels, train_mask, val_mask, test_mask

In [78]:
adj, features, labels, train_mask, val_mask, test_mask = load_data('pubmed')

In [79]:
labels[20:25]

array([2, 1, 2, 2, 2])

In [69]:
labels2[20:25]

array([[0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]], dtype=int32)

In [51]:
train_mask.shape

(19717,)

In [52]:
features.shape

(19717, 500)

In [72]:
adj

defaultdict(list,
            {0: {1378, 1544, 6092, 7636, 14442},
             1: {2943, 8359, 10199},
             2: {10471, 11485, 15572},
             3: {8249},
             4: {14044},
             5: {1312, 12968},
             6: {767,
              2128,
              2216,
              3150,
              3509,
              4464,
              6572,
              6697,
              7296,
              7335,
              7691,
              8661,
              8981,
              9232,
              10265,
              12098,
              13655,
              13656,
              16720,
              17284,
              18121,
              18614},
             7: {1568,
              1588,
              2019,
              2343,
              4058,
              5564,
              6242,
              8335,
              10243,
              14688,
              14754,
              14843,
              15577,
              17354,
              17955,
              18

In [77]:
train = np.array([i for i, x in enumerate(test_mask) if x])
train.shape

(1000,)

In [None]:
test = np.array([i for i, x in enumerate(data_dic['test_mask']) if x])
            val = np.array([i for i, x in enumerate(data_dic['val_mask']) if x])
            train = np.array([i for i, x in enumerate(data_dic['train_mask']) if x])