In [3]:
import pandas as pd 
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
att = np.array(pd.read_csv('data/attributes.csv'))
att

array([[0, 'l'],
       [1, 'x'],
       [2, 'x'],
       ...,
       [1497, 'l'],
       [1498, 'f'],
       [1499, 'l']], dtype=object)

In [5]:
G = nx.read_edgelist('data/edges_train.edgelist', data=False, delimiter=',', nodetype=int)
# nx.set_node_attributes(G, att, 'community')

# nx.draw(G, node_size=50, width=0.2)

In [6]:
G.number_of_nodes(), G.number_of_edges()

(1500, 6600)

#### setting:
we have an edgelist containing 6600 links of 1500 nodes and an attribute list containing 1500 nodes with a corresponding level of a categorical attribute variable.
#### aim:
predict the missing links -> should amount to being 7333 links

# Implementation of all possible Link Prediction metrics from networkx
ADD ALL NEW FEATURES TO GETFEATURES

In [7]:
# Input: getFeature(graph, node_i, node_j)
def getFeatures(G, i, j):
    # ressource allocation index
    ra = list(nx.resource_allocation_index(G, [(i, j)]))[0][2]
    
    # jaccard coefficient
    jc = list(nx.jaccard_coefficient(G, [(i, j)]))[0][2]
    
    # adamic adar index
    aa = list(nx.adamic_adar_index(G, [(i, j)]))[0][2]
    
    # preferential attachment
    pa = list(nx.preferential_attachment(G, [(i, j)]))[0][2]
    
    # #common neighbors soundarajan hopcroft
    # sh = list(nx.cn_soundarajan_hopcroft(G, [(i, j)]))[0][2]

    # #ra index soundarajan hopcroft
    # rai = list(nx.ra_index_soundarajan_hopcroft(G, [(i, j)]))[0][2]

    # #within inter cluster
    # wic = list(nx.within_inter_cluster(G, [(i, j)]))[0][2]

    # amount of common neighbors
    cn = len(list(nx.common_neighbors(G, i, j)))

    # check if the nodes are in the same cluster/same attribute
    att_same = 1 if att[i][1] == att[j][1] else 0

    # nx.shortest_path_length(G, i, j)

    #node based features -> could be worth a try
    degree_i = G.degree(i)
    degree_j = G.degree(j)
    clustering_coeff_i = nx.clustering(G, i)
    clustering_coeff_j = nx.clustering(G, j)
    # betweenness_centrality_i = nx.betweenness_centrality(G)[i]
    # betweenness_centrality_j = nx.betweenness_centrality(G)[j]

    # Edge betweenness centrality
    # edge_betweenness = nx.edge_betweenness_centrality(G)[(i, j)]


    katz_centrality_i = nx.katz_centrality_numpy(G)[i]
    katz_centrality_j = nx.katz_centrality_numpy(G)[j]

    avg_neighbor_degree_i = nx.average_neighbor_degree(G)[i]
    avg_neighbor_degree_j = nx.average_neighbor_degree(G)[j]

    closeness_centrality_i = nx.closeness_centrality(G, i)
    closeness_centrality_j = nx.closeness_centrality(G, j)

    pagerank_i = nx.pagerank(G)[i]
    pagerank_j = nx.pagerank(G)[j]


    # Add the new features to the return list
    return [ra, jc, aa, pa, cn, att_same, degree_i, degree_j, clustering_coeff_i, clustering_coeff_j, 
            katz_centrality_i, katz_centrality_j, avg_neighbor_degree_i, avg_neighbor_degree_j, 
            closeness_centrality_i, closeness_centrality_j, pagerank_i, pagerank_j]


In [8]:
# check if att_same works
att[2][1] == att[0][1]

False

the idea is to create features for all current 

In [9]:
X = []
y = []

for (i, j) in G.edges:
    X.append(getFeatures(G, i, j))
    y.append(1)
    print(i,j)
solInput = pd.read_csv('data/solutionInput.csv')

# set length of 0s to modify ratio (currently set to 1:1)
length_to_modify_ratio = len(X)


for kk in range(length_to_modify_ratio):
    print("0: " + str(kk))
    #set possible i and j ranges
    possibleiandj = len(att)

    i = np.random.randint(possibleiandj)
    j = np.random.randint(possibleiandj)

    # check if edge already exists -> should be yes
    # check if i and j are the same -> no edge to itself
    # check if i and j are in the solution set/x_test -> should not be set to anything to avoid distorting the solution
    while (i, j) in G.edges or i == j or (i in solInput['int1'] and j in solInput['int2']):
        i = np.random.randint(possibleiandj)
        j = np.random.randint(possibleiandj)

    X.append(getFeatures(G, i, j))
    y.append(0)

0 5
0 6
0 7
0 8
0 9
0 12
0 13
0 14
0 16
0 17
0 18
0 21
0 22
0 23
0 26
0 34
0 35
0 40
0 48
0 55
0 57
0 63
0 71
0 81
0 103
0 113
0 115
0 118
0 120
0 124
0 130
0 161
0 162
0 173
0 177
0 178
0 179
0 204
0 213
0 232
0 239
0 242
0 247
0 112
0 218
0 76
0 102
0 323
5 2
5 3
5 4
5 6
5 7
5 8
5 9
5 10
5 12
5 13
5 15
5 17
5 20
5 22
5 28
5 32
5 35
5 36
5 43
5 47
5 53
5 60
5 61
5 70
5 75
5 76
5 77
5 83
5 85
5 86
5 90
5 94
5 97
5 98
5 100
5 101
5 107
5 108
5 113
5 114
5 115
5 116
5 119
5 125
5 126
5 133
5 135
5 139
5 147
5 153
5 162
5 164
5 165
5 174
5 186
5 201
5 203
5 232
5 239
5 241
5 23
5 561
5 521
5 266
5 769
6 1
6 4
6 11
6 15
6 38
6 40
6 42
6 70
6 120
6 122
6 150
6 168
6 202
6 288
6 190
6 1105
6 668
6 858
7 2
7 3
7 4
7 8
7 9
7 11
7 14
7 20
7 21
7 23
7 26
7 27
7 34
7 38
7 39
7 42
7 50
7 52
7 53
7 55
7 57
7 58
7 61
7 66
7 69
7 75
7 77
7 78
7 82
7 89
7 97
7 102
7 110
7 113
7 119
7 124
7 125
7 126
7 131
7 132
7 134
7 136
7 145
7 146
7 152
7 154
7 159
7 168
7 172
7 177
7 179
7 182
7 183
7 189
7 193
7

In [10]:
num_rows = len(X)
num_cols = len(X[0]) if X else 0
print(f"Dimensions of features: {num_rows} rows, {num_cols} columns")

Dimensions of features: 13200 rows, 18 columns


In [11]:
X

[[1.5303718734300502,
  0.16326530612244897,
  6.708297891408716,
  3168,
  16,
  0,
  48,
  66,
  0.06560283687943262,
  0.05407925407925408,
  0.09353853992444142,
  0.15228983795111145,
  13.625,
  13.06060606060606,
  0.30566884176182707,
  0.3289444810182137,
  0.003190576777758001,
  0.004382807993236798],
 [0.22348484848484845,
  0.046153846153846156,
  1.1220111034358629,
  960,
  3,
  0,
  48,
  20,
  0.06560283687943262,
  0.06842105263157895,
  0.09353853992444142,
  0.026309583404888037,
  13.625,
  17.2,
  0.30566884176182707,
  0.2957774269928966,
  0.003190576777758001,
  0.0013673840197808746],
 [1.4982719177841128,
  0.14423076923076922,
  6.434748690039283,
  3408,
  15,
  0,
  48,
  71,
  0.06560283687943262,
  0.04024144869215292,
  0.09353853992444142,
  0.1718268054743426,
  13.625,
  12.014084507042254,
  0.30566884176182707,
  0.3296679129096107,
  0.003190576777758001,
  0.004709368875774116],
 [0.352569355527102,
  0.09523809523809523,
  2.054963393280795,
  1

In [12]:
X_kaggle = []
X_kaggle = [getFeatures(G, i, j) for i, j in zip(solInput['int1'], solInput['int2'])]
X_kaggle

[[0,
  0.0,
  0,
  30,
  0,
  0,
  6,
  5,
  0.06666666666666667,
  0.3,
  0.010393788558512175,
  0.00878733845188821,
  24.666666666666668,
  20.4,
  0.2839015151515151,
  0.25782593739250087,
  0.000477867202719278,
  0.0004005662952779569],
 [0.18553459119496854,
  0.04878048780487805,
  0.809981275302947,
  222,
  2,
  1,
  37,
  6,
  0.028528528528528527,
  0.13333333333333333,
  0.0634541737961876,
  0.005075306942475389,
  12.972972972972974,
  18.666666666666668,
  0.3115128844555278,
  0.2504594820384294,
  0.002456332120917324,
  0.0004936296890873402],
 [0,
  0.0,
  0,
  49,
  0,
  0,
  7,
  7,
  0.047619047619047616,
  0.047619047619047616,
  -0.0055149980326188106,
  -0.015152808605859982,
  9.571428571428571,
  7.571428571428571,
  0.2579590431939425,
  0.22145073127492984,
  0.0005722683420206741,
  0.0005734477595520348],
 [0,
  0.0,
  0,
  105,
  0,
  0,
  7,
  15,
  0.09523809523809523,
  0.08571428571428572,
  -0.008916295179344919,
  0.027465583500374993,
  9.85714

In [13]:
pd.DataFrame(X).to_csv('data/x.csv', index=False)
pd.DataFrame(y).to_csv('data/y.csv', index=False)
pd.DataFrame(X_kaggle).to_csv('data/X_kaggle.csv', index=False)