# Import Package

In [1]:
import pandas as pd
import numpy as np
import random
import igraph as ig
from sklearn.linear_model import LogisticRegressionCV

In [2]:
class TranditionalPredict():   
    def __init__(self, G):
        self.G = G
    '''calucate disconnected pairs'''
    def disconnected_node_pairs(self):
        possible_node_pairs = list()
        adjacent_matrix = self.G.get_adjacency()
        for i in range(adjacent_matrix.shape[0]):
            for j in range(adjacent_matrix.shape[1]):
                if i != j and adjacent_matrix[i][j] == 0:
                    possible_node_pairs.append((i, j))
        return possible_node_pairs

    def similarity_score(self, method, vertice_a, vertice_b):
        # vertice neighbors
        a_neighbors = set(self.G.neighborhood(vertice_a)) # - {vertice_a}
        b_neighbors = set(self.G.neighborhood(vertice_b)) # - {vertice_b}
        # neighbor intersection and union
        neighbor_intersection = a_neighbors.intersection(b_neighbors)
        neighbor_union = a_neighbors.union(b_neighbors)
        '''similarity method'''
        switcher = {
            # Local Neighborhood
            'common neighbors' : lambda : len(neighbor_intersection),
            'jaccard coefficient' : lambda : len(neighbor_intersection) / len(neighbor_union), # if len(neighbor_union) > 0 else 0,
            'adamic' : lambda: sum( [ 0 if self.G.degree == [] else (1.0/np.log(self.G.degree(vertice)) if self.G.degree(vertice) > 1 else 1.0/np.log(self.G.degree(vertice)+0.5)) for vertice in neighbor_intersection ] ),
            # shortest path
            'shortest path' : lambda: -min(self.G.shortest_paths(vertice_a, vertice_b)[0]),           
            # combine function
            'preferential attachment' : lambda: self.G.degree(vertice_a) * self.G.degree(vertice_b),
            'clustering coefficient' : lambda: self.G.transitivity_local_undirected(vertice_a, mode='zero') * self.G.transitivity_local_undirected(vertice_b, mode='zero')
        }
        function = switcher.get(method, lambda : 'method name invalid')
        return function()
    
    def feature_extraction(self, method_list, df, testing=False):
        feature_list = list()
        for index, row in df.iterrows():
            if testing:
                v_a, v_b = row
            else:
                v_a, v_b, label = row
             # similarrity score
            feature_values = { method:self.similarity_score(method, v_a, v_b) for method in method_list }
            feature_list.append(feature_values)
        return feature_list

# Load Data

In [3]:
if __name__ == '__main__':
    node_pairs_df = pd.read_csv("data_train_edge.csv")
    test_data_df = pd.read_csv("predict.csv")

# Create Graph

In [4]:
    node_num = 1005
    node_pairs = [ pair for pair in zip(node_pairs_df['node1'], node_pairs_df['node2'])]
    current_snapshot = ig.Graph()
    current_snapshot.add_vertices(node_num)
    current_snapshot.add_edges(node_pairs)
    print(current_snapshot)

IGRAPH U--- 1005 19943 --
+ edges:
   0 --    1    5    5    6    6   17   18   18   73   73   74   74   88   88
101  103  103  120  146  146  148  166  166  177  178  178  215  215  218  218
221  222  222  223  226  238  248  248  250  250  283  309  309  313  316  316
377  377  380  459  459  498  498  560  560  581  734
   1 --    0   17   21   74   82   84  106  121  127  142  146  147  187  199
221  222  224  225  232  250  254  255  268  280  284  310  316  351  368  377
450  459  537  548  549  560  568  641  726  979
   2 --    3    3    4    4    5    5    6    6   13   54   55   55   56   56
57   58   59   59   62   63   63   64   86   89   89   96  102  102  107  114
126  132  137  137  138  138  140  158  158  160  160  162  164  166  174  174
192  193  195  195  208  211  211  238  238  249  249  252  252  271  273  273
281  281  283  283  285  285  286  286  300  300  305  305  308  323  323  405
408  408  409  411  411  412  412  434  434  481  481  489  489  494  506  5

# Labeling training data

In [5]:
    seed = 42
    sample_rate = 0.2
    random.seed(seed)
    # sampling edges(postive)
    positive_instance = random.sample(current_snapshot.get_edgelist(), int(current_snapshot.ecount()*sample_rate))
    
    previous_snapshot = current_snapshot.copy()
    # remove postive edges
    previous_snapshot.delete_edges(positive_instance)
    # calculate no edge pairs
    tp = TranditionalPredict(previous_snapshot)
    no_edge_pairs = tp.disconnected_node_pairs()
    # sampling edges(negative)
    no_edge_pairs_df = pd.DataFrame(no_edge_pairs, columns=['node1', 'node2'])
    # labeling
    no_edge_pairs_df['label'] = [ 1 if tuple(node_pair) in positive_instance else 0 for index, node_pair in no_edge_pairs_df.iterrows() ]
    
    negative_num, positive_num = no_edge_pairs_df.label.value_counts()
    print(negative_num, positive_num)
    positive_df = no_edge_pairs_df[no_edge_pairs_df['label'] == 1]
    negative_df = no_edge_pairs_df[no_edge_pairs_df['label'] == 0]
    
    negative_df = negative_df.sample(positive_num, replace=True)
    dataset_df = positive_df.append(negative_df)
    positive_num, negative_num = dataset_df.label.value_counts()
    print(positive_num, negative_num)

982207 1753
1753 1753


# Calculate feature value

In [6]:
    feature_df =  pd.DataFrame(columns=['common neighbors', 'jaccard coefficient', 'shortest path', 'preferential attachment', 'adamic']) # clustering coefficient', 'adamic'
    feature_list = tp.feature_extraction(feature_df.columns.to_list(), dataset_df)
    feature_df = feature_df.append(feature_list)
    feature_df['label'] = dataset_df['label'].to_list()
    feature_df.tail()

Unnamed: 0,common neighbors,jaccard coefficient,shortest path,preferential attachment,adamic,label
3501,9,0.078261,-2.0,3600,1.834222,0
3502,2,0.043478,-2.0,539,0.520628,0
3503,0,0.0,-4.0,10,0.0,0
3504,0,0.0,-3.0,570,0.0,0
3505,0,0.0,-3.0,110,0.0,0


# Data cleaning

In [7]:
    feature_df.replace(-np.inf, 0, inplace=True)
    feature_df.tail()

Unnamed: 0,common neighbors,jaccard coefficient,shortest path,preferential attachment,adamic,label
3501,9,0.078261,-2.0,3600,1.834222,0
3502,2,0.043478,-2.0,539,0.520628,0
3503,0,0.0,-4.0,10,0.0,0
3504,0,0.0,-3.0,570,0.0,0
3505,0,0.0,-3.0,110,0.0,0


# Training model with Logistic Regression

## trainning

In [8]:
    X_train, y_train = feature_df.iloc[:, :feature_df.shape[1]-1].to_numpy(), feature_df['label']
    logistic_regression = LogisticRegressionCV(cv=10, random_state=seed)
    logistic_regression.fit(X_train,y_train)
    
    predictions = logistic_regression.predict(X_train)
    accuracy = logistic_regression.score(X_train, y_train)
    print(accuracy)

0.8462635482030805


# Testing

In [9]:
    test_data_df.head()

Unnamed: 0,node1,node2
0,449,134
1,762,435
2,18,642
3,358,264
4,69,108


In [10]:
    tp = TranditionalPredict(current_snapshot)
    test_feature_df = pd.DataFrame(columns=('common neighbors', 'jaccard coefficient',  'shortest path', 'preferential attachment', 'adamic')) # 'clustering coefficient', 'adamic'
    test_feature_list = tp.feature_extraction(test_feature_df.columns.to_list(), test_data_df, testing=True)

In [11]:
    test_feature_df = test_feature_df.append(test_feature_list)
    test_feature_df.tail()

Unnamed: 0,common neighbors,jaccard coefficient,shortest path,preferential attachment,adamic
9995,15,0.15625,-1.0,5950,3.220475
9996,2,0.019608,-2.0,1380,0.363324
9997,8,0.163265,-1.0,1482,1.969086
9998,0,0.0,-3.0,484,0.0
9999,0,0.0,-inf,0,0.0


# Data cleaning

In [12]:
    test_feature_df.replace(-np.inf, 0, inplace=True)
    test_feature_df.tail()

Unnamed: 0,common neighbors,jaccard coefficient,shortest path,preferential attachment,adamic
9995,15,0.15625,-1.0,5950,3.220475
9996,2,0.019608,-2.0,1380,0.363324
9997,8,0.163265,-1.0,1482,1.969086
9998,0,0.0,-3.0,484,0.0
9999,0,0.0,0.0,0,0.0


# Prediction

In [13]:
    X_test = test_feature_df.to_numpy()
    predictions = logistic_regression.predict(X_test)

In [14]:
    result = pd.DataFrame(columns=('predict_nodepair_id', 'ans'))
    ans_list = list()
    for index, ans in enumerate(predictions):
        res = {
            'predict_nodepair_id': index,
            'ans': ans
        }
        ans_list.append(res)
    result = result.append(ans_list)

In [15]:
    result.tail()

Unnamed: 0,predict_nodepair_id,ans
9995,9995,1
9996,9996,0
9997,9997,1
9998,9998,0
9999,9999,0


In [16]:
    result.to_csv('link_prediction_ans.csv', index=False)