## 1. 사용할 패키지 불러오기

In [1]:
import networkx as nx
import csv
import numpy as np
from random import randint
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost
from sklearn.ensemble import RandomForestClassifier

## 2. 데이터 불러오기

### (1) Node feature

In [19]:
node_feature = pd.read_csv('node_feature.txt', sep=" ",header=None)
node_feature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.077034,-0.065027,-0.135552,0.007305,-0.019618,-0.129522,-0.394962,-0.096774,-0.549495,-0.22968,...,0.329416,0.634487,-0.183202,0.280506,0.017737,-0.141899,0.300271,-0.148616,0.042472,-0.092819
1,0.090406,-0.055498,-0.199454,-0.073101,0.12998,-0.168792,-0.4983,0.012366,-0.517318,-0.3448,...,0.262,0.534585,-0.284113,0.193362,-0.017993,0.133143,0.415925,-0.20551,-0.057315,-0.221824
2,0.036018,-0.276833,-0.006313,0.365836,-0.054446,-0.388111,-0.310889,-0.128666,-0.691407,-0.319269,...,0.416217,0.616074,-0.175263,0.12696,0.04885,0.135687,0.397666,0.022077,0.115069,-0.050359
3,0.103949,0.141362,-0.2348,0.038294,0.029006,-0.042764,-0.260278,-0.073697,-0.443563,-0.339663,...,0.144633,0.384818,-0.301781,0.235658,-0.114685,0.138715,0.108334,-0.113345,-0.067795,-0.181579
4,-0.031268,0.008753,-0.352797,-0.175202,0.124375,-0.166064,-0.467233,-0.056786,-0.466632,-0.372931,...,0.415152,0.578069,-0.262672,0.017259,-0.017925,0.197143,0.425531,-0.134342,0.07345,-0.063862


### (2) Train edges

In [16]:
G = nx.read_edgelist('train_edges.txt', delimiter=' ', create_using=nx.DiGraph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 12588
Number of edges: 14322


### (3) Test edges

In [17]:
test_edges = pd.read_csv('unlabeled_edges.txt', sep=" ",header=None)
test_edges.columns = ['node1', 'node2']
test_edges.head()

Unnamed: 0,node1,node2
0,11468,1677
1,3508,8904
2,6724,2318
3,11662,9673
4,2035,3693


## 3. Node feature와 Graph 관련 Feature를 추가하여, 데이터 생성

Node Feature 정보를 활용하여, 각 Column에 대한 더한 값과 차이의 값을 Feature로 추가한다.

또한 Graph 정보는 두 노드의 degree와 in_degree의 더한 값과 차이의 값을 Feature로 추가한다.

#### - Train Dataset 생성

In [20]:
X_train = np.zeros((2*m, 68))
y_train = np.zeros(2*m)

for i,edge in enumerate(G.edges()):
    # an edge
    for col in range(node_feature.shape[1]):
        X_train[i,2*col] = node_features[col][edge[0]] + node_features[col][edge[1]]
        X_train[i,2*col+1] = abs(node_features[col][edge[0]] - node_features[col][edge[1]])
    
    X_train[i,64] = G.degree(edge[0]) + G.degree(edge[1]) 
    X_train[i,65] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_train[i,66] = G.in_degree(edge[0]) + G.in_degree(edge[1])
    X_train[i,67] = abs(G.in_degree(edge[0]) - G.in_degree(edge[1]))
    
    y_train[i] = 1
    
    # a randomly generated pair of nodes
    n1 = randint(0, n-1)
    n2 = randint(0, n-1)
    
    #an edge
    for col in range(node_feature.shape[1]):
        X_train[m + i,2*col] = node_features[col][n1] + node_features[col][n2]
        X_train[m + i,2*col+1] = abs(node_features[col][n1] - node_features[col][n2])
    
    X_train[m+i,64] = G.degree(edge[0]) + G.degree(edge[1]) 
    X_train[m+i,65] = abs(G.degree(edge[0]) - G.degree(edge[1]))
    X_train[m+i,66] = G.in_degree(edge[0]) + G.in_degree(edge[1])
    X_train[m+i,67] = abs(G.in_degree(edge[0]) - G.in_degree(edge[1]))

In [22]:
print('Size of training matrix:', X_train.shape)

Size of training matrix: (28644, 68)


#### - Test Dataset 생성

In [24]:
X_test = np.zeros((test_edges.shape[0], 68))

for i in range(X_test.shape[0]):
    # an edge
    for col in range(node_feature.shape[1]):
        X_test[i,2*col] = node_features[col][test_edges['node1'][i]] + node_features[col][test_edges['node2'][i]]
        X_test[i,2*col+1] = abs(node_features[col][test_edges['node1'][i]] - node_features[col][test_edges['node2'][i]])
    
    X_test[i,64] = G.degree(test_edges['node1'][i]) + G.degree(test_edges['node2'][i]) 
    X_test[i,65] = abs(G.degree(test_edges['node1'][i]) - G.degree(test_edges['node2'][i]))
    X_test[i,66] = G.in_degree(test_edges['node1'][i]) + G.in_degree(test_edges['node2'][i])
    X_test[i,67] = abs(G.in_degree(test_edges['node1'][i]) - G.in_degree(test_edges['node2'][i]))

## 4. Train 데이터의 성능 측정을 위해 Validation dataset을 10% 생성

In [30]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=17, stratify = y_train)

## 5. 모델링

### (1) Logistic Regression

In [39]:
# Use logistic regression to predict if two nodes are linked by an edge
logit_model = LogisticRegression(solver='liblinear',random_state=34)
logit_model.fit(X_train, y_train)
valid_pred = logit_model.predict(X_valid)

In [40]:
print(classification_report(y_valid, valid_pred))

              precision    recall  f1-score   support

         0.0       0.68      0.64      0.66      1432
         1.0       0.66      0.70      0.68      1433

    accuracy                           0.67      2865
   macro avg       0.67      0.67      0.67      2865
weighted avg       0.67      0.67      0.67      2865



### (2) Random Forest

In [44]:
RF_model = RandomForestClassifier(n_estimators = 500, max_depth = 5)
RF_model.fit(X_train, y_train)
valid_pred = RF_model.predict(X_valid) 

In [45]:
print(classification_report(y_valid, valid_pred))

              precision    recall  f1-score   support

         0.0       0.71      0.63      0.67      1432
         1.0       0.67      0.74      0.70      1433

    accuracy                           0.69      2865
   macro avg       0.69      0.69      0.68      2865
weighted avg       0.69      0.69      0.68      2865



### (3) XGBoost

In [46]:
xgb_model = xgboost.XGBClassifier(n_estimator = 1000, learning_rate = 0.1, random_state = 100, subsample=0.8)
xgb_model.fit(X_train, y_train)
valid_pred = xgb_model.predict(X_valid)

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [47]:
print(classification_report(y_valid, valid_pred))

              precision    recall  f1-score   support

         0.0       0.74      0.77      0.76      1432
         1.0       0.76      0.73      0.75      1433

    accuracy                           0.75      2865
   macro avg       0.75      0.75      0.75      2865
weighted avg       0.75      0.75      0.75      2865



#### 가장 성능이 좋은 Link prediction 모델은 XGBoost 이다.

## 6. Test에 대한 예측

In [52]:
y_pred = xgb_model.predict_proba(X_test)[:, 1]

In [54]:
test_edges['Linked_probability'] = y_pred
test_edges.to_csv('test_prediction.csv', index = False)