In [19]:
import pandas as pd
import networkx as nx

postitive_edges = pd.read_csv('../../data/edgelist.txt', sep=',', header=None)
postitive_edges.columns = ['source', 'target']

negative_edges = pd.read_csv('../../data/training/negative/non_existing.csv')

G = nx.from_pandas_edgelist(postitive_edges, 'source', 'target')

# Get distance for each pair of non-existing edges, handling no-path cases
def get_distance(row):
    source = row['source']
    target = row['target']
    if G.has_node(source) and G.has_node(target):
        try:
            return nx.shortest_path_length(G, source, target)
        except nx.NetworkXNoPath:
            return float('inf')  # or -1 if you prefer
    else:
        return float('inf')


In [None]:

negative_edges['distance'] = negative_edges.apply(get_distance, axis=1)
negative_edges.head()

In [10]:
negative_edges.drop(['Unnamed: 0'], axis=1, inplace=True, errors='ignore')
negative_edges.head()

Unnamed: 0,source,target,distance
0,25323,54708,4.0
1,72184,108982,5.0
2,113386,4507,5.0
3,11191,128219,5.0
4,28126,85955,3.0


In [11]:
negative_edges.to_csv('../../data/training/negative/negative_edges_node_distance.csv', index=False)

In [13]:
predict_edges = pd.read_csv('../../data/test.txt', sep=',', header=None)
predict_edges.columns = ['source', 'target']

In [16]:
predict_edges['distance'] = predict_edges.apply(get_distance, axis=1)
predict_edges.head()
predict_edges.to_csv('../../data/training/predict/predict_edges_node_distance.csv', index=False)

In [17]:
def get_distance_postive(row):
    source = row['source']
    target = row['target']
    G.remove_edge(source, target)
    res = 0
    try:
        res = nx.shortest_path_length(G, source, target)
    except nx.NetworkXNoPath:
        res = float('inf')  # or -1 if you prefer
    G.add_edge(source, target)
    return res

In [22]:
postitive_edges['distance'] = postitive_edges.apply(get_distance_postive, axis=1)
postitive_edges.head(20)

Unnamed: 0,source,target,distance
0,0,1,6.0
1,0,2,6.0
2,1,3,3.0
3,1,5,2.0
4,1,6,2.0
5,1,7,3.0
6,1,9,2.0
7,1,10,2.0
8,1,11,4.0
9,1,12,2.0


In [23]:
postitive_edges.to_csv('../../data/training/positive/positive_edges_node_distance.csv', index=False)

In [30]:
predict_edges['source_in_degree'] = predict_edges['source'].apply(lambda x: G.degree(x))
predict_edges['target_in_degree'] = predict_edges['target'].apply(lambda x: G.degree(x))
predict_edges.drop('distance', axis=1, inplace=True, errors='ignore')
predict_edges.to_csv('../../data/training/predict/predict_edges_node_in_degree.csv', index=False)
predict_edges.head()


Unnamed: 0,source,target,source_in_degree,target_in_degree
0,34977,59394,62,31
1,22518,46602,7,18
2,36762,22813,16,256
3,44960,110384,72,5
4,29015,26366,28,9


In [26]:
negative_edges['source_in_degree'] = negative_edges['source'].apply(lambda x: G.degree(x))
negative_edges['target_in_degree'] = negative_edges['target'].apply(lambda x: G.degree(x))
negative_edges.drop('distance', axis=1, inplace=True, errors='ignore')
negative_edges.head()
negative_edges.to_csv('../../data/training/negative/negative_edges_node_in_degree.csv', index=False)

In [27]:
negative_edges.head()

Unnamed: 0.1,Unnamed: 0,source,target,source_in_degree,target_in_degree
0,0,25323,54708,30,14
1,1,72184,108982,13,1
2,2,113386,4507,4,45
3,3,11191,128219,27,2
4,4,28126,85955,28,18


In [28]:
postitive_edges['source_in_degree'] = postitive_edges['source'].apply(lambda x: G.degree(x))
postitive_edges['target_in_degree'] = postitive_edges['target'].apply(lambda x: G.degree(x)-1)
postitive_edges.drop('distance', axis=1, inplace=True, errors='ignore')
postitive_edges.head()
postitive_edges.to_csv('../../data/training/positive/positive_edges_node_in_degree.csv', index=False)