In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import re
from node2vec import Node2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
# Load the raw CSV rows and draw the working samples.

SEED = 42                # fixed seed so a fresh Run All draws the same samples
N_LINK_SAMPLES = 20000   # number of training rows kept
N_USER_SAMPLES = 1500    # number of user rows kept

# Seeds Python's `random` module, which every random.sample call in this
# notebook draws from.
random.seed(SEED)


def _read_rows(path):
    """Return the lines of the text file at `path`, minus its first line.

    The first line is dropped unconditionally (assumed to be a header row).
    """
    with open(path) as f:
        return f.read().splitlines()[1:]


# load users
users = _read_rows("users.csv")
# load train data
links = _read_rows("train.csv")
# load test data
tests = _read_rows("test.csv")

# take a sample of the data
# train data sample
links_sample = random.sample(links, N_LINK_SAMPLES)
# user sample
users_sample = random.sample(users, N_USER_SAMPLES)

In [3]:
# Split each sampled "node_1,node_2,..." row into its two endpoint lists.
node_list_1 = []
node_list_2 = []
for row in tqdm(links_sample):
    fields = row.split(',')  # split once, then pick the first two fields
    node_list_1.append(fields[0])
    node_list_2.append(fields[1])

# Two-column edge list: one row per sampled link.
df = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})

100%|██████████| 20000/20000 [00:00<00:00, 801211.85it/s]


In [4]:
# Build the graph of sampled links: each row of df contributes one edge
# between its node_1 and node_2 values. nx.Graph() is NetworkX's undirected
# simple-graph class, so the edges carry no direction.

G = nx.from_pandas_edgelist(df, "node_1", "node_2", create_using=nx.Graph())

In [5]:
# Combine both endpoint lists to identify every node that appears in an edge.
node_list = node_list_1 + node_list_2
# Remove duplicates; dict.fromkeys keeps the first-seen order.
node_list = list(dict.fromkeys(node_list))
# Build the adjacency matrix, with rows/columns ordered like node_list.
# nx.to_numpy_matrix was deprecated and then removed in NetworkX 3.0;
# nx.to_numpy_array returns a plain 2-D ndarray that supports the same
# adj_G[i, j] indexing and adj_G.shape used when scanning for unconnected pairs.
adj_G = nx.to_numpy_array(G, nodelist=node_list)

In [6]:
# Collect every pair of distinct nodes with no edge between them.
all_unconnected_pairs = []

# Scan only the cells strictly above the diagonal (column > row), so each
# unordered pair is visited exactly once and self-pairs are skipped.
n_cols = adj_G.shape[1]
for row in tqdm(range(adj_G.shape[0])):
    for col in range(row + 1, n_cols):
        if adj_G[row, col] == 0:
            all_unconnected_pairs.append([node_list[row], node_list[col]])

100%|██████████| 17553/17553 [05:17<00:00, 55.24it/s]  


In [7]:
# Negative half of the training frame: keep 30,000 randomly chosen
# unconnected pairs (the full list is replaced by the sample).
all_unconnected_pairs = random.sample(all_unconnected_pairs, 30000)

# Separate the sampled [node_1, node_2] pairs into one list per column.
node_1_unlinked = []
node_2_unlinked = []
for first_node, second_node in all_unconnected_pairs:
    node_1_unlinked.append(first_node)
    node_2_unlinked.append(second_node)

# Target variable 'link' is 0 for every unconnected pair.
data = pd.DataFrame(
    {'node_1': node_1_unlinked, 'node_2': node_2_unlinked}
).assign(link=0)

In [8]:
# Add the positive examples (the sampled edges) to the training data.
# assign() returns a new frame with target variable link = 1, so the label
# column is not written back into df (a plain `df_ghost = df` would only
# alias df and mutate it).
df_ghost = df.assign(link=1)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame with a
# fresh 0..n-1 index.
# NOTE: this cell extends `data` in place of its previous value, so running
# it twice adds the positives twice -- re-run the negative-sampling cell first.
data = pd.concat(
    [data, df_ghost[['node_1', 'node_2', 'link']]],
    ignore_index=True,
)


In [9]:
# Class balance check: number of rows per value of the 'link' target
# (0 = unconnected pair, 1 = existing edge).
data['link'].value_counts()

0    30000
1    20000
Name: link, dtype: int64

In [10]:
# Graph used to learn the training embeddings. Built with the same call and
# the same df as the graph G created earlier; only the node_1 / node_2
# columns are used as edge endpoints.
G_data = nx.from_pandas_edgelist(df, "node_1", "node_2", create_using=nx.Graph())

In [11]:
# Feature extraction: learn a 100-dimensional embedding for every node of G_data.
# Generate the random walks: num_walks=10 walks per node, walk_length=8 nodes each.
node2vec = Node2Vec(G_data, dimensions=100, walk_length=8, num_walks=10)

# Train the embedding model on those walks. window and min_count are passed on
# to the underlying word2vec training; min_count=1 keeps nodes that appear only once.
n2w_model = node2vec.fit(window=7, min_count=1)


Computing transition probabilities: 100%|██████████| 17553/17553 [00:05<00:00, 3027.13it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:52<00:00,  5.25s/it]


In [12]:
# Pair feature = element-wise sum of the two endpoint embeddings.
# Vectors are read through the model's `.wv` keyed vectors: indexing the model
# object directly (n2w_model[key]) is deprecated in gensim 3.x and no longer
# supported in gensim 4.
node_vectors = n2w_model.wv
x = [node_vectors[str(i)] + node_vectors[str(j)] for i, j in zip(data['node_1'], data['node_2'])]

  """Entry point for launching an IPython kernel.


In [13]:
# Training labels (the 0/1 'link' column) and the matching feature matrix,
# one row per pair in `data`.
ytrain = data['link']
xtrain = np.asarray(x)

In [14]:
# Create and train the link classifier.
# class_weight="balanced" makes scikit-learn weight each class inversely to its
# frequency in ytrain, compensating for the unequal number of 0 and 1 rows.
lr = LogisticRegression(class_weight="balanced")

lr.fit(xtrain, ytrain)




LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Keep a random 50,000-row subset of the test rows (replaces the full list).
tests = random.sample(tests, 50000)

# One list per CSV field: first endpoint, second endpoint, link label.
node_list_1 = []
node_list_2 = []
node_list_3 = []

for row in tqdm(tests):
    fields = row.split(',')  # split once, then pick the first three fields
    node_list_1.append(fields[0])
    node_list_2.append(fields[1])
    node_list_3.append(fields[2])

# Frame of test pairs.
test_df = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})
# test_data is the same DataFrame object as test_df (not a copy), so the
# 'link' column added here is visible through both names. Labels are still
# the raw strings read from the file at this point.
test_data = test_df
test_data['link'] = node_list_3


100%|██████████| 50000/50000 [00:00<00:00, 675711.59it/s]


In [16]:
# Graph over the sampled test pairs: every row of test_df becomes an edge
# between its node_1 and node_2; the 'link' column is not consulted.
# NOTE(review): rows whose label is 0 would therefore also be added as edges --
# confirm this is intended before trusting the test metrics.
T_data = nx.from_pandas_edgelist(test_df, "node_1", "node_2", create_using=nx.Graph())

In [17]:
# Feature extraction for the test pairs: learn embeddings on the test graph.
# Generate the random walks with the same settings as for the training graph.
# NOTE(review): `node2vec` and `n2w_model` are reassigned here, replacing the
# model trained on G_data. lr was fit on features from that earlier model;
# embeddings from a separately trained model are presumably not comparable
# with them -- confirm this is intended.
node2vec = Node2Vec(T_data, dimensions=100, walk_length=8, num_walks=10)

n2w_model = node2vec.fit(window=7, min_count=1)



Computing transition probabilities: 100%|██████████| 62287/62287 [00:25<00:00, 2397.73it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [04:20<00:00, 26.09s/it]


In [18]:
# Pair feature = element-wise sum of the two endpoint embeddings.
# Vectors are read through the model's `.wv` keyed vectors: indexing the model
# object directly (n2w_model[key]) is deprecated in gensim 3.x and no longer
# supported in gensim 4.
# NOTE(review): n2w_model at this point is whichever model was fit most
# recently (the one trained on the test graph), not the one used for xtrain --
# confirm this is intended.
test_node_vectors = n2w_model.wv
x_test = [test_node_vectors[str(i)] + test_node_vectors[str(j)] for i, j in zip(test_data['node_1'], test_data['node_2'])]

  """Entry point for launching an IPython kernel.


In [19]:
# Test labels converted from the stored strings to ints, and the matching
# feature matrix (one row per test pair).
ytest = [int(label) for label in test_data['link']]
xtest = np.asarray(x_test)

In [20]:
# Predict class probabilities for the test pairs. predict_proba returns one
# column per class; column 1 is the one thresholded afterwards.
predictions = lr.predict_proba(xtest)

In [21]:
# Turn the column-1 probabilities into hard labels:
# strictly above 0.5 -> 1, otherwise 0.
rounded_predictions = [1 if prob > 0.5 else 0 for prob in predictions[:, 1]]


In [22]:
# Recall on the test pairs. average='macro' computes recall per class and
# takes their unweighted mean, so both classes count equally.
recall_score(ytest, rounded_predictions, average='macro')

0.5198271291519619

In [23]:
# Precision on the test pairs. average='macro' computes precision per class
# and takes their unweighted mean, so both classes count equally.
precision_score(ytest, rounded_predictions, average='macro')

0.8978167898464231

In [24]:
# F1 on the test pairs. average='macro' computes F1 per class and takes
# their unweighted mean, so both classes count equally.
f1_score(ytest, rounded_predictions, average='macro')

0.4944454390404074