Task: Predict whether a user will vote on another user's postings

Methods: Similarity-based techniques (common neighbors, Jaccard coefficient, Katz centrality, SimRank)

**Considered datasets:**
* Network of Users Who Vote on Postings (undirected) - df_edge_list_undirected_users_votes_to_postings_net

**Additional datasets:**
* df_Postings_filtered_net
* df_Votes_filtered_net 

### Importing libraries

In [1]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import scipy
import random
import pickle
import itertools
import numpy as np

### Opening the graph and subsetting it

In [2]:
# Restore the pickled graph written by the dataset-preparation step.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("graph.pkl", "rb") as graph_file:
    G_undirected = pickle.load(graph_file)

# Sanity check: report the size of the graph that was just loaded
print(f"Graph: {G_undirected.number_of_nodes()} nodes, {G_undirected.number_of_edges()} edges")

Graph: 10964 nodes, 529434 edges


In [3]:
# Work on a smaller graph: the subgraph restricted to the first 1000 nodes in
# the graph's node iteration order (a deterministic slice, not a random draw).
first_nodes = list(itertools.islice(G_undirected.nodes, 1000))
subgraph = G_undirected.subgraph(first_nodes)
graph = subgraph

### Define helper functions

In [4]:
# Function to generate non-existent edges for link prediction
def non_edges(graph):
    """Return every unordered node pair of ``graph`` that has no edge.

    Pairs are enumerated with ``itertools.combinations`` over ``graph.nodes``,
    so each pair appears once, as a ``(u, v)`` tuple, in that order.
    """
    missing_pairs = []
    for pair in itertools.combinations(graph.nodes, 2):
        if graph.has_edge(*pair):
            continue
        missing_pairs.append(pair)
    return missing_pairs

# Function to calculate similarity scores
def calculate_similarity_scores(graph, edges, method):
    """Score each node pair in ``edges`` with the chosen similarity measure.

    Parameters
    ----------
    graph : networkx graph
        Graph on which the similarity measure is evaluated.
    edges : iterable of (u, v)
        Node pairs to score.
    method : str
        One of ``'common_neighbors'``, ``'jaccard'``, ``'katz'`` or
        ``'simrank'``.

    Returns
    -------
    list
        One score per pair, in the order of ``edges``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. (Falling through
        and returning ``None`` would only fail later, far from the cause.)
    """
    if method == 'common_neighbors':
        # Count lazily; no need to materialise the neighbour list.
        return [sum(1 for _ in nx.common_neighbors(graph, u, v)) for u, v in edges]
    if method == 'jaccard':
        return [p for _, _, p in nx.jaccard_coefficient(graph, edges)]
    if method == 'katz':
        # Pair score is the product of the two endpoints' Katz centralities.
        centrality = nx.katz_centrality_numpy(graph, alpha=0.005)
        return [centrality[u] * centrality[v] for u, v in edges]
    if method == 'simrank':
        # All-pairs SimRank is computed once, then looked up per pair.
        sim = nx.simrank_similarity(graph)
        return [sim[u][v] for u, v in edges]
    raise ValueError(
        f"Unknown similarity method {method!r}; expected one of "
        "'common_neighbors', 'jaccard', 'katz', 'simrank'"
    )

### Data preparation (train set, test set)

In [5]:
# Generate positive and negative examples
existing_edges = list(graph.edges)
nonexistent_edges = non_edges(graph)
# Draw the negatives from a dedicated, seeded RNG so the sampled non-edges
# (and therefore every downstream metric) are reproducible between runs,
# in line with the fixed random_state used for the train/test split below.
negative_sampler = random.Random(42)
selected_nonexistent = negative_sampler.sample(nonexistent_edges, len(existing_edges))  # Balance the classes

# Create edge labels: 1 if edge exists, 0 otherwise
edges = existing_edges + selected_nonexistent
labels = [1] * len(existing_edges) + [0] * len(selected_nonexistent)

# Split data into train and test sets
edges_train, edges_test, labels_train, labels_test = train_test_split(edges, labels, test_size=0.3, random_state=42)


### Run all methods and evaluate them

In [6]:
methods = ['common_neighbors', 'jaccard', 'katz', 'simrank']
results = {}

# For every similarity measure: use the pair score as the single feature of a
# logistic regression, fit on the training pairs and evaluate on the test pairs.
# NOTE(review): both train and test scores are computed on `graph`, which still
# contains the positive test edges; consider scoring on a graph with those
# edges removed to rule out leakage into the features.
for method in methods:
    # One similarity score per pair, for both splits
    scores_train = calculate_similarity_scores(graph, edges_train, method)
    scores_test = calculate_similarity_scores(graph, edges_test, method)

    # Shape the scores as single-column feature matrices
    X_train = np.asarray(scores_train).reshape(-1, 1)
    X_test = np.asarray(scores_test).reshape(-1, 1)

    # Fit the classifier and take the probability of the positive class
    model = LogisticRegression().fit(X_train, labels_train)
    predictions = model.predict_proba(X_test)[:, 1]

    # Threshold-free (ROC-AUC) and thresholded-at-0.5 (accuracy) metrics
    roc_auc = roc_auc_score(labels_test, predictions)
    predicted_labels = (predictions > 0.5).astype(int)
    accuracy = accuracy_score(labels_test, predicted_labels)

    results[method] = {'ROC-AUC': roc_auc, 'Accuracy': accuracy}

for method, metrics in results.items():
    print(f"{method} ->  [ROC-AUC]: {metrics['ROC-AUC']}, [Accuracy]: {metrics['Accuracy']}")


common_neighbors ->  [ROC-AUC]: 0.8379363855755232, [Accuracy]: 0.7424709509129713
jaccard ->  [ROC-AUC]: 0.7886826580999341, [Accuracy]: 0.7155797960635523
katz ->  [ROC-AUC]: 0.8389445712348984, [Accuracy]: 0.6782072563433721
simrank ->  [ROC-AUC]: 0.5602767409057557, [Accuracy]: 0.583542802940479
