In [1]:
import pandas as pd
import networkx as nx
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import random

In [2]:
# Folder containing the Parquet edge-list files; the relative path is resolved
# against the current working directory when it is made absolute later on.
folder_path = './data'

In [3]:
# Resolve the configured data folder to an absolute path before scanning it.
relative_path_to_data = folder_path
data_directory = os.path.abspath(relative_path_to_data)

# Loaded tables, keyed by file stem (the file name without its extension).
dataframes = {}

# sorted() makes the load order (and therefore the printed log and the key
# order of `dataframes`) independent of the filesystem's directory ordering.
for file_path in sorted(Path(data_directory).glob("*.parquet")):
    file_stem = file_path.stem
    print(f"Reading file: {file_stem}")

    df = pd.read_parquet(file_path)
    dataframes[file_stem] = df

# Fail here with a clear message instead of with a KeyError in a later cell
# when the folder is missing or contains no Parquet files.
if not dataframes:
    raise FileNotFoundError(f"No .parquet files found in {data_directory}")

# Show which tables are available (one key per loaded file stem).
print(dataframes.keys())

Reading file: df_edge_list_undirected_users_votes_to_postings_net
Reading file: df_edge_list_directed_users_votes_to_postings_net
dict_keys(['df_edge_list_undirected_users_votes_to_postings_net', 'df_edge_list_directed_users_votes_to_postings_net'])


In [4]:
# Pick the two vote edge lists out of the loaded tables by file stem.
# A KeyError here means the corresponding Parquet file was not loaded.
undirected_votes = dataframes["df_edge_list_undirected_users_votes_to_postings_net"]
directed_votes = dataframes["df_edge_list_directed_users_votes_to_postings_net"]

In [5]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
# The headings name the votes tables, which is what these frames hold.
print("Undirected Votes DataFrame Info:")
undirected_votes.info()
print()

print("Directed Votes DataFrame Info:")
directed_votes.info()
print()

Undirected Replies DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5535637 entries, 0 to 5535636
Data columns (total 3 columns):
 #   Column                       Dtype
---  ------                       -----
 0   ID_CommunityIdentity_min     int64
 1   ID_CommunityIdentity_max     int64
 2   count_votes_to_postings_net  int64
dtypes: int64(3)
memory usage: 126.7 MB
None 

Directed Replies DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6105250 entries, 0 to 6105249
Data columns (total 3 columns):
 #   Column                       Dtype
---  ------                       -----
 0   ID_CommunityIdentity_Source  int64
 1   ID_CommunityIdentity_Target  int64
 2   count_votes_to_postings_net  int64
dtypes: int64(3)
memory usage: 139.7 MB
None 



Creating a graph

In [6]:
# Directed vote graph: one edge per (Source, Target) row, with the
# count_votes_to_postings_net column kept as an edge attribute.
G_directed = nx.from_pandas_edgelist(
    directed_votes,
    "ID_CommunityIdentity_Source",
    "ID_CommunityIdentity_Target",
    edge_attr="count_votes_to_postings_net",
    create_using=nx.DiGraph(),
)

# Undirected counterpart built from the (min, max) identity columns, carrying
# the same count column as an edge attribute.
G_undirected = nx.from_pandas_edgelist(
    undirected_votes,
    "ID_CommunityIdentity_min",
    "ID_CommunityIdentity_max",
    edge_attr="count_votes_to_postings_net",
    create_using=nx.Graph(),
)

In [1]:
# Step 1: Materialise the directed graph's edges as a list of (source, target)
# pairs so they can be passed to train_test_split below.
edges = list(G_directed.edges())

def generate_negative_edges(G, num_samples, seed=None):
    """Sample distinct ordered node pairs that are not connected in ``G``.

    A pair ``(u, v)`` is accepted when ``u`` and ``v`` are two different nodes
    and ``G`` has no edge between them in either direction. Each ordered pair
    is returned at most once; ``(u, v)`` and ``(v, u)`` count as different
    pairs.

    Parameters
    ----------
    G : graph-like
        Object exposing ``nodes()`` and ``has_edge(u, v)``.
    num_samples : int
        Number of negative pairs to draw. Values <= 0 yield an empty list.
    seed : int, optional
        When given, a private ``random.Random(seed)`` drives the sampling so
        the result is reproducible. When omitted, the module-level ``random``
        state is used, so a preceding ``random.seed(...)`` still applies.

    Returns
    -------
    list of tuple
        ``num_samples`` pairs ``(u, v)``, in the order they were drawn.

    Raises
    ------
    ValueError
        If more pairs are requested than there are ordered pairs of distinct
        nodes (this includes graphs with fewer than two nodes).

    Notes
    -----
    The feasibility check ignores existing edges, so on a graph that is almost
    complete a request can still exceed the number of true non-edges and keep
    the rejection loop running; the function is meant for sparse graphs.
    """
    if num_samples <= 0:
        return []

    nodes = list(G.nodes())
    max_pairs = len(nodes) * (len(nodes) - 1)
    if num_samples > max_pairs:
        raise ValueError(
            f"Cannot sample {num_samples} negative edges from a graph with "
            f"{len(nodes)} nodes (at most {max_pairs} ordered pairs exist)"
        )

    rng = random if seed is None else random.Random(seed)

    negative_edges = []
    # Set membership is O(1); checking the result list made sampling quadratic.
    seen = set()
    while len(negative_edges) < num_samples:
        u, v = rng.sample(nodes, 2)
        pair = (u, v)
        if pair in seen or G.has_edge(u, v) or G.has_edge(v, u):
            continue
        seen.add(pair)
        negative_edges.append(pair)
    return negative_edges

# Step 2: Train-test split of the positive edges.
# 30% of all edges are held out for testing; 10% of the remainder is held out
# for validation. random_state pins both shuffles.
train_edges, test_edges = train_test_split(edges, test_size=0.3, random_state=42)
train_edges, val_edges = train_test_split(train_edges, test_size=0.1, random_state=42)

# Step 3: Generate negative edges (non-edges).
# Negatives are drawn by rejection sampling against the full graph. Enumerating
# every possible node pair up front is quadratic in the number of nodes and is
# not needed for sampling.
nodes = list(G_directed.nodes())
random.seed(42)
negative_edges = generate_negative_edges(G_directed, len(val_edges) + len(test_edges))

# Split one de-duplicated sample so that validation and test negatives cannot
# share a pair (two independent draws could overlap).
val_edges_false = negative_edges[:len(val_edges)]
test_edges_false = negative_edges[len(val_edges):]
assert len(val_edges_false) == len(val_edges)
assert len(test_edges_false) == len(test_edges)

# Step 4: Create the training graph.
# Every node of the original graph is added first: nodes that only occur in
# held-out or negative pairs would otherwise be missing from G_train.
G_train = nx.DiGraph()
G_train.add_nodes_from(nodes)
G_train.add_edges_from(train_edges)

# Print summary
print(f"Total nodes: {G_directed.number_of_nodes()}")
print(f"Total edges in original graph: {len(edges)}")
print(f"Training edges: {len(train_edges)}")
print(f"Validation edges (positive): {len(val_edges)}, Validation edges (negative): {len(val_edges_false)}")
print(f"Test edges (positive): {len(test_edges)}, Test edges (negative): {len(test_edges_false)}")


NameError: name 'G_directed' is not defined

In [None]:
def _katz_scores(graph, edges, alpha, max_path_length):
    """Return truncated Katz scores ``{(u, v): score}`` for the given pairs.

    The score of ``(u, v)`` is the sum over walk lengths ``l`` from 1 to
    ``max_path_length`` of ``alpha ** l`` times the number of walks of length
    ``l`` from ``u`` to ``v``, following ``graph[node]`` adjacency. Pairs with
    a node missing from ``graph``, or with no walk within the length limit,
    are left out of the result.
    """
    targets_by_source = {}
    for u, v in edges:
        if u in graph and v in graph:
            targets_by_source.setdefault(u, set()).add(v)

    scores = {}
    for source, targets in targets_by_source.items():
        # walk_counts[node] = number of walks of the current length source -> node
        walk_counts = {source: 1}
        weight = 1.0
        for _ in range(max_path_length):
            next_counts = {}
            for node, count in walk_counts.items():
                for neighbor in graph[node]:
                    next_counts[neighbor] = next_counts.get(neighbor, 0) + count
            if not next_counts:
                break  # no walk can be extended any further
            walk_counts = next_counts
            weight *= alpha
            for target in targets:
                count = walk_counts.get(target)
                if count:
                    key = (source, target)
                    scores[key] = scores.get(key, 0.0) + weight * count
    return scores


def compute_scores(graph, edges, method, alpha=0.005, max_path_length=3):
    """Score candidate node pairs with a link-prediction heuristic.

    Parameters
    ----------
    graph : networkx graph
        Graph the scores are computed on.
    edges : iterable of (u, v)
        Candidate pairs to score.
    method : str
        One of ``'common_neighbors'``, ``'jaccard'``, ``'katz'``, ``'simrank'``.
    alpha : float, optional
        Attenuation factor of the Katz index (only used by ``'katz'``).
    max_path_length : int, optional
        Longest walk length included in the Katz sum (only used by ``'katz'``).

    Returns
    -------
    list of (u, v, score)
        One triple per input pair, in input order. A pair whose nodes are not
        both present in ``graph`` gets score 0.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    edges = list(edges)  # iterated more than once below

    if method in ('common_neighbors', 'jaccard'):
        # networkx implements both measures for undirected graphs only, so a
        # directed graph is scored through an undirected view of itself.
        undirected = graph.to_undirected(as_view=True) if graph.is_directed() else graph
        known = [(u, v) for u, v in edges if u in undirected and v in undirected]
        if method == 'common_neighbors':
            found = {
                (u, v): sum(1 for _ in nx.common_neighbors(undirected, u, v))
                for u, v in known
            }
        else:
            found = {(u, v): p for u, v, p in nx.jaccard_coefficient(undirected, known)}
        return [(u, v, found.get((u, v), 0)) for u, v in edges]
    elif method == 'katz':
        # networkx has no pairwise Katz function, so a truncated Katz index is
        # computed here; alpha and max_path_length need to be tuned.
        katz_dict = _katz_scores(graph, edges, alpha, max_path_length)
        return [(u, v, katz_dict.get((u, v), 0)) for u, v in edges]
    elif method == 'simrank':
        # Computes similarities for all node pairs of the graph in one call.
        simrank_dict = nx.simrank_similarity(graph)
        return [(u, v, simrank_dict.get(u, {}).get(v, 0)) for u, v in edges]
    else:
        raise ValueError("Unknown method")

def calculate_auc(graph, edges_pos, edges_neg, method):
    """Return the ROC AUC of one scoring method on positive vs. negative pairs.

    Both edge collections are scored with ``compute_scores``; positives are
    labelled 1 and negatives 0 before the scores are handed to
    ``roc_auc_score``.
    """
    positive_triples = compute_scores(graph, edges_pos, method)
    negative_triples = compute_scores(graph, edges_neg, method)

    # Labels and scores are laid out positives first, then negatives.
    y_true = [1] * len(positive_triples) + [0] * len(negative_triples)
    y_score = [score for _, _, score in positive_triples]
    y_score.extend(score for _, _, score in negative_triples)

    return roc_auc_score(y_true, y_score)

# Evaluate every heuristic on the validation split of the training graph.
methods = ['common_neighbors', 'jaccard', 'katz', 'simrank']
results = {}
for method in methods:
    auc_val = calculate_auc(G_train, val_edges, val_edges_false, method)
    results.update({method: auc_val})

# Report one validation AUC per method, in evaluation order.
for method in results:
    auc_score = results[method]
    print(f"{method} Validation AUC: {auc_score}")