In [1]:
pip install snap-stanford

Note: you may need to restart the kernel to use updated packages.




In [2]:
import snap
import pandas as pd
from datetime import datetime

In [3]:
file_path = "soc-redditHyperlinks-body.tsv"
df = pd.read_csv(file_path, sep='\t')

def preprocess_data(df):
    # Convert TIMESTAMP column to datetime format
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])

    # Create a new column for the year
    df['Year'] = df['TIMESTAMP'].dt.year

    return df

In [4]:
def preprocess_data(df):
    # Convert TIMESTAMP column to datetime format
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])

    # Create a new column for the year
    df['Year'] = df['TIMESTAMP'].dt.year

    return df

def create_graph(df, year):
    # Filter data for the specified year
    year_df = df[df['Year'] == year]

    # Create a directed graph
    G = snap.TNGraph.New()

    # Add nodes and edges to the graph
    for index, row in year_df.iterrows():
        source_node = row['SOURCE_SUBREDDIT']
        target_node = row['TARGET_SUBREDDIT']
        
        # Convert node IDs to integers. Hashfunction handles large integer values
        source_node_id = hash(source_node) % (2**31 - 1)
        target_node_id = hash(target_node) % (2**31 - 1)

      # Add nodes if not already present
        if not G.IsNode(int(source_node_id)):
            G.AddNode(int(source_node_id))
        if not G.IsNode(int(target_node_id)):
            G.AddNode(int(target_node_id))

        # Add directed edge
        G.AddEdge(int(source_node_id), int(target_node_id))

    return G

# Preprocess data
df = preprocess_data(df)

# Specify the range of years in your dataset
start_year = 2014
end_year = 2017


for year in range(start_year, end_year + 1):
    # Create a graph for the specified year
    graph = create_graph(df, year)
    
    # Save the graph to a file (optional)
    snap.SaveEdgeList(graph, f"graph_{year}.txt", "Graph for Year " + str(year))

In [5]:
# PRUNING - removing nodes with low out degrees

import snap
import numpy as np

# Function to prune the graph based on degree distribution
def prune_graph(graph, percentile):
    # Get the degrees of all nodes in the graph
    degrees = [node.GetOutDeg() for node in graph.Nodes()]
    
    # Calculate the threshold based on the specified percentile
    threshold = np.percentile(degrees, percentile)
    
    # Identify nodes to remove based on the threshold
    nodes_to_remove = [node.GetId() for node in graph.Nodes() if node.GetOutDeg() < threshold]
    
    # Remove identified nodes
    for node_id in nodes_to_remove:
        graph.DelNode(node_id)
    
    return graph

# Function to load graph from edge list text file

def load_graph_from_edge_list(file_path):
    # Create an empty directed graph
    graph = snap.TNGraph.New()
    
    # Load edges from the text file
    with open(file_path, 'r') as file:
        for line in file:
            
            if line.startswith('#'):
                continue
            
            source_node, target_node = map(int, line.strip().split('\t'))
            
            # Add nodes if not already present
            if not graph.IsNode(source_node):
                graph.AddNode(source_node)
            if not graph.IsNode(target_node):
                graph.AddNode(target_node)
            
            # Add directed edge
            graph.AddEdge(source_node, target_node)
    
    return graph


# Prune graphs for each year
start_year = 2014
end_year = 2017

# Prune graphs for each year and save pruned graphs
pruned_graphs = {}
for year in range(start_year, end_year + 1):
    # Load the graph from the edge list file
    file_path = f"graph_{year}.txt"
    graph = load_graph_from_edge_list(file_path)
    
    # Prune the graph - 5th percentile threshold
    pruned_graph = prune_graph(graph, 5)  
    
    # Save the pruned graph to a file (optional)
    pruned_file_path = f"pruned_graph_{year}.txt"
    snap.SaveEdgeList(pruned_graph, pruned_file_path, f"Pruned Graph for Year {year}")
    
    pruned_graphs[year] = pruned_graph

In [6]:
import snap

# Load pruned graph
def load_pruned_graph_from_edge_list(file_path):
    graph = snap.TNGraph.New()
    all_node_ids = set()

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue

            source_node, target_node = map(int, line.strip().split('\t'))

            if not graph.IsNode(source_node):
                graph.AddNode(source_node)
                all_node_ids.add(source_node)
            if not graph.IsNode(target_node):
                graph.AddNode(target_node)
                all_node_ids.add(target_node)

            graph.AddEdge(source_node, target_node)

    return graph, list(all_node_ids)

# Find overlaps between communities across years for a given node
def find_community_overlaps(communities, node_id):
    community_memberships = {year: set() for year in communities.keys()}

    for year, year_communities in communities.items():
        for community_idx, nodes in year_communities.items():
            if node_id in nodes:
                community_memberships[year].add(community_idx)

    return community_memberships

# Load communities for each year
def load_communities(start_year, end_year):
    communities = {}

    for year in range(start_year, end_year + 1):
        file_path = f"pruned_graph_{year}.txt"
        graph, all_node_ids = load_pruned_graph_from_edge_list(file_path)
        
        communities[year] = algorithm_for_dynamic_detection(graph)

    return communities, all_node_ids

# Dynamic community detection algorithm
def algorithm_for_dynamic_detection(graph, num_iterations=10):
    timestamp_communities = {}

    for iteration in range(num_iterations):
        for edge in graph.Edges():
            u, v, timestamp = edge.GetSrcNId(), edge.GetDstNId(), iteration

            u_community = timestamp_communities.setdefault(timestamp, {}).get(u, u)
            v_community = timestamp_communities.setdefault(timestamp, {}).get(v, v)

            if u_community != v_community:
                u_community_nodes = [node for node, comm in timestamp_communities[timestamp].items() if comm == u_community]
                v_community_nodes = [node for node, comm in timestamp_communities[timestamp].items() if comm == v_community]

                merged_community = min(u_community, v_community)
                for node in u_community_nodes + v_community_nodes:
                    timestamp_communities[timestamp][node] = merged_community

    return timestamp_communities

# Specify the range of years in your dataset
start_year = 2014
end_year = 2017

# Load communities for each year
communities, all_node_ids = load_communities(start_year, end_year)

# Iterate over all nodes and find community overlaps
node_community_overlaps = {}

for node_id in all_node_ids:
    community_overlaps = find_community_overlaps(communities, node_id)
    node_community_overlaps[node_id] = community_overlaps

# Print or process the results
for node_id, overlaps in node_community_overlaps.items():
    for year, communities in overlaps.items():
        print(f"Node {node_id} in Year {year} belongs to communities: {communities}")

# Optionally, you can check for overlaps between communities across years
for community_idx in range(max_community_index):  # Replace with the actual maximum community index
    community_overlap_years = []

    for year, year_communities in communities.items():
        if community_idx in year_communities:
            community_overlap_years.append(year)

    if len(community_overlap_years) > 1:
        print(f"Community {community_idx} overlaps in years: {community_overlap_years}")


Node 2025750531 in Year 2014 belongs to communities: set()
Node 2025750531 in Year 2015 belongs to communities: set()
Node 2025750531 in Year 2016 belongs to communities: set()
Node 2025750531 in Year 2017 belongs to communities: set()
Node 1668546565 in Year 2014 belongs to communities: set()
Node 1668546565 in Year 2015 belongs to communities: set()
Node 1668546565 in Year 2016 belongs to communities: set()
Node 1668546565 in Year 2017 belongs to communities: set()
Node 1832779788 in Year 2014 belongs to communities: set()
Node 1832779788 in Year 2015 belongs to communities: set()
Node 1832779788 in Year 2016 belongs to communities: set()
Node 1832779788 in Year 2017 belongs to communities: set()
Node 1331822620 in Year 2014 belongs to communities: set()
Node 1331822620 in Year 2015 belongs to communities: set()
Node 1331822620 in Year 2016 belongs to communities: set()
Node 1331822620 in Year 2017 belongs to communities: set()
Node 1488224285 in Year 2014 belongs to communities: set

NameError: name 'max_community_index' is not defined

In [18]:
# DOESN't DETECT COMMUNITIES

import snap
import networkx as nx
import community

# Load pruned graph
def load_pruned_graph_from_edge_list(file_path):
    graph = nx.Graph()  # Use undirected graph
    all_node_ids = set()

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue

            source_node, target_node = map(int, line.strip().split('\t'))

            graph.add_edge(source_node, target_node)
            all_node_ids.update([source_node, target_node])

    return graph, list(all_node_ids)

# Load communities for each year using Louvain
def load_communities_louvain(graph):
    partition = community.best_partition(graph)
    communities = {}
    for node, comm_id in partition.items():
        if comm_id not in communities:
            communities[comm_id] = [node]
        else:
            communities[comm_id].append(node)
    return communities

# Load communities for each year
def load_communities(start_year, end_year):
    all_communities = {}
    all_node_ids = set()

    for year in range(start_year, end_year + 1):
        file_path = f"pruned_graph_{year}.txt"
        graph, nodes = load_pruned_graph_from_edge_list(file_path)
        all_node_ids.update(nodes)

        year_communities = load_communities_louvain(graph)
        all_communities[year] = year_communities

    return all_communities, list(all_node_ids)

# Specify the range of years in your dataset
start_year = 2014
end_year = 2017

# Load communities for each year
communities, all_node_ids = load_communities(start_year, end_year)

# Print or process the results
for node_id, overlaps in node_community_overlaps.items():
    for year, communities in overlaps.items():
        print(f"Node {node_id} in Year {year} belongs to communities: {communities}")

# Optionally, you can check for overlaps between communities across years
for community_idx, year_communities in communities.items():
    community_overlap_years = [year for year, comm in year_communities.items() if community_idx in comm]
    if len(community_overlap_years) > 1:
        print(f"Community {community_idx} overlaps in years: {community_overlap_years}")


Node 2025750531 in Year 2014 belongs to communities: set()
Node 2025750531 in Year 2015 belongs to communities: set()
Node 2025750531 in Year 2016 belongs to communities: set()
Node 2025750531 in Year 2017 belongs to communities: set()
Node 1668546565 in Year 2014 belongs to communities: set()
Node 1668546565 in Year 2015 belongs to communities: set()
Node 1668546565 in Year 2016 belongs to communities: set()
Node 1668546565 in Year 2017 belongs to communities: set()
Node 1832779788 in Year 2014 belongs to communities: set()
Node 1832779788 in Year 2015 belongs to communities: set()
Node 1832779788 in Year 2016 belongs to communities: set()
Node 1832779788 in Year 2017 belongs to communities: set()
Node 1331822620 in Year 2014 belongs to communities: set()
Node 1331822620 in Year 2015 belongs to communities: set()
Node 1331822620 in Year 2016 belongs to communities: set()
Node 1331822620 in Year 2017 belongs to communities: set()
Node 1488224285 in Year 2014 belongs to communities: set

AttributeError: 'set' object has no attribute 'items'

In [8]:
pip install python-louvain

Note: you may need to restart the kernel to use updated packages.




In [9]:
pip install python-igraph

Note: you may need to restart the kernel to use updated packages.




In [10]:
pip install --upgrade python-igraph

Note: you may need to restart the kernel to use updated packages.




In [61]:
import snap
import networkx as nx
import community

# Load pruned graph
def load_pruned_graph_from_edge_list(file_path):
    graph = nx.Graph()  # Use undirected graph
    all_node_ids = set()

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue

            source_node, target_node = map(int, line.strip().split('\t'))

            graph.add_edge(source_node, target_node)
            all_node_ids.update([source_node, target_node])

    return graph, list(all_node_ids)

# Load communities for each year using Louvain
def load_communities_louvain(graph):
    partition = community.best_partition(graph)

    
    communities = {}
    for node, comm_id in partition.items():
        if comm_id not in communities:
            communities[comm_id] = [node]
        else:
            communities[comm_id].append(node)
   
    return communities

# Find overlaps between communities across years for a given node
def find_community_overlaps(communities, node_id):
    community_memberships = {year: set() for year in communities.keys()}

    for year, year_communities in communities.items():
        for community_idx, nodes in year_communities.items():
            if node_id in nodes:
                community_memberships[year].add(community_idx)

    return community_memberships

# Load communities for each year
def load_communities(start_year, end_year):
    all_communities = {}
    all_node_ids = set()

    for year in range(start_year, end_year + 1):
        file_path = f"pruned_graph_{year}.txt"
        graph, nodes = load_pruned_graph_from_edge_list(file_path)
        all_node_ids.update(nodes)

        year_communities = load_communities_louvain(graph)
        all_communities[year] = year_communities

    return all_communities, list(all_node_ids)

# Specify the range of years in your dataset
start_year = 2014
end_year = 2017

# Load communities for each year
communities, all_node_ids = load_communities(start_year, end_year)


# Write communities to a file
with open("communities.txt", "w") as communities_file:
    for year, year_communities in communities.items():
        communities_file.write(f"Year {year} communities:\n")
        for community_idx, nodes in year_communities.items():
            communities_file.write(f"Community {community_idx}: {nodes}\n")
        communities_file.write("\n")

# Iterate over all nodes and find community overlaps
node_community_overlaps = {}

for node_id in all_node_ids:
    community_overlaps = find_community_overlaps(communities, node_id)
    node_community_overlaps[node_id] = community_overlaps

with open("output.txt", "w") as output_file:
    for node_id, overlaps in node_community_overlaps.items():
        for year, communities in overlaps.items():
            output_file.write(f"Node {node_id} in Year {year} belongs to communities: {communities}\n")
            
            


<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
{2014: {0: [1212297454, 1026390623, 1090992663, 1490191372, 975418, 822493069, 908766903, 2101183457, 301887371, 373579974, 418122735, 917080128, 981791915, 1315996893, 1350451975, 1466043945, 1702009607, 1760589342, 1804759431, 1943917352, 2057052299, 2078529679, 2093174616, 114393849, 491408099, 1338253157, 1725499285, 2082430798, 1233450543, 1297371971, 1119761365, 1971982672, 77089342, 443943275, 586696866, 317554990, 371134309, 1256302431, 313772683, 340175629, 1041767100, 1085823663, 1114455925, 1428303979, 1579384009, 1645133596, 1679727124, 2127035586, 841983265, 1290025134, 2014699781, 728667361, 756678607, 1964417047, 320918810, 1866529143, 2115381466, 431031587, 1096307006, 1314262029, 1662712344, 2019502136, 1105509041, 1375442413, 1620985648, 1905710945, 278877991, 1264221821, 579408262, 262386635, 1545134526, 414892917, 479033400, 538326699, 543161494, 703931689, 886362847, 1328270258, 1448428331, 1479936127, 162

In [56]:
print(communities)


set()


In [58]:
# Find the maximum community index across all years
max_community_index = max(max(comm_set) if comm_set else 0 for year_communities in communities.values() for comm_set in year_communities.values())


# Optionally, you can check for overlaps between communities across years
with open("output.txt", "a") as output_file:  # Open the file in append mode
    for community_idx in range(max_community_index + 1):
        community_overlap_years = [
            year for year, year_communities in communities.items()
            if any(community_idx in comm_set for comm_set in year_communities.values())
        ]
        if len(community_overlap_years) > 1:
            output_file.write(f"Community {community_idx} overlaps in years: {community_overlap_years}\n")


AttributeError: 'set' object has no attribute 'values'

In [46]:
print(type(communities))

<class 'set'>
