In [1]:
pip install snap-stanford

Collecting snap-stanford
  Downloading snap_stanford-6.0.0-cp38-cp38-win_amd64.whl (9.2 MB)
     ---------------------------------------- 9.2/9.2 MB 1.4 MB/s eta 0:00:00
Installing collected packages: snap-stanford
Successfully installed snap-stanford-6.0.0
Note: you may need to restart the kernel to use updated packages.




In [2]:
import snap
import pandas as pd
from datetime import datetime

In [6]:
# Read the tab-separated hyperlinks file into a single DataFrame.
file_path = "soc-redditHyperlinks-body.tsv"
df = pd.read_csv(file_path, delimiter="\t")

def preprocess_data(df):
    """Return a copy of ``df`` with parsed timestamps and a ``Year`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``TIMESTAMP`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``TIMESTAMP`` holds datetimes and ``Year`` holds
        the calendar year of each timestamp. The input frame is left
        untouched, so the function can be re-run on the raw data safely.

    Raises
    ------
    KeyError
        If ``df`` has no ``TIMESTAMP`` column.
    """
    timestamps = pd.to_datetime(df["TIMESTAMP"])

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(TIMESTAMP=timestamps, Year=timestamps.dt.year)

In [18]:
def preprocess_data(df):
    """Return a copy of ``df`` with parsed timestamps and a ``Year`` column.

    NOTE(review): the data-loading cell defines a function with this same
    name; this later definition is the one in effect when the cell below
    calls it. Keep the two in sync, or delete one of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``TIMESTAMP`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``TIMESTAMP`` holds datetimes and ``Year`` holds
        the calendar year of each timestamp. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``TIMESTAMP`` column.
    """
    parsed = pd.to_datetime(df["TIMESTAMP"])

    # Build a new frame rather than mutating the argument in place.
    return df.assign(TIMESTAMP=parsed, Year=parsed.dt.year)

def create_graph(df, year):
    # Filter data for the specified year
    year_df = df[df['Year'] == year]

    # Create a directed graph
    G = snap.TNGraph.New()

    # Add nodes and edges to the graph
    for index, row in year_df.iterrows():
        source_node = row['SOURCE_SUBREDDIT']
        target_node = row['TARGET_SUBREDDIT']
        
        # Convert node IDs to integers. Hashfunction handles large integer values
        source_node_id = hash(source_node) % (2**31 - 1)
        target_node_id = hash(target_node) % (2**31 - 1)

      # Add nodes if not already present
        if not G.IsNode(int(source_node_id)):
            G.AddNode(int(source_node_id))
        if not G.IsNode(int(target_node_id)):
            G.AddNode(int(target_node_id))

        # Add directed edge
        G.AddEdge(int(source_node_id), int(target_node_id))

    return G

# Parse timestamps and attach the Year column that create_graph filters on
df = preprocess_data(df)

# Inclusive first and last year to build graphs for
start_year, end_year = 2014, 2017

for year in range(start_year, end_year + 1):
    # Directed graph restricted to this year's rows
    graph = create_graph(df, year)

    # Write the edge list to graph_<year>.txt; the last argument is the
    # description string handed to SaveEdgeList
    snap.SaveEdgeList(
        graph,
        f"graph_{year}.txt",
        "Graph for Year " + str(year),
    )

In [22]:
# PRUNING

import snap
import numpy as np

# Function to prune the graph based on degree distribution
def prune_graph(graph, percentile):
    """Remove nodes whose out-degree is below a percentile of all out-degrees.

    Parameters
    ----------
    graph
        Graph object exposing ``Nodes()`` and ``DelNode(node_id)``; each node
        must provide ``GetId()`` and ``GetOutDeg()``.
    percentile : float
        Percentile (0-100) of the out-degree distribution used as the cut-off.

    Returns
    -------
    The same ``graph`` object, modified in place. Nodes with out-degree
    strictly less than the cut-off are deleted. An empty graph is returned
    unchanged.
    """
    # Take one snapshot of (id, out-degree) up front: the cut-off and the
    # removal decisions are both based on the degrees before any deletion.
    out_degrees = [(node.GetId(), node.GetOutDeg()) for node in graph.Nodes()]

    # np.percentile has no meaningful answer for an empty sample (it errors
    # or yields NaN depending on the numpy version), so stop early.
    if not out_degrees:
        return graph

    threshold = np.percentile([degree for _, degree in out_degrees], percentile)

    for node_id, degree in out_degrees:
        if degree < threshold:
            graph.DelNode(node_id)

    return graph

# Function to load graph from edge list text file

def load_graph_from_edge_list(file_path):
    """Read a text edge list into a new directed snap graph.

    Each data line must hold exactly two integer node ids (source, then
    target) separated by whitespace. Blank lines and lines whose first
    non-blank character is ``#`` are ignored.

    Parameters
    ----------
    file_path : str
        Path of the edge-list text file.

    Returns
    -------
    snap.TNGraph
        Graph containing every node id mentioned in the file and one
        ``AddEdge`` call per data line.

    Raises
    ------
    ValueError
        If a data line does not consist of exactly two integers; the message
        names the file and line number.
    OSError
        If the file cannot be opened.
    """
    graph = snap.TNGraph.New()

    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            record = line.strip()

            # Skip comment headers and blank / whitespace-only lines.
            if not record or record.startswith('#'):
                continue

            try:
                # split() accepts tabs as well as spaces; unpacking enforces
                # that there are exactly two fields.
                source_node, target_node = map(int, record.split())
            except ValueError as exc:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected two integer "
                    f"node ids, got {record!r}"
                ) from exc

            # Both endpoints must exist before the edge is added.
            if not graph.IsNode(source_node):
                graph.AddNode(source_node)
            if not graph.IsNode(target_node):
                graph.AddNode(target_node)

            graph.AddEdge(source_node, target_node)

    return graph


# Reload every saved yearly graph, prune it, and collect the results by year
pruned_graphs = dict()
for year in range(start_year, end_year + 1):
    # Edge-list file expected for this year
    file_path = f"graph_{year}.txt"
    graph = load_graph_from_edge_list(file_path)

    # 5 is the percentile passed to prune_graph as the cut-off
    pruned_graphs[year] = pruned_graph = prune_graph(graph, 5)