In [2]:
import snap

# Create a directed graph
G = snap.TNGraph.New()

with open('processed_post_crosslinks_info.txt', 'r') as file:
    data = file.read()

# Split the data into lines
lines = data.strip().split('\n')

# Mapping between subreddit names and node IDs
subreddit_to_id = {}

# Add nodes and edges to the graph
for line in lines:
    parts = line.split('\t')  # Change the separator to tab
    source_subreddit, target_subreddit = parts[0], parts[1]

    # Add nodes if they don't exist
    if source_subreddit not in subreddit_to_id:
        source_id = G.AddNode()
        subreddit_to_id[source_subreddit] = source_id
    else:
        source_id = subreddit_to_id[source_subreddit]

    if target_subreddit not in subreddit_to_id:
        target_id = G.AddNode()
        subreddit_to_id[target_subreddit] = target_id
    else:
        target_id = subreddit_to_id[target_subreddit]

    # Add a directed edge from source to target subreddit
    G.AddEdge(source_id, target_id)

# Print the total number of nodes and edges
print("Total Nodes:", G.GetNodes())
print("Total Edges:", G.GetEdges())


Total Nodes: 35776
Total Edges: 137821


In [30]:
import snap
import re

# ENCRYPT COZ POST ID CAN ONLY BE INT

def convert_to_integer(post_id):
    base = 36  # 10 digits + 26 letters

    # Remove leading and trailing single quotes if present
    post_id = post_id.strip("'")

    # Convert each character to its corresponding integer value
    int_values = [int(c, base) for c in post_id]

    # Combine the integer values to get a unique integer for the whole string
    result = 0
    for value in int_values:
        result = result * base + value

    return result

# Create a graph
G = snap.TNEANet.New()

# Read data from file
with open('formatted_data_file.txt', 'r') as file:
    data = file.read()

# Split the data into lines
lines = data.strip().split('\n')

# Mapping between community names and their node IDs
community_to_id = {}

# Add nodes and edges to the graph
for line in lines:
    # Split the line using either tab or space as the separator
    parts = re.split(r'\t|\s+', line)  

    # Extract values into variables
    source_community = parts[0]
    target_community = parts[1]
    post_id_source = parts[2]
    timestamp_source = parts[3] + " " + parts[4]  # Combine date and time
    user = parts[5]
    post_id_target = parts[6]
    timestamp_target = parts[7] + " " + parts[8]  # Combine date and time

    # Add nodes if they don't exist
    if source_community not in community_to_id:
        source_community_id = G.AddNode()
        community_to_id[source_community] = source_community_id

    if target_community not in community_to_id:
        target_community_id = G.AddNode()
        community_to_id[target_community] = target_community_id

    # Convert alphanumeric post IDs to integers
    post_id_source_int = convert_to_integer(post_id_source)
    post_id_target_int = convert_to_integer(post_id_target)

    # Add edge
    edge_id = G.AddEdge(source_community_id, target_community_id)

    # Print original post IDs
    print("Original source post ID:", post_id_source)
    print("Original target post ID:", post_id_target)

    # Add attributes for source and target post IDs
    G.AddIntAttrDatE(edge_id, post_id_source_int, "source_post_id")
    G.AddIntAttrDatE(edge_id, post_id_target_int, "target_post_id")

# Print the total number of nodes and edges
print("Total Nodes:", G.GetNodes())
print("Total Edges:", G.GetEdges())


Original source post ID: 1u4nrp
Original target post ID: 1u4lo2
Original source post ID: 1u4qkd
Original target post ID: 1u4muc
Original source post ID: 1u4qlz
Original target post ID: 1u4pss
Original source post ID: 1u4sjv
Original target post ID: 1u4mjo
Original source post ID: 1u4w5s
Original target post ID: 1u54ij
Original source post ID: 1u4w7b
Original target post ID: 1u4yal
Original source post ID: 1u4wfe
Original target post ID: 1u4sow
Original source post ID: 1u50po
Original target post ID: 1u4o4u
Original source post ID: 1u5ccu
Original target post ID: 1u5e1w
Original source post ID: 1u5ccu
Original target post ID: 1u5mot
Original source post ID: 1u5df2
Original target post ID: 1ujxr4
Original source post ID: 1u5iet
Original target post ID: 1u5ivk
Original source post ID: 1u5iet
Original target post ID: 1u5mot
Original source post ID: 1u5k33
Original target post ID: 1u5jsk
Original source post ID: 1u5olg
Original target post ID: 1uazr5
Original source post ID: 1u5q84
Original

ValueError: invalid literal for int() with base 36: '_'

In [26]:
# Read content from the existing file
with open('processed_post_crosslinks_info.txt', 'r') as file:
    lines = file.readlines()

# Process each line and join parts with a single space
formatted_lines = []
for line in lines:
    # Split by tabs or spaces (one or more)
    parts = re.split(r'\t|\s+', line.strip())
    
    # Remove empty strings from the list
    parts = [part for part in parts if part]
    
    # Check if part 6 starts with a digit
    if len(parts) > 6 and re.search(r'^\d', parts[6]):
        # Parts 5 and 6 represent user and post ID, keep them separate
        merged_user = parts[5]
    else:
        # Concatenate parts 5 and 6 with '0'
        merged_user = parts[5] + '0' + parts[6]
        # Replace parts 5 and 6 with the merged user
        parts[5:7] = [merged_user]
    
    # Join parts with a single space
    formatted_line = ' '.join(parts)
    
    # Append to the list
    formatted_lines.append(formatted_line)

# Write the modified content back to the file
with open('formatted_data_file.txt', 'w') as file:
    file.write('\n'.join(formatted_lines))
