In [None]:
import json

from src.data import keep_only_text_posts, parse_telegram_message
from src.data_utils import print_md
from src.graph import (
    create_graph,
    filter_edges_by_threshold,
    find_similar_posts_pagerank,
    get_graph_plot,
    scale_edge_weights,
)
from src.processing import calculate_idf, preprocess_text

In [None]:
# Input JSON file; the loading cell reads its top-level "messages" list.
# NOTE(review): presumably a Telegram chat export — confirm the source.
INPUT_DATA = "data/result.json"

# Output locations for the filtered graph (GraphML) and the processed posts (JSON).
GRAPH_FILE_PATH = "data/filtered_graph.graphml"
POSTS_FILE_PATH = "data/posts.json"
POSTS_VIEW_FILE_PATH = "data/posts_view.json"

Data processing

In [None]:
# Load the raw export. The encoding is pinned to UTF-8 (the JSON standard)
# because open() otherwise falls back to the platform locale encoding, which
# can fail or garble non-ASCII text.
with open(INPUT_DATA, encoding="utf-8") as file:
    data = json.load(file)
print("all:", len(data['messages']))

data = keep_only_text_posts(data)
print("with text:", len(data))

# extract all useful information in all useful formats
posts = [parse_telegram_message(m) for m in data]

# extract words and normalize them
posts = [{**d, **preprocess_text(d['text_no_links'])} for d in posts]

# channel_id -> post_id -> post
# Built in a single pass over the posts. Channels appear in first-seen order,
# so the mapping's order is the same on every run (iterating a set of channel
# ids is not), and the posts are not rescanned once per channel.
posts_view = {}
for post in posts:
    posts_view.setdefault(post["from_id"], {})[post["id"]] = post

Graph preparation

In [None]:
# IDF scores computed from the processed posts, with min-max scaling requested.
idf_scores = calculate_idf(posts, min_max_scale=True)

# Build the graph from the posts and their IDF scores.
# NOTE(review): node and edge semantics live in src.graph.create_graph —
# presumably one node per post; confirm there.
G = create_graph(posts, idf_scores)

# Quick size check of the unfiltered graph.
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Filter graph

In [None]:
# NOTE(review): this rebinds G to the helper's result, so re-running the cell
# feeds an already-scaled graph back in — confirm scale_edge_weights is
# idempotent, or bind the result to a new name.
G = scale_edge_weights(G)

print("Full graph:", len(G.edges))
# 0.4 is the cut-off handed to the helper; presumably edges below this scaled
# weight are dropped — confirm in src.graph.filter_edges_by_threshold.
G_filtered = filter_edges_by_threshold(G, threshold=0.4)
print("Filtered graph:", len(G_filtered.edges))

Testing

Visualization

In [None]:
# Build a figure for the filtered graph (the IDF scores are passed along to the
# plotting helper) and display it through the figure's own show() method.
fig = get_graph_plot(G_filtered, idf_scores)
fig.show()

In [None]:
# Example usage: look up the posts most similar to one post in the filtered graph.
post_id = 2759  # Replace with the desired post ID
top_n = 5  # Replace with the desired number of top similar posts

# The result is consumed below as an iterable of (post, score) pairs.
similar_posts = find_similar_posts_pagerank(G_filtered, post_id, top_n)

print(f"Top {top_n} similar posts to post {post_id}:")
for post, score in similar_posts:
    print(f"Post ID: {post}, Similarity Score: {score:.4f}")

In [None]:
# Example: accessing the weights of the edges reported for node 552.
# The attribute dict is bound to `edge_attrs` rather than `data`, so this loop
# no longer overwrites the notebook-level `data` (the list of text posts) that
# the data-processing cell relies on when it is re-run.
for u, v, edge_attrs in G_filtered.edges(552, data=True):
    weight = edge_attrs.get('weight', 0)  # Defaulting to 0 if weight not present
    print(f"Edge from {u} to {v} with weight: {weight}")

In [None]:
# TODO: improve comparison

# a = set(posts_view["channel1150855655"][545]['stemmed_words'])
# b = set(posts_view["channel1150855655"][905]['stemmed_words'])

# print(posts_view["channel1150855655"][700]['text'])

# a.intersection(b)
# G_filtered.nodes[545]

Save

In [None]:
import json
import networkx as nx

# Serialise a copy of the graph, leaving G_filtered itself untouched.
G_converted = G_filtered.copy()

# Every node attribute value is replaced by its JSON string form before the
# graph is written (presumably because GraphML cannot store container values —
# confirm against the networkx writer).
for _node_id, node_attrs in G_converted.nodes(data=True):
    encoded_attrs = {key: json.dumps(value) for key, value in node_attrs.items()}
    node_attrs.update(encoded_attrs)

nx.write_graphml(G_converted, GRAPH_FILE_PATH)

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the files are
# opened as UTF-8 explicitly. Relying on the platform default encoding can
# raise UnicodeEncodeError or produce JSON that is not valid UTF-8.
with open(POSTS_FILE_PATH, "w", encoding="utf-8") as file:
    json.dump(posts, file, indent=4, ensure_ascii=False)

with open(POSTS_VIEW_FILE_PATH, "w", encoding="utf-8") as file:
    json.dump(posts_view, file, indent=4, ensure_ascii=False)

Check loading

NOTE: JSON object keys are always strings, so integer dict keys (e.g. post ids) come back as `str` after loading.

In [None]:
import networkx as nx

# Same three paths as the configuration cell near the top of the notebook,
# repeated here (presumably so the loading check can run on its own — confirm).
GRAPH_FILE_PATH = "data/filtered_graph.graphml"
POSTS_FILE_PATH = "data/posts.json"
POSTS_VIEW_FILE_PATH = "data/posts_view.json"


def load_resources():
    """Load the saved graph, posts and posts view from disk.

    Reads the GraphML file at ``GRAPH_FILE_PATH`` and the JSON files at
    ``POSTS_FILE_PATH`` and ``POSTS_VIEW_FILE_PATH``. String-valued node
    attributes that contain valid JSON are decoded back into Python objects;
    any other attribute value is left unchanged.

    Returns:
        tuple: ``(G, posts, posts_view)`` — the loaded graph and the two
        objects parsed from the JSON files.

    Note:
        Dict keys in the loaded JSON are strings even if they were integers
        when saved, and ``nx.read_graphml`` is called with its default node
        type, so callers look nodes up by string id.
    """
    # Load the graph
    G = nx.read_graphml(GRAPH_FILE_PATH)

    # Convert node attributes back from JSON strings
    for _node_id, attrs in G.nodes(data=True):
        for attr_key, attr_value in attrs.items():
            if not isinstance(attr_value, str):
                # Values already parsed to a native type (e.g. int, float)
                # would make json.loads raise TypeError; keep them as is.
                continue
            try:
                # Attempt to load the attribute value from JSON string
                attrs[attr_key] = json.loads(attr_value)
            except json.JSONDecodeError:
                # In case the value is not a JSON string, keep it as is
                pass

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(POSTS_FILE_PATH, encoding="utf-8") as f:
        posts = json.load(f)
    with open(POSTS_VIEW_FILE_PATH, encoding="utf-8") as f:
        posts_view = json.load(f)

    return G, posts, posts_view

# Reload everything from disk under "_2" names so it can be compared with the
# in-memory objects in the cells below.
G_filtered_2, posts_2, posts_view_2 = load_resources()

In [None]:
# networkx graphs do not implement value equality, so `==` only tests object
# identity and is always False for a reloaded graph. Compare the structure
# instead. Node ids come back from GraphML as strings, so the in-memory ids are
# stringified first. Edge direction is ignored; this is a sanity check only.
original_nodes = {str(node) for node in G_filtered.nodes()}
original_edges = {frozenset((str(u), str(v))) for u, v in G_filtered.edges()}
reloaded_edges = {frozenset(edge) for edge in G_filtered_2.edges()}
original_nodes == set(G_filtered_2.nodes()) and original_edges == reloaded_edges

In [None]:
# Sizes of the reloaded and in-memory graphs, side by side; the two should match.
len(G_filtered_2), len(G_filtered)

In [None]:
# Edge counts of the reloaded and in-memory graphs, side by side; the two should match.
len(G_filtered_2.edges), len(G_filtered.edges)

In [None]:
# In-memory view: take the first channel and look a post up by its integer id.
# NOTE(review): assumes the first channel contains a post with id 65.
list(posts_view.values())[0][65]

In [None]:
# Reloaded view: the same lookup needs the string key "65", because dict keys
# are strings after the JSON round trip.
list(posts_view_2.values())[0]["65"]

In [None]:
# Example usage on the graph reloaded from disk.
post_id = 545  # Replace with the desired post ID
top_n = 10  # Replace with the desired number of top similar posts

# The id is passed as a string because the reloaded graph is queried with
# string node ids (see the note on str keys above the loading cell).
similar_posts = find_similar_posts_pagerank(G_filtered_2, str(post_id), top_n)

print(f"Top {top_n} similar posts to post {post_id}:")
for post, score in similar_posts:
    print(f"Post ID: {post}, Similarity Score: {score:.4f}")