In [1]:
from yadbil.data.telegram import TelegramDataProcessor
from yadbil.data.processing import calculate_idf

from yadbil.data.utils import print_md
from yadbil.graph import find_similar_posts_pagerank, get_graph_plot, GraphProcessor

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Vasilii_Salikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Vasilii_Salikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Input file handed to TelegramDataProcessor
# (presumably a Telegram JSON export — TODO confirm).
INPUT_DATA = "data/result.json"

# Output artifacts: the filtered graph (GraphML) and the two post
# collections (JSON) written by the "Save" section of this notebook.
GRAPH_FILE_PATH = "data/filtered_graph.graphml"
POSTS_FILE_PATH = "data/posts.json"
POSTS_VIEW_FILE_PATH = "data/posts_view.json"

Data processing

In [None]:
# Run the processing step on INPUT_DATA; the object returned by
# process_posts() exposes the `posts` and `posts_view` collections that the
# rest of the notebook works with.
tg_data_proc = TelegramDataProcessor(INPUT_DATA).process_posts()
posts = tg_data_proc.posts
posts_view = tg_data_proc.posts_view

# IDF scores computed over `posts`. min_max_scale=True presumably rescales
# the scores to a fixed range — TODO confirm in yadbil.data.processing.
idf_scores = calculate_idf(posts, min_max_scale=True)

Graph preparation

In [None]:
# Build the graph from the processed posts and their IDF scores, then report
# its size as a quick sanity check.
graph_proc = GraphProcessor(posts, idf_scores)

print(
    f"Number of nodes: {graph_proc.G.number_of_nodes()}",
    f"Number of edges: {graph_proc.G.number_of_edges()}",
    sep="\n",
)

Filter graph

In [None]:
# Edge count before filtering, for comparison with the filtered graph below.
print("Full graph:", len(graph_proc.G.edges))
# The return value is discarded, so this presumably rescales the weights on
# graph_proc in place. NOTE(review): if so, re-running this cell rescales
# already-scaled weights — confirm the method is idempotent or rebuild
# graph_proc before re-running.
graph_proc.scale_edge_weights()
# threshold=0.4 is applied to the scaled weights; which side of the threshold
# is kept is defined by filter_edges_by_threshold (not visible here).
G_filtered = graph_proc.filter_edges_by_threshold(threshold=0.4)
print("Filtered graph:", len(G_filtered.edges))

Testing

Visualization

In [None]:
# Build a figure for the filtered graph (idf_scores are passed through to the
# plotting helper) and render it.
fig = get_graph_plot(G_filtered, idf_scores)
fig.show()

In [None]:
# Example usage: rank posts similar to one post on the in-memory filtered graph.
post_id = 2759  # ID of the query post; must be a node of G_filtered
top_n = 5  # how many similar posts to request

similar_posts = find_similar_posts_pagerank(G_filtered, post_id, top_n)

# The result is iterated as (post, score) pairs.
print(f"Top {top_n} similar posts to post {post_id}:")
for post, score in similar_posts:
    print(f"Post ID: {post}, Similarity Score: {score:.4f}")

In [None]:
# Example: list every edge incident to node 552 together with its weight.
# Requesting data='weight' with default=0 yields the weight directly and
# falls back to 0 for edges that carry no 'weight' attribute.
for u, v, weight in G_filtered.edges(552, data='weight', default=0):
    print(f"Edge from {u} to {v} with weight: {weight}")

In [None]:
# TODO: improve comparison

# a = set(posts_view["channel1150855655"][545]['stemmed_words'])
# b = set(posts_view["channel1150855655"][905]['stemmed_words'])

# print(posts_view["channel1150855655"][700]['text'])

# a.intersection(b)
# G_filtered.nodes[545]

Save

In [None]:
import json
import networkx as nx

# Work on a copy so the in-memory G_filtered keeps its original attribute
# values; only the copy written to disk is converted.
G_converted = G_filtered.copy()

# Encode every node attribute value as a JSON string before export.
# Iterating over a snapshot of the keys keeps the reassignment independent of
# the dict being updated.
for node in G_converted.nodes:
    attrs = G_converted.nodes[node]
    for key in list(attrs):
        attrs[key] = json.dumps(attrs[key])

nx.write_graphml(G_converted, GRAPH_FILE_PATH)

In [None]:
# Persist the processed posts and their view as JSON.
# ensure_ascii=False writes non-ASCII text verbatim, so the files are opened
# explicitly as UTF-8 instead of relying on the platform's default encoding,
# which may be unable to represent those characters.
# NOTE(review): confirm the loading side also reads these files as UTF-8.
with open(POSTS_FILE_PATH, "w", encoding="utf-8") as file:
    json.dump(posts, file, indent=4, ensure_ascii=False)

with open(POSTS_VIEW_FILE_PATH, "w", encoding="utf-8") as file:
    json.dump(posts_view, file, indent=4, ensure_ascii=False)

Check loading

NOTE: integer dict keys become strings after loading (JSON object keys are always strings), so look-ups on the loaded data must use `str` keys.

In [None]:
from yadbil.graph.io import load_resources

# Same paths as in the configuration cell, re-declared here — presumably so
# this check can run on a fresh kernel without the cells above (TODO confirm).
# Keep them in sync with the configuration cell.
GRAPH_FILE_PATH = "data/filtered_graph.graphml"
POSTS_FILE_PATH = "data/posts.json"
POSTS_VIEW_FILE_PATH = "data/posts_view.json"

# Reload the three saved artifacts; the `_2` suffix keeps them separate from
# the in-memory originals so the two can be compared.
G_filtered_2, posts_2, posts_view_2 = load_resources(
    GRAPH_FILE_PATH, POSTS_FILE_PATH, POSTS_VIEW_FILE_PATH
)

In [None]:
# G_filtered_2 == G_filtered
# len(G_filtered_2), len(G_filtered)
# len(G_filtered_2.edges), len(G_filtered.edges)
# list(posts_view.values())[0][65]
# list(posts_view_2.values())[0]["65"]

In [None]:
# Example usage: the same similarity query, run against the reloaded graph.
post_id = 545  # ID of the query post
top_n = 10  # how many similar posts to request

# The ID is passed as a string here, presumably because node IDs come back as
# strings after the GraphML round trip — TODO confirm in load_resources.
similar_posts = find_similar_posts_pagerank(G_filtered_2, str(post_id), top_n)

# The result is iterated as (post, score) pairs.
print(f"Top {top_n} similar posts to post {post_id}:")
for post, score in similar_posts:
    print(f"Post ID: {post}, Similarity Score: {score:.4f}")