In [1]:
import json

from src.data.telegram import TelegramDataProcessor
from src.data.processing import calculate_idf

from src.data.utils import print_md
from src.graph import (
    create_graph,
    filter_edges_by_threshold,
    find_similar_posts_pagerank,
    get_graph_plot,
    scale_edge_weights,
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Vasilii_Salikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Vasilii_Salikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
INPUT_DATA = "data/result.json"

GRAPH_FILE_PATH = "data/filtered_graph.graphml"
POSTS_FILE_PATH = "data/posts.json"
POSTS_VIEW_FILE_PATH = "data/posts_view.json"

Data processing

In [3]:
processor = TelegramDataProcessor(INPUT_DATA).process_posts()
posts = processor.posts
posts_view = processor.posts_view

All messages: 2732
Messages with text: 976


Graph preparation

In [4]:
idf_scores = calculate_idf(posts, min_max_scale=True)

G = create_graph(posts, idf_scores)

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 976
Number of edges: 214888


Filter graph

In [5]:
G = scale_edge_weights(G)

print("Full graph:", len(G.edges))
G_filtered = filter_edges_by_threshold(G, threshold=0.4)
print("Filtered graph:", len(G_filtered.edges))

Full graph: 214888
Filtered graph: 423


Testing

Visualization

In [6]:
fig = get_graph_plot(G_filtered, idf_scores)
fig.show()

In [7]:
# Example usage
post_id = 2759  # Replace with the desired post ID
top_n = 5  # Replace with the desired number of top similar posts

similar_posts = find_similar_posts_pagerank(G_filtered, post_id, top_n)

print(f"Top {top_n} similar posts to post {post_id}:")
for post, score in similar_posts:
    print(f"Post ID: {post}, Similarity Score: {score:.4f}")

Top 5 similar posts to post 2759:
Post ID: 2758, Similarity Score: 0.3316
Post ID: 2757, Similarity Score: 0.3056
Post ID: 552, Similarity Score: 0.0000
Post ID: 776, Similarity Score: 0.0000
Post ID: 1734, Similarity Score: 0.0000


In [8]:
# Example: Accessing edge weights
for u, v, data in G_filtered.edges(552, data=True):
    weight = data.get('weight', 0)  # Defaulting to 0 if weight not present
    print(f"Edge from {u} to {v} with weight: {weight}")

Edge from 552 to 227 with weight: 0.45025674071053085
Edge from 552 to 324 with weight: 0.4015768181096589
Edge from 552 to 342 with weight: 0.44416935668637614
Edge from 552 to 434 with weight: 0.4196336951218653
Edge from 552 to 435 with weight: 0.45923184084271934
Edge from 552 to 590 with weight: 0.4113182386741697
Edge from 552 to 665 with weight: 0.4538394630301244
Edge from 552 to 666 with weight: 0.4215545333409331
Edge from 552 to 719 with weight: 0.4222516196748406
Edge from 552 to 800 with weight: 0.40104974325121767
Edge from 552 to 801 with weight: 0.47490245912022966
Edge from 552 to 819 with weight: 0.6414631217109287
Edge from 552 to 820 with weight: 0.5314111964648126
Edge from 552 to 905 with weight: 0.40607366004502277
Edge from 552 to 906 with weight: 0.49286327815552355


In [None]:
# TODO: improve comparison

# a = set(posts_view["channel1150855655"][545]['stemmed_words'])
# b = set(posts_view["channel1150855655"][905]['stemmed_words'])

# print(posts_view["channel1150855655"][700]['text'])

# a.intersection(b)
# G_filtered.nodes[545]

Save

In [9]:
import json
import networkx as nx

G_converted = G_filtered.copy()

for node, attrs in G_converted.nodes(data=True):
    for attr_key, attr_value in attrs.items():
        attrs[attr_key] = json.dumps(attr_value)

nx.write_graphml(G_converted, GRAPH_FILE_PATH)

In [10]:
with open(POSTS_FILE_PATH, "w") as file:
    json.dump(posts, file, indent=4, ensure_ascii=False)

with open(POSTS_VIEW_FILE_PATH, "w") as file:
    json.dump(posts_view, file, indent=4, ensure_ascii=False)

Check loading

NOTE: keys from dict from int transform into str after loading

In [11]:
import networkx as nx

GRAPH_FILE_PATH = "data/filtered_graph.graphml"
POSTS_FILE_PATH = "data/posts.json"
POSTS_VIEW_FILE_PATH = "data/posts_view.json"


def load_resources():
    # Load the graph
    G = nx.read_graphml(GRAPH_FILE_PATH)
    
    # Convert node attributes back from JSON strings
    for node, attrs in G.nodes(data=True):
        for attr_key, attr_value in attrs.items():
            try:
                # Attempt to load the attribute value from JSON string
                attrs[attr_key] = json.loads(attr_value)
            except json.JSONDecodeError:
                # In case the value is not a JSON string, keep it as is
                pass

    # Assuming loading posts and posts_view as before
    with open(POSTS_FILE_PATH) as f:
        posts = json.load(f)
    with open(POSTS_VIEW_FILE_PATH) as f:
        posts_view = json.load(f)
    
    return G, posts, posts_view

G_filtered_2, posts_2, posts_view_2 = load_resources()

In [12]:
G_filtered_2 == G_filtered

False

In [13]:
len(G_filtered_2), len(G_filtered)

(272, 272)

In [14]:
len(G_filtered_2.edges), len(G_filtered.edges)

(423, 423)

In [15]:
list(posts_view.values())[0][65]

{'id': 65,
 'from': 'gonzo-–æ–±–∑–æ—Ä—ã ML —Å—Ç–∞—Ç–µ–π',
 'from_id': 'channel1150855655',
 'text': '3.3. Sparse Transformer, 2019 april, OpenAI\n–ë–ª–æ–≥–æ–ø–æ—Å—Ç: https://openai.com/blog/sparse-transformer/\n–°—Ç–∞—Ç—å—è: https://arxiv.org/abs/1904.10509\n–ö–æ–¥: https://github.com/openai/sparse_attention\n\n–ú–æ–¥–∏—Ñ–∏–∫–∞—Ü–∏—è –º–µ—Ö–∞–Ω–∏–∑–º–∞ –≤–Ω–∏–º–∞–Ω–∏—è, –ø–æ–∑–≤–æ–ª—è—é—â–∞—è —É–≤–µ–ª–∏—á–∏—Ç—å –¥–ª–∏–Ω—É –≤—Ö–æ–¥–∞. –ù–∞ self-reported —Ç–µ—Å—Ç–∞—Ö –±–æ–ª–µ–µ —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–µ–Ω —á–µ–º Transformer-XL\n\n–ü—Ä–∏ –≤—ã—Å—á–∏—Å–ª–µ–Ω–∏–∏ –æ–±—ã—á–Ω–æ–≥–æ –≤–Ω–∏–º–∞–Ω–∏—è —Å–ª–æ–∂–Ω–æ—Å—Ç—å –≤—ã—á–∏—Å–ª–µ–Ω–∏–π –•^2, –≥–¥–µ –• -- –¥–ª–∏–Ω–∞ –≤—Ö–æ–¥–∞, —Ç.–∫. –º—ã —Å—á–∏—Ç–∞–µ–º –≤–Ω–∏–º–∞–Ω–∏–µ —Å –∫–∞–∂–¥–æ–≥–æ —ç–ª–µ–º–µ–Ω—Ç–∞ –Ω–∞ –∫–∞–∂–¥—ã–π. –î—Ä—É–≥–∏–º–∏ —Å–ª–æ–≤–∞–º–∏, –≤–Ω–∏–º–∞–Ω–∏–µ —ç—Ç–æ –∫–≤–∞–¥—Ä–∞—Ç–Ω–∞—è —Å–∏–º–º–µ—Ç—Ä–∏—á–Ω–∞—è –º–∞—Ç—Ä–∏—Ü–∞. –û–∫–∞–∑—ã–≤–∞–µ—Ç—Å—è, –µ—Å–ª–∏ –¥–æ–ª–≥–æ —Å–º–æ—Ç—Ä–µ—Ç—å –Ω–∞ —ç—Ç–∏ –º–∞—Ç—Ä–∏—Ü—ã, –º–æ–∂–Ω–æ —É–∑—Ä–µ—Ç—å

In [16]:
list(posts_view_2.values())[0]["65"]

{'id': 65,
 'from': 'gonzo-–æ–±–∑–æ—Ä—ã ML —Å—Ç–∞—Ç–µ–π',
 'from_id': 'channel1150855655',
 'text': '3.3. Sparse Transformer, 2019 april, OpenAI\n–ë–ª–æ–≥–æ–ø–æ—Å—Ç: https://openai.com/blog/sparse-transformer/\n–°—Ç–∞—Ç—å—è: https://arxiv.org/abs/1904.10509\n–ö–æ–¥: https://github.com/openai/sparse_attention\n\n–ú–æ–¥–∏—Ñ–∏–∫–∞—Ü–∏—è –º–µ—Ö–∞–Ω–∏–∑–º–∞ –≤–Ω–∏–º–∞–Ω–∏—è, –ø–æ–∑–≤–æ–ª—è—é—â–∞—è —É–≤–µ–ª–∏—á–∏—Ç—å –¥–ª–∏–Ω—É –≤—Ö–æ–¥–∞. –ù–∞ self-reported —Ç–µ—Å—Ç–∞—Ö –±–æ–ª–µ–µ —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–µ–Ω —á–µ–º Transformer-XL\n\n–ü—Ä–∏ –≤—ã—Å—á–∏—Å–ª–µ–Ω–∏–∏ –æ–±—ã—á–Ω–æ–≥–æ –≤–Ω–∏–º–∞–Ω–∏—è —Å–ª–æ–∂–Ω–æ—Å—Ç—å –≤—ã—á–∏—Å–ª–µ–Ω–∏–π –•^2, –≥–¥–µ –• -- –¥–ª–∏–Ω–∞ –≤—Ö–æ–¥–∞, —Ç.–∫. –º—ã —Å—á–∏—Ç–∞–µ–º –≤–Ω–∏–º–∞–Ω–∏–µ —Å –∫–∞–∂–¥–æ–≥–æ —ç–ª–µ–º–µ–Ω—Ç–∞ –Ω–∞ –∫–∞–∂–¥—ã–π. –î—Ä—É–≥–∏–º–∏ —Å–ª–æ–≤–∞–º–∏, –≤–Ω–∏–º–∞–Ω–∏–µ —ç—Ç–æ –∫–≤–∞–¥—Ä–∞—Ç–Ω–∞—è —Å–∏–º–º–µ—Ç—Ä–∏—á–Ω–∞—è –º–∞—Ç—Ä–∏—Ü–∞. –û–∫–∞–∑—ã–≤–∞–µ—Ç—Å—è, –µ—Å–ª–∏ –¥–æ–ª–≥–æ —Å–º–æ—Ç—Ä–µ—Ç—å –Ω–∞ —ç—Ç–∏ –º–∞—Ç—Ä–∏—Ü—ã, –º–æ–∂–Ω–æ —É–∑—Ä–µ—Ç—å

In [17]:
# Example usage
post_id = 545  # Replace with the desired post ID
top_n = 10  # Replace with the desired number of top similar posts

similar_posts = find_similar_posts_pagerank(G_filtered_2, str(post_id), top_n)

print(f"Top {top_n} similar posts to post {post_id}:")
for post, score in similar_posts:
    print(f"Post ID: {post}, Similarity Score: {score:.4f}")

Top 10 similar posts to post 545:
Post ID: 905, Similarity Score: 0.0904
Post ID: 700, Similarity Score: 0.0664
Post ID: 906, Similarity Score: 0.0548
Post ID: 523, Similarity Score: 0.0377
Post ID: 966, Similarity Score: 0.0333
Post ID: 530, Similarity Score: 0.0295
Post ID: 552, Similarity Score: 0.0236
Post ID: 776, Similarity Score: 0.0217
Post ID: 941, Similarity Score: 0.0214
Post ID: 942, Similarity Score: 0.0207
