In [1]:
from os import listdir, mkdir
from os.path import isfile
from utils import *

In [26]:
# Build, for every news item of every dataset/label, a propagation subgraph
# (edge list) and a node-feature file from the tweets stored under
# <main_path>/<dataset>/<label>/tweets/<news_id>/.
#
# NOTE(review): suppress, TwitterNode, tweet_hours_diff, save_edge_list,
# save_node_features, MAX_TIME_DIFF and MIN_SUBGRAPH_EDGES are not defined in
# this cell; presumably they are provided by `from utils import *` -- confirm.
main_path = "./dataset/FakeNewsNet/"
CLIP_TWEETS = 100   # keep only the CLIP_TWEETS earliest tweets of each news item

for dataset in ["politifact"]:
    for label in ["real", "fake"]:
        path = f"{main_path}/{dataset}/{label}"
        tweets_path = f"{path}/tweets"
        graphs_path = f"{path}/subgraphs"
        features_path = f"{path}/features"

        # Create the output directories; an already existing one is fine.
        for out_dir in (graphs_path, features_path):
            with suppress(FileExistsError):
                mkdir(out_dir)

        # Per-news results keyed by news id (reset for every label).
        edge_lists = {}
        nodes_features = {}

        # listdir() returns entries in arbitrary order; sort so runs are reproducible.
        tweets_files = sorted(listdir(tweets_path))

        # Only entries whose name contains the dataset name are iterated below, so
        # count those (not every directory entry) for the progress report.
        news_ids = [name for name in tweets_files if dataset in name]
        total_news = len(news_ids)

        print(f"Processing {dataset}-{label}: {total_news} news found.")

        for i, news_id in enumerate(news_ids):
            if i % 50 == 0:
                print(f"{i}/{total_news}")

            news_path = f"{tweets_path}/{news_id}"
            subgraph_pathname = f"{graphs_path}/{news_id}.txt"
            features_pathname = f"{features_path}/{news_id}.txt"

            # Skip news whose two output files already exist (e.g. from a previous run).
            if isfile(subgraph_pathname) and isfile(features_pathname):
                continue

            try:
                # Sorted so that ties on created_at are broken the same way on
                # every machine (the sort by time below is stable).
                json_files = sorted(listdir(news_path))
            except NotADirectoryError:
                # A regular file sitting in the tweets folder is not a news item.
                continue

            # Register the news item before looking at its tweets, so an item
            # without any tweet still gets (empty) entries in both dictionaries.
            edge_lists[news_id] = []
            nodes_features[news_id] = {}

            node_list = [
                TwitterNode(f"{news_path}/{filename}")
                for filename in json_files
                if "json" in filename
            ]

            if not node_list:
                continue

            # sort nodes by time and clip
            node_list = sorted(node_list, key=lambda x: x.created_at)[:CLIP_TWEETS]

            # NOTE(review): edges are expressed with user ids. If the same user
            # appears more than once in node_list this produces duplicate
            # (0, user) edges and, further down, self-loops; the behaviour is
            # left unchanged to keep the generated files identical -- confirm
            # whether it is intended.

            # news node (id 0) edge to each root tweet
            edges = [(0, node.user_id) for node in node_list]
            edge_lists[news_id] = edges

            # extra edges from a user to every user they mention who also
            # tweeted the same news
            users_with_tweet = {node.user_id for node in node_list}

            for node in node_list:
                user_mentioned = set(node.mentions) & users_with_tweet
                edges.extend((node.user_id, x) for x in user_mentioned)

            # extra edges between tweets made within the time limit: walk from
            # the latest tweet backwards and pair it with every earlier tweet.
            # This must run before created_at is rewritten below.
            reversed_node_list = node_list[::-1]

            for next_pos, u in enumerate(reversed_node_list, start=1):
                for v in reversed_node_list[next_pos:]:
                    if tweet_hours_diff(u, v) < MAX_TIME_DIFF:
                        edges.append((v.user_id, u.user_id)) # v tweeted before u within maxtime

            min_time = min(node.created_at for node in node_list)
            features = nodes_features[news_id]

            # update tweet timestamp to seconds since first tweet of the news;
            # features are keyed by user id, so a later tweet of the same user
            # overwrites an earlier one.
            for node in node_list:
                node.created_at = int((node.created_at - min_time).total_seconds())
                features[node.user_id] = node.get_features_vector()

            # Only persist subgraphs with at least MIN_SUBGRAPH_EDGES edges.
            if len(edges) >= MIN_SUBGRAPH_EDGES:
                save_edge_list(edges, subgraph_pathname)
                save_node_features(features, features_pathname)

Processing politifact-real: 409 news found.
0/409
10/409
20/409
30/409
40/409
50/409
60/409
70/409
80/409
90/409
100/409
110/409
120/409
130/409
140/409
150/409
160/409
170/409
180/409
190/409
200/409
210/409
220/409
230/409
240/409
250/409
260/409
270/409
280/409
290/409
300/409
310/409
320/409
330/409
340/409
350/409
360/409
370/409
380/409
390/409
400/409
Processing politifact-fake: 392 news found.
0/392
10/392
20/392
30/392
40/392
50/392
60/392
70/392
80/392
90/392
100/392
110/392
120/392
130/392
140/392
150/392
160/392
170/392
180/392
190/392
200/392
210/392
220/392
230/392
240/392
250/392
260/392
270/392
280/392
290/392
300/392
310/392
320/392
330/392
340/392
350/392
360/392
370/392
380/392
390/392
