In [1]:
from os import listdir, mkdir
from os.path import isfile
from utils import *

In [27]:
main_path = "./dataset/FakeNewsNet/"
CLIP_TWEETS = 100   # keep only the earliest CLIP_TWEETS tweets of each news item

# For every dataset/label split, build one edge list ("subgraph") and one
# node-feature mapping per news item and write them under
# <split>/subgraphs/<news_id>.txt and <split>/features/<news_id>.txt.
#
# NOTE(review): suppress, TwitterNode, tweet_hours_diff, save_edge_list,
# save_node_features, MAX_TIME_DIFF and MIN_SUBGRAPH_EDGES are not defined in
# this cell; presumably they are provided by `from utils import *` -- confirm.
for dataset in ["politifact", "gossipcop"]:
    for label in ["real", "fake"]:
        path = f"{main_path}/{dataset}/{label}"
        tweets_path = f"{path}/tweets"
        graphs_path = f"{path}/subgraphs"
        features_path = f"{path}/features"

        # Create the output folders; an already existing folder is fine.
        with suppress(FileExistsError):
            mkdir(graphs_path)

        with suppress(FileExistsError):
            mkdir(features_path)

        # Kept as cell-level dicts keyed by news id so they stay inspectable
        # after the cell has run; they are reset for every dataset/label split.
        edge_lists = {}
        nodes_features = {}

        tweets_files = listdir(tweets_path)

        # The loop only visits entries whose name contains the dataset name, so
        # count exactly those: the reported total and the progress denominator
        # then match what is actually processed.
        news_ids = [name for name in tweets_files if dataset in name]
        total_news = len(news_ids)

        print(f"Processing {dataset}-{label}: {total_news} news found.")

        # `news_idx` is used only for progress reporting; the inner loops below
        # use their own index names so this counter is never clobbered.
        for news_idx, news_id in enumerate(news_ids):
            if news_idx % 50 == 0:
                print(f"{news_idx}/{total_news}")

            news_path = f"{tweets_path}/{news_id}"
            subgraph_pathname = f"{graphs_path}/{news_id}.txt"
            features_pathname = f"{features_path}/{news_id}.txt"

            # Both output files already exist: nothing to do for this news item.
            if isfile(subgraph_pathname) and isfile(features_pathname):
                continue

            try:
                json_files = listdir(news_path)
            except NotADirectoryError:
                # The entry is a regular file rather than a folder of tweets.
                continue

            edge_lists[news_id] = []
            nodes_features[news_id] = {}

            # One TwitterNode per file whose name contains "json".
            node_list = [
                TwitterNode(f"{news_path}/{filename}")
                for filename in json_files
                if "json" in filename
            ]

            if not node_list:
                continue

            # sort nodes by time and clip to the earliest CLIP_TWEETS
            node_list = sorted(node_list, key=lambda x: x.created_at)[:CLIP_TWEETS]

            # news node (id 0) gets an edge to the author of each kept tweet
            edges = [(0, node.user_id) for node in node_list]
            edge_lists[news_id] = edges

            # extra edges from a user to every mentioned user who also tweeted this news
            users_with_tweet = {node.user_id for node in node_list}

            for node in node_list:
                user_mentioned = set(node.mentions) & users_with_tweet
                edges.extend((node.user_id, mentioned) for mentioned in user_mentioned)

            # extra edges between tweets made within the time limit: walk the
            # tweets from latest to earliest and pair each one (u) with every
            # tweet that sorts before it (v)
            reversed_node_list = node_list[::-1]

            for pos, u in enumerate(reversed_node_list):
                for v in reversed_node_list[pos + 1:]:
                    if tweet_hours_diff(u, v) < MAX_TIME_DIFF:
                        # v sorts no later than u and is within MAX_TIME_DIFF of it
                        edges.append((v.user_id, u.user_id))

            min_time = min(node.created_at for node in node_list)

            # Rewrite each tweet timestamp as whole seconds since the first kept
            # tweet, then store the feature vector under the author's user id.
            # Because the mapping is keyed by user id, a user with several kept
            # tweets ends up with the features of the last one processed.
            features = nodes_features[news_id]
            for node in node_list:
                node.created_at = int((node.created_at - min_time).total_seconds())
                features[node.user_id] = node.get_features_vector()

            # Only persist subgraphs that have at least MIN_SUBGRAPH_EDGES edges.
            if len(edges) >= MIN_SUBGRAPH_EDGES:
                save_edge_list(edges, subgraph_pathname)
                save_node_features(features, features_pathname)

Processing politifact-real: 409 news found.
0/409
50/409
100/409
150/409
200/409
250/409
300/409
350/409
400/409
Processing politifact-fake: 392 news found.
0/392
50/392
100/392
150/392
200/392
250/392
300/392
350/392
Processing gossipcop-real: 15759 news found.
0/15759
50/15759
100/15759
150/15759
200/15759
250/15759
300/15759
350/15759
400/15759
450/15759
500/15759
550/15759
600/15759
650/15759
700/15759
750/15759
800/15759
850/15759
900/15759
950/15759
1000/15759
1050/15759
1100/15759
1150/15759
1200/15759
1250/15759
1300/15759
1350/15759
1400/15759
1450/15759
1500/15759
1550/15759
1600/15759
1650/15759
1700/15759
1750/15759
1800/15759
1850/15759
1900/15759
1950/15759
2000/15759
2050/15759
2100/15759
2150/15759
2200/15759
2250/15759
2300/15759
2350/15759
2400/15759
2450/15759
2500/15759
2550/15759
2600/15759
2650/15759
2700/15759
2750/15759
2800/15759
2850/15759
2900/15759
2950/15759
3000/15759
3050/15759
3100/15759
3150/15759
3200/15759
3250/15759
3300/15759
3350/15759
3400/15759
3