In [10]:
import json
from os import listdir, mkdir
from os.path import isfile
import networkx as nx
from contextlib import suppress
from tqdm import tqdm
from datetime import datetime

MAX_TIME_DIFF = 1
MIN_SUBGRAPH_EDGES = 5

In [None]:
# NOT USED FOR NOW
additional_edges = []
n = len(node_list)
count = 0

for i,node1 in enumerate(node_list):
    for node2 in node_list[(i+1):]:
        diff = tweet_timedelta(node1, node2)

        if abs(diff) < MAX_TIME_DIFF:
            if diff < 0:
                additional_edges.append((node1.user_id, node2.user_id))
            else:
                additional_edges.append((node2.user_id, node1.user_id))
            count += 1

print(f"added {count} of {n*(n-1)/2})")

In [2]:
def str_to_time(timestamp):
    format = "%a %b %d %H:%M:%S %z %Y"
    return datetime.strptime(timestamp, format).replace(tzinfo=None)

def months_from_creation(date):
    twitter_creation = datetime.strptime("Mar 1 2006", "%b %d %Y")
    return (date.year - twitter_creation.year)*12 + date.month - twitter_creation.month

def save_edge_list(edge_list, pathname):
    with open(pathname, "w") as f:
        for (u,v) in edge_list:
            f.write(f"{u}, {v}\n")

def save_node_features(node_features, pathname):
    with open(pathname, "w") as f:
        for user_id, vector in node_features.items():
            feature_str = ", ".join(list(map(str, vector)))
            f.write(f"{user_id}, {feature_str}\n")

def tweet_hours_diff(x, y):
    return round((x.created_at - y.created_at).total_seconds() / 3600, 2)

In [3]:
class TwitterNode(object):
    def __init__(self, pathname) -> None:
        with open(pathname) as f:
            node = json.load(f)
        
        user = node['user']
        self.followers_count = user['followers_count']
        self.friends_count = user['friends_count']
        self.statuses_count = user['statuses_count']
        self.favourites_count = user['favourites_count']
        self.lists_count = user['listed_count']
        self.verified = int(user['verified'])
        self.user_created_at = months_from_creation(str_to_time(user['created_at']))
        self.user_id = user['id']
        self.tweet_id = node['id']
        self.mentions = [x['id'] for x in node['entities']['user_mentions']]
        self.created_at = str_to_time(node['created_at'])
    
    def get_features_vector(self):
        return [self.verified, self.user_created_at, self.followers_count,
                self.friends_count, self.lists_count, self.favourites_count,
                self.statuses_count, self.created_at]

In [20]:
main_path = "./dataset/FakeNewsNet/"

for dataset in ["politifact", "gossipcop"]:
    for label in ["real", "fake"]:
        path = f"{main_path}/{dataset}/{label}"
        tweets_path = f"{path}/tweets"
        graphs_path = f"{path}/subgraphs"
        features_path = f"{path}/features"

        with suppress(FileExistsError):
            mkdir(graphs_path)

        with suppress(FileExistsError):
            mkdir(features_path)

        edge_lists = {}
        nodes_features = {}

        tweets_files = listdir(tweets_path)
        total_news = len(news_files)

        print(f"Processing {dataset}-{label}: {len(tweets_files)} news found.")

        for i,news_id in enumerate(filter(lambda x: dataset in x, tweets_files)):
            if i % 10 == 0:
                print(f"{i}/{total_news}")
            
            news_path = f"{tweets_path}/{news_id}"
            subgraph_pathname = f"{graphs_path}/{news_id}.txt"
            features_pathname = f"{features_path}/{news_id}.txt"

            if isfile(subgraph_pathname) and isfile(features_pathname):
                continue

            try:
                json_files = listdir(news_path)
            except NotADirectoryError as e:
                continue

            edge_lists[news_id] = []
            nodes_features[news_id] = {}
            node_list = []

            for filename in filter(lambda x: "json" in x, json_files):
                node_path = f"{news_path}/{filename}"
                node = TwitterNode(node_path)
                node_list.append(node)

                # news node edge to each root tweet
                edge_lists[news_id].append((0, node.user_id))

            # adding extra edges for mentions of users in the subgraph
            users_with_tweet = set([x.user_id for x in node_list])

            for node in node_list:
                user_mentioned = set(node.mentions) & users_with_tweet
                edge_lists[news_id].extend([(node.user_id, x) for x in user_mentioned])

            if len(node_list) == 0:
                continue

            min_time = min([x.created_at for x in node_list])

            # update tweet timestamp to seconds since first tweet of the news
            for node in node_list:
                node.created_at = int((node.created_at - min_time).total_seconds())
                nodes_features[news_id][node.user_id] = node.get_features_vector()
            
            if len(edge_lists[news_id]) >= MIN_SUBGRAPH_EDGES:
                save_edge_list(edge_lists[news_id], subgraph_pathname)
                save_node_features(nodes_features[news_id], features_pathname)

Processing politifact-real: 409 news found.
0/411


FileNotFoundError: [WinError 3] Impossibile trovare il percorso specificato: './dataset/FakeNewsNet//politifact/real/politifact100'