## PHEME Dataset

In [None]:
import os
import json
import pandas as pd

In [None]:
# Root folder of the PHEME dataset (a path relative to the current working
# directory); it is handed to load_pheme_dataset() further down.
pheme_dir = 'data/pheme-rnr-dataset'

In [None]:
def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Returns whatever Python object ``json.load`` builds from the file content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def _tweet_record(tweet, event, tweet_type, is_rumour):
    """Flatten one parsed tweet JSON object into a single dataset row."""
    return {
        'event': event,
        'tweet_id': tweet['id_str'],
        'text': tweet['text'],
        'user': tweet['user']['screen_name'],
        'timestamp': tweet['created_at'],
        'type': tweet_type,
        'is_rumour': is_rumour,
    }


def _is_tweet_file(file_name):
    """Return True for visible ``*.json`` file names."""
    return file_name.endswith('.json') and not file_name.startswith('.')


def load_pheme_dataset(base_dir) -> pd.DataFrame:
    """Load every source and reaction tweet under ``base_dir`` into a DataFrame.

    The directory tree is expected to look like::

        <base_dir>/<event>/{rumours,non-rumours}/<thread>/source-tweet/<thread>.json
        <base_dir>/<event>/{rumours,non-rumours}/<thread>/reactions/*.json

    Entries that do not fit this layout (missing folders, stray files) are
    skipped. Directory listings are sorted so the row order is reproducible
    across machines and runs.

    Args:
        base_dir: Path of the dataset root folder.

    Returns:
        A DataFrame with one row per tweet and the columns ``event``,
        ``tweet_id``, ``text``, ``user``, ``timestamp``, ``type``
        (``'source'`` or ``'reaction'``) and ``is_rumour``. The columns are
        present even when no tweet was found.

    Raises:
        KeyError: If a tweet JSON object lacks one of the fields read here.
        Any error raised by ``os.listdir(base_dir)`` or by ``read_json_file``
        propagates unchanged.
    """
    columns = ['event', 'tweet_id', 'text', 'user', 'timestamp', 'type', 'is_rumour']
    rows = []

    for event in sorted(os.listdir(base_dir)):
        for subdir in ('rumours', 'non-rumours'):
            event_dir = os.path.join(base_dir, event, subdir)
            if not os.path.isdir(event_dir):
                continue
            is_rumour = subdir == 'rumours'

            for rumor in sorted(os.listdir(event_dir)):
                rumor_dir = os.path.join(event_dir, rumor)

                source_tweet_file = os.path.join(rumor_dir, 'source-tweet', f'{rumor}.json')
                if os.path.isfile(source_tweet_file):
                    source_tweet = read_json_file(source_tweet_file)
                    rows.append(_tweet_record(source_tweet, event, 'source', is_rumour))

                reactions_dir = os.path.join(rumor_dir, 'reactions')
                if not os.path.isdir(reactions_dir):
                    continue
                for reaction_file in sorted(os.listdir(reactions_dir)):
                    # Hidden companions such as '._123.json' or '.DS_Store'
                    # are not tweets and would make the JSON parser fail.
                    if not _is_tweet_file(reaction_file):
                        continue
                    reaction_tweet = read_json_file(os.path.join(reactions_dir, reaction_file))
                    rows.append(_tweet_record(reaction_tweet, event, 'reaction', is_rumour))

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Walk the dataset folder and gather all source and reaction tweets into one DataFrame.
pheme_df = load_pheme_dataset(pheme_dir)

In [None]:
# Create the output folder first: DataFrame.to_csv does not create missing
# parent directories and fails with an OSError when 'data/csv' is absent.
os.makedirs('data/csv', exist_ok=True)
# index=False leaves the DataFrame's row index out of the CSV file.
pheme_df.to_csv('data/csv/pheme_dataset.csv', index=False)

In [None]:
# Print the DataFrame's default text representation as a quick visual check.
print(pheme_df)