# Import Dependencies

In [16]:
import os
import json
import pandas as pd
import dask.dataframe as dd

# Directories

## PHEME Dataset

In [1]:
# Root folder of the PHEME rumour / non-rumour dataset (relative to the working directory).
pheme_dir = 'data/pheme-rnr-dataset'

## CREDBANK Dataset

In [5]:
# The four CREDBANK files sit side by side in data/credbank-dataset and share
# the .data extension, so each path is derived from its file stem.
(
    search_tweets_path,
    turk_ratings_path,
    annotations_path,
    stream_tweets_path,
) = (
    f'data/credbank-dataset/{stem}.data'
    for stem in (
        'cred_event_SearchTweets',
        'cred_event_TurkRatings',
        'eventNonEvent_annotations',
        'stream_tweets_byTimestamp',
    )
)

# Import the datasets

## PHEME Dataset

In [6]:
def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        The decoded Python object produced by ``json.load`` for that file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def _pheme_tweet_record(tweet, event, tweet_type, is_rumour):
    """Flatten one decoded tweet JSON object into a row dict for the PHEME frame."""
    return {
        'event': event,
        'tweet_id': tweet['id_str'],
        'text': tweet['text'],
        'user': tweet['user']['screen_name'],
        'timestamp': tweet['created_at'],
        'type': tweet_type,
        'is_rumour': is_rumour,
    }


def load_pheme_dataset(base_dir):
    """Collect every source and reaction tweet under ``base_dir`` into a DataFrame.

    The expected layout is
    ``<base_dir>/<event>/<rumours|non-rumours>/<thread>/source-tweet/<thread>.json``
    with an optional sibling ``reactions`` folder holding one JSON file per reply.

    Args:
        base_dir: Root directory of the dataset; each child is treated as an event.

    Returns:
        pandas.DataFrame with the columns ``event``, ``tweet_id``, ``text``,
        ``user``, ``timestamp``, ``type`` (``'source'`` or ``'reaction'``) and
        ``is_rumour`` (1 for threads under ``rumours``, 0 otherwise). The columns
        are present even when no tweets are found.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist.
        KeyError: If a tweet file lacks one of the fields read above.
        json.JSONDecodeError: If a tweet file is not valid JSON.
    """
    columns = ['event', 'tweet_id', 'text', 'user', 'timestamp', 'type', 'is_rumour']
    data = []
    subdirs = ['rumours', 'non-rumours']

    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # reproducible across machines and re-runs.
    for event in sorted(os.listdir(base_dir)):
        for subdir in subdirs:
            event_dir = os.path.join(base_dir, event, subdir)
            if not os.path.isdir(event_dir):
                continue
            is_rumour = 1 if subdir == 'rumours' else 0

            for rumor in sorted(os.listdir(event_dir)):
                rumor_dir = os.path.join(event_dir, rumor)
                source_tweet_file = os.path.join(rumor_dir, 'source-tweet', f'{rumor}.json')
                reactions_dir = os.path.join(rumor_dir, 'reactions')

                if os.path.isfile(source_tweet_file):
                    source_tweet = read_json_file(source_tweet_file)
                    data.append(_pheme_tweet_record(source_tweet, event, 'source', is_rumour))

                if not os.path.isdir(reactions_dir):
                    continue

                for reaction_file in sorted(os.listdir(reactions_dir)):
                    # Skip hidden files (.DS_Store, macOS "._*" resource forks) and
                    # anything that is not a .json file, so stray filesystem
                    # artefacts do not crash the JSON parser.
                    if reaction_file.startswith('.') or not reaction_file.endswith('.json'):
                        continue
                    reaction_path = os.path.join(reactions_dir, reaction_file)
                    if not os.path.isfile(reaction_path):
                        continue
                    reaction_tweet = read_json_file(reaction_path)
                    data.append(_pheme_tweet_record(reaction_tweet, event, 'reaction', is_rumour))

    # Passing the column list keeps the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=columns)

In [7]:
# Walk the PHEME directory tree and gather all source/reaction tweets into one frame.
pheme_df = load_pheme_dataset(base_dir=pheme_dir)

In [8]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise this cell fails on a fresh checkout.
pheme_csv_path = 'data/csv/pheme_dataset.csv'
os.makedirs(os.path.dirname(pheme_csv_path), exist_ok=True)
pheme_df.to_csv(pheme_csv_path, index=False)

In [9]:
# Quick look at the loaded frame; the printed repr is truncated to the first and last rows.
print(pheme_df)

               event            tweet_id  \
0       charliehebdo  552783238415265792   
1       charliehebdo  552787794503143424   
2       charliehebdo  552789647966109696   
3       charliehebdo  552791411053973505   
4       charliehebdo  552793152390955009   
...              ...                 ...   
103207   sydneysiege  544526845966704641   
103208   sydneysiege  544526940636323840   
103209   sydneysiege  544527197810475011   
103210   sydneysiege  544531943334100992   
103211   sydneysiege  544535839657967616   

                                                     text           user  \
0       Breaking: At least 10 dead, 5 injured after tO...     H_E_Samuel   
1       @H_E_Samuel @George_Berridge @michael_taggart ...   EdwardBowden   
2       @H_E_Samuel Hi Henry would you be willing to g...   NickyRusmith   
3       @H_E_Samuel @H_E_Samuel please call them terro...        pravsly   
4       @H_E_Samuel French govt needs to take strict a...      sharatsrs   
...            