**Pheme Dataset**

5222 Tweets.
The PHEME dataset (from https://figshare.com/articles/dataset/PHEME_dataset_for_Rumour_Detection_and_Veracity_Classification/6392078) is intended for rumour detection, veracity classification and stance detection. I could not find any stance labels for PHEME, which is why only the rumour and veracity labels are included here. While the rumour/non-rumour labels were straightforward, the veracity value wasn't always given for rumours (those are labelled 'unverified') and was never given for non-rumours (which are assumed to be true). Each tweet also comes with a lot of additional data, such as metadata and reactions; that is left out here.


```
LABEL:
TRUE - 0
FALSE ('Fakenews') - 1
UNVERIFIED - 2
```
```
RUMOUR
'non-rumours' - 0
'rumours' - 1
```

In [6]:
import os
import sys
import pandas as pd
from prep_collection import PrepCollection as prep
import numpy as np
import json
from tweetLoader import TweetLoader

In [7]:
# Base directory: two levels above the current working directory.
wdr_path = os.path.dirname(os.path.dirname(os.getcwd()))
# Location of the raw PHEME threads. The components are passed separately so
# that os.path.join actually inserts the platform's separator.
ds_raw_path = os.path.join(wdr_path, "Datasets", "Fake News", "all-rnr-annotated-threads")

In [8]:
def _veracity_label(annotation):
    """Translate a parsed ``annotation.json`` dict into a veracity label.

    Returns:
        0 (true) if ``annotation['true']`` is 1 or "1",
        1 (false) if it is 0 or "0",
        2 (unverified) if the ``'true'`` key is absent.

    Raises:
        ValueError: if ``'true'`` is present but holds any other value.
    """
    # NOTE(review): PHEME's own conversion script also consults the
    # 'misinformation' field when deriving veracity -- confirm that looking
    # at 'true' alone gives the intended labels.
    try:
        veracity = annotation['true']
    except KeyError:  # no verdict recorded -> the truth is undecided
        return 2
    if veracity in (0, '0'):
        return 1
    if veracity in (1, '1'):
        return 0
    raise ValueError(f"unexpected value for 'true' in annotation: {annotation!r}")


def preprocess_pheme(wdr_path, ds_raw_path):
    """Build the preprocessed PHEME table and write it to CSV.

    For every topic directory below ``ds_raw_path`` the thread ids found in
    its ``non-rumours`` and ``rumours`` sub-directories are passed to
    ``TweetLoader.fetch_list``. The returned frames get two extra columns:

    * ``label``  -- 0 true, 1 false, 2 unverified (non-rumours are assumed true)
    * ``rumour`` -- 0 non-rumour, 1 rumour

    The ``text`` column is then run through ``prep.prepare_text`` and the
    result is saved as ``<wdr_path>/Preprocessed_Datasets/012-Pheme.csv``.

    Args:
        wdr_path: base directory that holds ``Preprocessed_Datasets``.
        ds_raw_path: directory containing one sub-directory per topic.

    Returns:
        The combined DataFrame, with ``tweetID``, ``text``, ``label`` and
        ``rumour`` as the leading columns.

    Raises:
        ValueError: if an annotation has an unexpected ``'true'`` value.
    """
    lead_columns = ['tweetID', 'text', 'label', 'rumour']
    tweet = TweetLoader()
    # Entries containing a dot are treated as files and skipped.
    topics = [ele for ele in os.listdir(ds_raw_path) if '.' not in ele]

    # Collect the per-topic frames and concatenate once at the end, instead of
    # re-copying an ever-growing DataFrame on every iteration.
    frames = []
    for topic in topics:
        # Non-rumours
        non_rumour_dir = os.path.join(ds_raw_path, topic, 'non-rumours')
        articles_nr = [ele for ele in os.listdir(non_rumour_dir) if '.' not in ele]  # all non-rumour articles
        tweets_non_rumour = tweet.fetch_list(articles_nr)  # presumably a pandas DataFrame (project class)
        tweets_non_rumour['label'] = 0  # no veracity labels for non-rumours, assumed to be true
        tweets_non_rumour['rumour'] = 0
        frames.append(tweets_non_rumour)

        # Rumours
        rumour_dir = os.path.join(ds_raw_path, topic, 'rumours')
        articles_r = [ele for ele in os.listdir(rumour_dir) if '.' not in ele]
        tweets_rumour = tweet.fetch_list(articles_r)
        # The labels come from each thread's annotation.json. Iterating over
        # the fetched tweetIDs keeps the labels aligned with the frame's rows.
        labels = []
        for article in tweets_rumour['tweetID']:
            annotation_path = os.path.join(rumour_dir, str(article), 'annotation.json')
            with open(annotation_path, 'r', encoding='utf-8') as f:
                labels.append(_veracity_label(json.load(f)))
        tweets_rumour['label'] = labels
        tweets_rumour['rumour'] = 1
        frames.append(tweets_rumour)

    if frames:
        df = pd.concat(frames)
        # Keep the four core columns in front, followed by any other columns
        # the loader supplied.
        extra_columns = [col for col in df.columns if col not in lead_columns]
        df = df.reindex(columns=lead_columns + extra_columns)
    else:
        # No topic directories found: return an empty table with the schema.
        df = pd.DataFrame(columns=lead_columns)

    df['text'] = df['text'].apply(prep.prepare_text)

    out_dir = os.path.join(wdr_path, 'Preprocessed_Datasets')
    os.makedirs(out_dir, exist_ok=True)  # don't fail on a fresh checkout
    df.to_csv(os.path.join(out_dir, '012-Pheme.csv'))
    return df

In [9]:
# Run the preprocessing; this also writes the CSV as a side effect.
df = preprocess_pheme(wdr_path=wdr_path, ds_raw_path=ds_raw_path)

  0%|          | 0/0.77 [00:00<?, ?it/s]

  0%|          | 0/0.61 [00:00<?, ?it/s]

  0%|          | 0/16.21 [00:00<?, ?it/s]

  0%|          | 0/4.58 [00:00<?, ?it/s]

  0%|          | 0/8.59 [00:00<?, ?it/s]

  0%|          | 0/2.84 [00:00<?, ?it/s]

  0%|          | 0/4.2 [00:00<?, ?it/s]

  0%|          | 0/4.7 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/0.14 [00:00<?, ?it/s]

  0%|          | 0/2.31 [00:00<?, ?it/s]

  0%|          | 0/2.38 [00:00<?, ?it/s]

  0%|          | 0/1.12 [00:00<?, ?it/s]

  0%|          | 0/1.26 [00:00<?, ?it/s]

  0%|          | 0/6.99 [00:00<?, ?it/s]

  0%|          | 0/5.22 [00:00<?, ?it/s]

  0%|          | 0/0.04 [00:00<?, ?it/s]

  0%|          | 0/2.29 [00:00<?, ?it/s]