# Fake Tweets Detection using Concurrent Neural Networks

We need to grab the sample dataset we're using in this research.

In [1]:
# Create the download directory; -p makes this a no-op when it already exists
! mkdir -p ./dataset
# Fetch the archive from figshare and store it as ./dataset/pheme_veracity.tar.bz2
! wget "https://ndownloader.figshare.com/files/11767817" -O "./dataset/pheme_veracity.tar.bz2"

--2020-03-01 13:55:02--  https://ndownloader.figshare.com/files/11767817
Resolving ndownloader.figshare.com (ndownloader.figshare.com)... 52.17.168.209, 54.229.248.2, 34.252.157.212, ...
Connecting to ndownloader.figshare.com (ndownloader.figshare.com)|52.17.168.209|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/11767817/PHEME_veracity.tar.bz2 [following]
--2020-03-01 13:55:03--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/11767817/PHEME_veracity.tar.bz2
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.106.59
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.106.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46529729 (44M) [binary/octet-stream]
Saving to: ‘./dataset/pheme_veracity.tar.bz2’


2020-03-01 13:55:08 (10.3 MB/s) - ‘./dataset/pheme_veracity.tar.bz2’ saved [46529729/46529729]



In [2]:
# Unpack the downloaded archive; -C makes tar extract into ./dataset
! tar xC ./dataset -f ./dataset/pheme_veracity.tar.bz2

Let's start cleaning up the dataset. Because we're not using the thread-based annotation system of this dataset, we can go ahead and flatten the folder structure.

In [3]:
# Gather the contents of every non-rumours/ and rumours/ folder into one flat directory
! mkdir -p ./flatten1
# rsync -a copies recursively and preserves file attributes.
# NOTE(review): unless the shell has globstar enabled, `**` expands like `*`, so these
# patterns match exactly two directory levels below ./dataset - confirm against the archive layout.
! rsync -a ./dataset/**/**/non-rumours/* ./flatten1
! rsync -a ./dataset/**/**/rumours/* ./flatten1

In [4]:
from pathlib import Path

# Root of the flattened dataset produced by the rsync step
rootdir = Path('./flatten1')
# Keep only the directories directly under rootdir; loose files are skipped
tweet_folders = list(filter(Path.is_dir, rootdir.glob('*')))

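The PHEME project helpfully provides a Python function that converts the raw annotations into "Verified True", "Verified False" and "Unverified" tags. In this notebook the string label for unverified tweets is replaced with `None`, so those rows can be dropped later.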

In [5]:
def convert_annotations(annotation, string = True):
    """Convert a raw PHEME annotation into a veracity label.

    The label is derived from the ``misinformation`` and ``true`` flags of the
    annotation. A missing ``true`` flag is treated like ``true == 0``.

    ==============  ====  ============  =============
    misinformation  true  string=True   string=False
    ==============  ====  ============  =============
    0               0     None          2
    0               1     "true"        1
    1               0     "false"       0
    1               1     None          None
    ==============  ====  ============  =============

    In string mode, unverified tweets (0/0) deliberately yield ``None``
    instead of ``"unverified"`` so that they can be removed together with
    the other unlabeled rows.

    Args:
        annotation: Mapping with the raw annotation fields. Flag values may be
            ints or numeric strings; they are passed through ``int()``.
        string: If True return the textual label, otherwise the numeric code.

    Returns:
        The label as described above, or ``None`` when ``misinformation`` is
        missing or the flag combination is not a recognised one.

    Raises:
        ValueError: If a flag value cannot be converted with ``int()``.
    """
    # Without a 'misinformation' flag no label can be derived.
    if 'misinformation' not in annotation:
        return None

    misinformation = int(annotation['misinformation'])
    is_true = int(annotation['true']) if 'true' in annotation else 0

    # (misinformation, true) -> (string label, numeric label)
    labels = {
        (0, 0): (None, 2),       # unverified
        (0, 1): ("true", 1),
        (1, 0): ("false", 0),
    }

    # Any other combination (contradictory 1/1 or out-of-range values) is
    # unlabeled. Previously out-of-range values left `label` unassigned and
    # the function failed with UnboundLocalError.
    text_label, numeric_label = labels.get((misinformation, is_true), (None, None))
    return text_label if string else numeric_label

In [6]:
import json

def get_source_tweet_path(tweet_id):
    """Return the path of the source tweet's JSON file inside ./flatten1.

    Args:
        tweet_id: Tweet id as a string; used both as the folder name and as
            the JSON file's base name.
    """
    segments = ['./flatten1', tweet_id, 'source-tweets', tweet_id + '.json']
    return Path('/'.join(segments))

def get_annotation_path(tweet_id):
    """Return the path of the annotation.json file in the tweet's folder.

    Args:
        tweet_id: Tweet id as a string; it is the folder name under ./flatten1.
    """
    segments = ('./flatten1', tweet_id, 'annotation.json')
    return Path('/'.join(segments))

def parse_tweet(tweet_id):
    """Build a flat feature record for one source tweet.

    Loads the annotation file and the source tweet JSON located via
    ``get_annotation_path`` / ``get_source_tweet_path``, converts the
    annotation with ``convert_annotations`` and picks out the fields used
    as features.

    Args:
        tweet_id: Tweet id as a string (the name of the tweet's folder).

    Returns:
        dict with the keys ``id``, ``text``, ``num_mentions``,
        ``num_hashtags``, ``num_urls``, ``has_media``, ``num_likes``,
        ``num_retweets``, ``user_verified``, ``user_no_profile_image``,
        ``user_num_friends``, ``user_num_followers``, ``user_num_lists``,
        ``user_num_tweets``, ``user_num_favourite_tweets``,
        ``user_protected``, ``has_location``, ``language`` and
        ``annotation``.

    Raises:
        OSError: If one of the two files cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If the tweet JSON lacks one of the required fields.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(get_annotation_path(tweet_id), encoding="utf-8") as f:
        annotation = convert_annotations(json.load(f))

    with open(get_source_tweet_path(tweet_id), encoding="utf-8") as f:
        raw_tweet = json.load(f)

    entities = raw_tweet["entities"]
    user = raw_tweet["user"]

    return {
        "id": raw_tweet["id"],

        # Style
        "text": raw_tweet["text"],
        "num_mentions": len(entities["user_mentions"]),
        "num_hashtags": len(entities["hashtags"]),
        "num_urls": len(entities["urls"]),
        # The 'media' key is only looked up for presence, not for its content
        "has_media": "media" in entities,

        # Propagation
        "num_likes": raw_tweet["favorite_count"],
        "num_retweets": raw_tweet["retweet_count"],

        # Credibility
        "user_verified": user["verified"],
        "user_no_profile_image": user["default_profile_image"],
        "user_num_friends": user["friends_count"],
        "user_num_followers": user["followers_count"],
        "user_num_lists": user["listed_count"],
        "user_num_tweets": user["statuses_count"],
        "user_num_favourite_tweets": user["favourites_count"],
        "user_protected": user["protected"],

        # A missing or null 'coordinates' field both mean "no location"
        "has_location": raw_tweet.get("coordinates") is not None,

        # For filtering
        "language": raw_tweet["lang"],

        "annotation": annotation,
    }

In [7]:
# Parse each tweet folder that still exists on disk; the folder name is passed as the tweet id
tweets = list(map(
    parse_tweet,
    (folder.name for folder in tweet_folders if folder.exists()),
))

In [8]:
import pandas as pd

# One row per parsed tweet; the dictionary keys become the columns
tweets_df = pd.DataFrame(tweets)

In [9]:
# Remove, in place, every row with a missing value in any column - in particular
# tweets for which convert_annotations returned None (unverified or unlabeled).
tweets_df.dropna(inplace=True)

In [10]:
# Keep English tweets only: first remove the rows in other languages,
# then remove the 'language' column, which is no longer needed.
non_en_indexes = tweets_df.loc[tweets_df["language"] != "en"].index
tweets_df.drop(index=non_en_indexes, inplace=True)
tweets_df.drop(columns=['language'], inplace=True)

In [11]:
# Summary statistics of the numeric columns, as a sanity check after cleaning
tweets_df.describe()

Unnamed: 0,id,num_hashtags,num_likes,num_mentions,num_retweets,num_urls,user_num_favourite_tweets,user_num_followers,user_num_friends,user_num_lists,user_num_tweets
count,1705.0,1705.0,1705.0,1705.0,1705.0,1705.0,1705.0,1705.0,1705.0,1705.0,1705.0
mean,5.431026e+17,0.843988,126.095601,0.203519,287.282111,0.531378,2245.202346,1668577.0,3525.85044,18463.1,57486.170674
std,1.789267e+16,0.930855,1701.099923,0.512965,1023.60035,0.516495,11002.962004,3881032.0,16372.334001,64505.7,61227.214167
min,4.982543e+17,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,4.0
25%,5.250674e+17,0.0,14.0,0.0,100.0,0.0,15.0,12530.0,296.0,262.0,11447.0
50%,5.443001e+17,1.0,36.0,0.0,145.0,1.0,161.0,141842.0,570.0,1966.0,37117.0
75%,5.528345e+17,1.0,82.0,0.0,266.0,1.0,843.0,893725.0,1644.0,13365.0,94820.0
max,5.815507e+17,7.0,69825.0,6.0,37264.0,2.0,208101.0,25299870.0,453460.0,2275623.0,621399.0


In [12]:
# Peek at the first rows of the cleaned frame
tweets_df.head()

Unnamed: 0,annotation,has_location,has_media,id,num_hashtags,num_likes,num_mentions,num_retweets,num_urls,text,user_no_profile_image,user_num_favourite_tweets,user_num_followers,user_num_friends,user_num_lists,user_num_tweets,user_protected,user_verified
5,True,False,False,553585504093224962,1,165,0,371,1,Suspected #CharlieHebdo killers are in same ji...,False,13,22719054,113,160916,37210,False,True
6,False,False,False,544277728930062336,0,56,0,251,0,SYDNEY AIRSPACE CLOSED,False,299,201996,3446,2228,21817,False,False
9,True,False,False,544519622389956609,1,110,0,107,0,thank god #SydneySiege is over. so much respe...,False,20745,224483,7810,1298,53735,False,False
13,True,False,False,580327336505176064,0,44,0,383,1,"Germanwings plane crashes in France, up to 150...",False,841,6446943,1053,91710,117720,False,True
16,False,False,True,552981833189969921,1,134,0,233,0,Nice work from Banksy #JeSuisCharlie http://t....,False,1025,119372,222,837,3795,False,False


We have now parsed the tweet content and the "true"/"false" label of each tweet. Let's write the result to a CSV file.

In [13]:
# Write the cleaned frame to tweets.csv; index=False leaves the DataFrame index out of the file
tweets_df.to_csv("tweets.csv", index=False)