# Collect Dataset in One Dataframe

In [1]:
import pandas as pd
import json
import glob
#import pyodbc

In [2]:
json_files = glob.glob("data/03-extracted_tweets/*.json")

In [3]:
tweet_ids = set()  # Create a set to store unique tweet IDs

In [4]:
tweets_data = []

for file in json_files:
    with open(file, 'r') as f:
        json_data = json.load(f)
        for tweet in json_data['data']:
            tweet_id = tweet['id']
            if tweet_id not in tweet_ids:  # Check if tweet ID is already processed                
                annotations_text = []
                annotations_type = []
                cashtags = []
                hashtags = []
                mentions = []
                urls = []
                urls_titles = []
                urls_desc = []
                context_domain = []
                context_domain_desc = []
                context_entity = []
                context_entity_desc = []
                copyright = ''
                country_codes = []
                
                
                if 'entities' in tweet:
                    annotations_text = [annotation['normalized_text'] for annotation in tweet['entities'].get('annotations', [])]
                    annotations_type = [annotation['type'] for annotation in tweet['entities'].get('annotations', [])]
                    cashtags = [cashtag['tag'] for cashtag in tweet['entities'].get('cashtags', [])]
                    hashtags = [hashtag['tag'] for hashtag in tweet['entities'].get('hashtags', [])]
                    mentions = [mention['username'] for mention in tweet['entities'].get('mentions', [])]
                    urls = [url['expanded_url'] for url in tweet['entities'].get('urls', [])]
                    try:
                        urls_titles = [url['title'] for url in tweet['entities'].get('urls', [])]
                        urls_desc = [url['description'] for url in tweet['entities'].get('urls', [])]
                    except:
                        urls_titles = []
                        urls_desc = []
                    
                if 'context_annotations' in tweet:
                    for item in tweet['context_annotations']:
                        #print(item, end='\n.....\n')
                        context_domain.append(item['domain']['name'])
                        context_domain_desc.append(item['domain'].get('description', ''))
                        context_entity.append(item['entity']['name'])
                        context_entity_desc.append(item['entity'].get('description', ''))
                    
                    
                if 'withheld' in tweet:
                    copyright = tweet['withheld']['copyright']
                    country_codes = tweet['withheld']['country_codes']
                    
                tweet_data = {
                    'tweet_id': tweet_id,
                    'possibly_sensitive': tweet['possibly_sensitive'],
                    'retweet_count': tweet['public_metrics']['retweet_count'],
                    'reply_count': tweet['public_metrics']['reply_count'],
                    'like_count': tweet['public_metrics']['like_count'],
                    'quote_count': tweet['public_metrics']['quote_count'],
                    'bookmark_count': tweet['public_metrics']['bookmark_count'],
                    'impression_count': tweet['public_metrics']['impression_count'],
                    'user_id': tweet['author_id'],
                    'lang': tweet['lang'],
                    'annotations_text': annotations_text,
                    'annotations_type': annotations_type,
                    'cashtags': cashtags,
                    'hashtags': hashtags,
                    'mentions': mentions,
                    'urls': urls,
                    'urls_titles': urls_titles,
                    'urls_desc': urls_desc,
                    'context_domain': context_domain,
                    'context_domain_desc': context_domain_desc,
                    'context_entity': context_entity,
                    'context_entity_desc': context_entity_desc,
                    'copyright': copyright,
                    'withheld_country_codes': country_codes,
                    'created_at': tweet['created_at'],
                    'edits_remaining': tweet['edit_controls']['edits_remaining'],
                    'is_edit_eligible': tweet['edit_controls']['is_edit_eligible'],
                    'conversation_id': tweet['conversation_id'],
                    'text': tweet['text'],
                    'reply_settings': tweet['reply_settings']
                }
                tweets_data.append(tweet_data)
                tweet_ids.add(tweet_id)  # Add tweet ID to the s

print('Done..............!')

Done..............!


In [5]:
len(tweets_data)

20090

In [6]:
df_tweets = pd.DataFrame(tweets_data)

In [7]:
df_tweets.shape

(20090, 30)

In [8]:
df_tweets.sample(5)

Unnamed: 0,tweet_id,possibly_sensitive,retweet_count,reply_count,like_count,quote_count,bookmark_count,impression_count,user_id,lang,...,context_entity,context_entity_desc,copyright,withheld_country_codes,created_at,edits_remaining,is_edit_eligible,conversation_id,text,reply_settings
15604,1588014103007830019,False,0,1,0,0,0,0,762189552,ar,...,[],[],,[],2022-11-03T03:44:00.000Z,5,True,1588014103007830019,إصابة نوير بمرض سرطان الجلد \n\n#جريدة_الراية\...,everyone
18734,598547668525158400,False,0,0,0,0,0,0,382187225,ar,...,[],[],,[],2015-05-13T17:57:39.000Z,5,True,598547668525158400,لحظة وقوع زلزال نيبال الجديد من أحد برك السباح...,everyone
12503,1599831657766621184,False,15,18,164,4,2,0,846334047383105537,ar,...,[],[],,[],2022-12-05T18:22:45.000Z,5,True,1599831657766621184,صحفي إسرائيلي يثير جدلًا بصورة للنجم محمد #صلا...,everyone
10047,1618225156136239104,False,0,0,3,0,0,730,1564235326360625153,ar,...,[],[],,[],2023-01-25T12:31:57.000Z,5,False,1617603243105783819,@SamiALkateb2 لوس أنجلوس 1950 تقريبا \nليش تكذ...,everyone
13511,1527937067497533440,False,0,1,4,0,0,0,96946480,ar,...,[],[],,[],2022-05-21T08:59:19.000Z,5,True,1527937067497533440,وزارة الداخلية @MOI_Qatar:\nالدفاع المدني يسيط...,everyone


In [9]:
df_tweets.columns

Index(['tweet_id', 'possibly_sensitive', 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'bookmark_count', 'impression_count',
       'user_id', 'lang', 'annotations_text', 'annotations_type', 'cashtags',
       'hashtags', 'mentions', 'urls', 'urls_titles', 'urls_desc',
       'context_domain', 'context_domain_desc', 'context_entity',
       'context_entity_desc', 'copyright', 'withheld_country_codes',
       'created_at', 'edits_remaining', 'is_edit_eligible', 'conversation_id',
       'text', 'reply_settings'],
      dtype='object')

In [10]:
df_labels = pd.read_excel('data/04-VERA-ARB_Raw-labels.xlsx')

In [11]:
df_labels['tweet_id'] = df_labels['tweet_id'].astype('str')

In [12]:
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20084 entries, 0 to 20083
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet_id      20084 non-null  object
 1   claim         20084 non-null  int64 
 2   label         20084 non-null  object
 3   binary_label  20084 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 627.8+ KB


In [13]:
df_labels.head()

Unnamed: 0,tweet_id,claim,label,binary_label
0,1697950090223026335,0,misleading,1
1,1697935446666510767,1,misleading,1
2,1620816468328972288,595,misleading,1
3,1575998679198367744,975,false,1
4,1697673466311262549,2,false,1


In [14]:
df_tweets = df_tweets.merge(df_labels, on='tweet_id')

In [15]:
df_tweets.columns

Index(['tweet_id', 'possibly_sensitive', 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'bookmark_count', 'impression_count',
       'user_id', 'lang', 'annotations_text', 'annotations_type', 'cashtags',
       'hashtags', 'mentions', 'urls', 'urls_titles', 'urls_desc',
       'context_domain', 'context_domain_desc', 'context_entity',
       'context_entity_desc', 'copyright', 'withheld_country_codes',
       'created_at', 'edits_remaining', 'is_edit_eligible', 'conversation_id',
       'text', 'reply_settings', 'claim', 'label', 'binary_label'],
      dtype='object')

In [16]:
df_tweets.shape

(20084, 33)

In [17]:
df_tweets.to_excel('data/04-tweets_data.xlsx', index=False)

# Collect Users Dataframe

In [18]:
users_data = []
user_ids = set()  # Create a set to store unique user IDs

for file in json_files:
    with open(file, 'r') as f:
        json_data = json.load(f)
        for user in json_data['includes']['users']:
            user_id = user['id']
            if user_id not in user_ids:  # Check if user ID is already processed
                if 'entities' in user:
                    try:
                        user_url = user['entities']['url']['urls'][0]['expanded_url']
                    except:
                        user_url = ''
                    try:
                        user_desc_urls = [url['expanded_url'] for url in user['entities']['description'].get('urls', [])]
                    except:
                        user_desc_urls = []
                    try:
                        user_desc_hashtags = [url['tag'] for url in user['entities']['description'].get('hashtags', [])]
                    except:
                        user_desc_hashtags = []
                    try:
                        user_desc_mentions = [url['tag'] for url in user['entities']['description'].get('mentions', [])]
                    except:
                        user_desc_mentions = []
                    try:
                        user_desc_cashtags = [url['tag'] for url in user['entities']['description'].get('cashtags', [])]
                    except:
                        user_desc_cashtags = []
                        
                
                user_data = {
                    'user_id': user_id,
                    'user_created_at': user['created_at'],
                    'user_description': user['description'],
                    
                    'user_description_urls': user_desc_urls,
                    'user_description_hashtags': user_desc_hashtags,
                    'user_description_mentions': user_desc_mentions,
                    'user_description_cashtags': user_desc_cashtags,
                    
                    'location': user['location'] if 'location' in user else '',
                    'display_name': user['name'],
                    'user_name': user['username'],
                    'pinned_tweet_id': user['pinned_tweet_id'] if 'pinned_tweet_id' in user else '',
                    'profile_image_url': user['profile_image_url'],
                    'protected': user['protected'],
                    'followers_count': user['public_metrics']['followers_count'],
                    'following_count': user['public_metrics']['following_count'],
                    'tweet_count': user['public_metrics']['tweet_count'],
                    'listed_count': user['public_metrics']['listed_count'],
                    'user_like_count': user['public_metrics']['like_count'],
                    'user_url': user_url,
                    'verified': user['verified'],
                    'user_copyright': copyright,
                    'user_withheld_country_codes': country_codes
                }
                users_data.append(user_data)
                user_ids.add(user_id)  # Add user ID to the set
print('Done..............!')

Done..............!


In [19]:
len(users_data)

13817

In [20]:
df_users = pd.DataFrame(users_data)

In [21]:
df_users.shape

(13817, 22)

In [22]:
df_users.sample(5)

Unnamed: 0,user_id,user_created_at,user_description,user_description_urls,user_description_hashtags,user_description_mentions,user_description_cashtags,location,display_name,user_name,...,protected,followers_count,following_count,tweet_count,listed_count,user_like_count,user_url,verified,user_copyright,user_withheld_country_codes
8281,1485516684731498500,2022-01-24T07:36:21.000Z,,[],"[البيتكوين, BTC]",[],[],,yasser saad,yassersaad2121,...,False,28,170,393,0,326,https://t.me/llChartistll,False,,[]
13783,877769129939415040,2017-06-22T06:04:03.000Z,إن لم تعجبك شخصيتي ليست مشكلتي فغيرك ♥ يعشقها ...,[],[من],[],[],الجزائر,Yahya Elhamel 👌 حفيد عقبة ابن نافع الكنتي👌,yahya_Elhamel,...,False,226,406,6119,0,19526,,False,,[]
12659,1487472741292912645,2022-01-29T17:08:58.000Z,رسالتي السلام لكل الناس,[],[],[],[],,العراقي العربي الرافضي,abn_zym,...,False,130,810,446,0,3047,http://www.omandaily.om,False,,[]
457,1197288263708479488,2019-11-20T22:59:18.000Z,‏‏‏‏‏‏‏‏‏‏‏‏‏‏‏‏‏‏_ سَلامٌ مِن صَبا بَرَدى أَر...,[],[],[],[],,خــاڵـــد مـــﺣ͠ــمـــد شــمـيطي 🕊️,abualisar18,...,False,746,23,45295,5,68150,https://www.facebook.com/mutasimbakheet/,False,,[]
280,930697437630615554,2017-11-15T07:22:15.000Z,الحمدلله ،، الاتحاد نادي الشعب 💛,[],[],[],[],المملكة العربية السعودية,سارة الزهراني,ittigirls1,...,False,23736,872,48299,23,1113,http://www.youm7.com,False,,[]


In [23]:
df_users.columns

Index(['user_id', 'user_created_at', 'user_description',
       'user_description_urls', 'user_description_hashtags',
       'user_description_mentions', 'user_description_cashtags', 'location',
       'display_name', 'user_name', 'pinned_tweet_id', 'profile_image_url',
       'protected', 'followers_count', 'following_count', 'tweet_count',
       'listed_count', 'user_like_count', 'user_url', 'verified',
       'user_copyright', 'user_withheld_country_codes'],
      dtype='object')

In [24]:
df_users.to_excel('data/04-users_data.xlsx', index=False)

# Merge Tweets & Users into One Dataset

In [25]:
df_dataset = pd.merge(df_tweets, df_users, on='user_id')

In [26]:
df_dataset.shape

(20084, 54)

In [27]:
df_dataset.sample(5)

Unnamed: 0,tweet_id,possibly_sensitive,retweet_count,reply_count,like_count,quote_count,bookmark_count,impression_count,user_id,lang,...,protected,followers_count,following_count,tweet_count,listed_count,user_like_count,user_url,verified,user_copyright,user_withheld_country_codes
12128,1400815162685280261,False,7,0,9,1,0,0,1153000502109593600,ar,...,False,86,458,581,0,3266,https://www.youtube.com/@Ahmed_news,False,,[]
10105,1569068262851170304,False,0,0,0,0,0,0,624888540,ar,...,False,25400,2259,250629,84,190,http://www.UAE71.com,False,,[]
11806,1112735550832812032,False,0,0,0,0,0,0,1097516110029697025,ar,...,False,52,9,3322,0,35,http://bit.ly/SNPYoutube,False,,[]
16132,1591933693916221441,False,1,0,5,0,0,0,1543639356086394881,ar,...,False,3335,4996,35004,4,29501,,False,,[]
1975,1639400515619389442,False,20,13,78,7,11,21499,1109573624711778305,ar,...,False,5874,352,12088,57,29895,http://algadtv.com,False,,[]


In [28]:
df_dataset.columns

Index(['tweet_id', 'possibly_sensitive', 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'bookmark_count', 'impression_count',
       'user_id', 'lang', 'annotations_text', 'annotations_type', 'cashtags',
       'hashtags', 'mentions', 'urls', 'urls_titles', 'urls_desc',
       'context_domain', 'context_domain_desc', 'context_entity',
       'context_entity_desc', 'copyright', 'withheld_country_codes',
       'created_at', 'edits_remaining', 'is_edit_eligible', 'conversation_id',
       'text', 'reply_settings', 'claim', 'label', 'binary_label',
       'user_created_at', 'user_description', 'user_description_urls',
       'user_description_hashtags', 'user_description_mentions',
       'user_description_cashtags', 'location', 'display_name', 'user_name',
       'pinned_tweet_id', 'profile_image_url', 'protected', 'followers_count',
       'following_count', 'tweet_count', 'listed_count', 'user_like_count',
       'user_url', 'verified', 'user_copyright',
       'us

In [29]:
df_dataset.to_excel('data/04-VERA-ARB_Raw-dataset.xlsx', index=False)