# Filter posts related to the 2016 U.S. presidential election

python=3.11

2016 POTUS Election


In [None]:
import pandas as pd
from joblib import Parallel, delayed  
from joblib import dump, load
import warnings
# Suppress every warning category for the rest of the session; this also
# hides warnings raised by pandas while reading or filtering the CSV files.
warnings.filterwarnings('ignore')

## Sift: keep only the posts that match the election keywords

In [None]:
# Search terms for election-related posts. `filter` joins them with "|" into
# one regular expression and matches it case-insensitively.
keywords = [
    "MAGA",
    "MakeAmericaGreatAgain",
    "Trump",
    "AmericaFirst",
    "Make America Great Again",
    "Hillary",
    "Clinton",
    "ImWithHer",
    "OHHillYes",
    "election2016",
    "elections2016",
    "debates2016",
]


def filter(csv_file_index, keywords, source_path, target_path, result_file_suffix):
    """Extract the rows of one CSV shard that mention any of the keywords.

    Reads ``<source_path><csv_file_index>.csv``, keeps every row whose
    ``text`` or ``entities.hashtags`` column matches at least one keyword
    (case-insensitive), drops rows that repeat an ``id`` (first one wins) and
    writes the result to
    ``<target_path><csv_file_index><result_file_suffix>.csv``.

    NOTE: the name shadows the builtin ``filter``; it is kept unchanged
    because existing callers refer to it.

    Args:
        csv_file_index: Shard number; converted with ``str`` and used as the
            input and output file stem.
        keywords: Strings joined with ``|`` into a single regular expression,
            so regex metacharacters inside a keyword are interpreted as regex.
        source_path: Path prefix of the input file (string-concatenated, so
            it must already end with a path separator).
        target_path: Path prefix of the output file (same convention).
        result_file_suffix: Text inserted between the shard number and
            ``.csv`` in the output file name.

    Returns:
        None. The filtered rows are written to disk.
    """
    # Build the alternation once and reuse it for both columns.
    pattern = '|'.join(keywords)
    df = pd.read_csv(source_path + str(csv_file_index) + ".csv")

    # na=False: a missing text value counts as "no match". Without it the mask
    # contains NaN and boolean indexing raises ValueError.
    in_text = df['text'].str.contains(pattern, case=False, na=False)
    # Hashtags are stringified first, so missing values become the text "nan".
    in_hashtags = df['entities.hashtags'].astype(str).str.contains(pattern, case=False)

    # One OR-ed mask keeps the original row order and selects each row once,
    # which is what concatenating both selections and sorting by index did.
    matched = df[in_text | in_hashtags].drop_duplicates(subset='id', keep='first')
    matched.to_csv(target_path + str(csv_file_index) + result_file_suffix + ".csv", index=False)

In [None]:

import os

# Inclusive range of shard numbers (file stems) to scan.
begin = 1
end = 27
source_folder = "/mnt/data/Project7/fakenews/csv/"
target_folder = "data/csv_potus_election2016/"
suffix = "_potus_election2016"

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the workers start writing into it.
os.makedirs(target_folder, exist_ok=True)

# One `filter` call per shard, with up to 5 running concurrently.
Parallel(n_jobs=5)(
    delayed(filter)(ind, keywords, source_folder, target_folder, suffix)
    for ind in range(begin, end + 1)
)

In [None]:

# Merge the per-shard results into one table. The shard range comes from the
# same begin/end values used when the shards were filtered, so the two steps
# cannot drift apart.
results = [
    pd.read_csv(target_folder + str(i) + suffix + ".csv")
    for i in range(begin, end + 1)
]
df_cat = pd.concat(results, axis=0)
# Drop any id that repeats across shards, keeping its first occurrence.
df_cat.drop_duplicates(subset='id', keep='first', inplace=True)
# Replace the concatenated per-shard indices with one 0..n-1 range.
df_cat.reset_index(drop=True, inplace=True)
df_cat.to_csv("data/full_data_potus_election2016.csv", index=False)

## All posts related to the 2016 U.S. presidential election

In [5]:
# Print the index range and the per-column dtype summary of the merged table.
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31784145 entries, 0 to 31784144
Data columns (total 83 columns):
 #   Column                                 Dtype  
---  ------                                 -----  
 0   id                                     int64  
 1   conversation_id                        int64  
 2   referenced_tweets.replied_to.id        float64
 3   referenced_tweets.retweeted.id         float64
 4   referenced_tweets.quoted.id            float64
 5   author_id                              int64  
 6   in_reply_to_user_id                    float64
 7   in_reply_to_username                   object 
 8   retweeted_user_id                      float64
 9   retweeted_username                     object 
 10  quoted_user_id                         float64
 11  quoted_username                        object 
 12  created_at                             object 
 13  text                                   object 
 14  lang                                   object 
 

In [None]:

# Distinct usernames per role in the merged table; missing values are skipped.
authors = set(df_cat['author.username'].dropna())
in_reply_to_users = set(df_cat['in_reply_to_username'].dropna())
retweeted_users = set(df_cat['retweeted_username'].dropna())
quoted_users = set(df_cat['quoted_username'].dropna())
# Every username that occurs in at least one of the four roles.
all_users = authors.union(in_reply_to_users, retweeted_users, quoted_users)

# Report the size of each set.
for _label, _group in (
    ("authors", authors),
    ("in_reply_to_users", in_reply_to_users),
    ("retweeted_users", retweeted_users),
    ("quoted_users", quoted_users),
    ("all_users", all_users),
):
    print(f"{_label}:", len(_group))

authors: 4480831
in_reply_to_users: 672440
retweeted_users: 323879
quoted_users: 90634
all_users: 4688742


In [None]:
# Persist each username set with joblib under pkl/.
save_path = "pkl/"
dump(authors, f"{save_path}authors[type=set].pkl")
dump(in_reply_to_users, f"{save_path}in_reply_to_users[type=set].pkl")
dump(retweeted_users, f"{save_path}retweeted_users[type=set].pkl")
dump(quoted_users, f"{save_path}quoted_users[type=set].pkl")
dump(all_users, f"{save_path}all_users[type=set].pkl")

['pkl/all_users[type=set].pkl']

## English

In [4]:
# Keep the rows whose `lang` column equals 'en', then show (rows, columns).
df_en = df_cat.loc[df_cat['lang'].eq('en')]
df_en.shape

(30822858, 83)

In [None]:

# Disabled export: writes df_en to the same CSV path that is read back below
# (presumably run once to create the file - TODO confirm).
#df_en.to_csv("data/full_data_potus_election2016[lang=en].csv", index=False)
# Reload the English-only subset from that CSV.
df_en = pd.read_csv("data/full_data_potus_election2016[lang=en].csv")

In [None]:

# Disabled export: joblib-dumps df_en to the same pickle path that is loaded
# below (presumably run once to create the file - TODO confirm).
#dump(df_en, "pkl/full_data_potus_election2016[lang=en][type=pd.DataFrame].pkl")
# Reload the English-only DataFrame from the joblib pickle; this replaces
# whatever df_en currently holds.
df_en = load("pkl/full_data_potus_election2016[lang=en][type=pd.DataFrame].pkl")

['pkl/full_data_potus_election2016[lang=en][type=pd.DataFrame].pkl']

In [None]:

# Distinct usernames per role in the English subset; missing values are skipped.
authors = set(df_en['author.username'].dropna())
in_reply_to_users = set(df_en['in_reply_to_username'].dropna())
retweeted_users = set(df_en['retweeted_username'].dropna())
quoted_users = set(df_en['quoted_username'].dropna())

# Every username that occurs in at least one of the four roles.
all_users = authors.union(in_reply_to_users, retweeted_users, quoted_users)

# Report the size of each set.
for _label, _group in (
    ("authors", authors),
    ("in_reply_to_users", in_reply_to_users),
    ("retweeted_users", retweeted_users),
    ("quoted_users", quoted_users),
    ("all_users", all_users),
):
    print(f"{_label}:", len(_group))

authors: 4079151
in_reply_to_users: 614905
retweeted_users: 295984
quoted_users: 81622
all_users: 4257569


In [None]:

# Persist the English-subset username sets with joblib under pkl/.
save_path = "pkl/"
dump(authors, f"{save_path}authors[lang=en][type=set].pkl")
dump(in_reply_to_users, f"{save_path}in_reply_to_users[lang=en][type=set].pkl")
dump(retweeted_users, f"{save_path}retweeted_users[lang=en][type=set].pkl")
dump(quoted_users, f"{save_path}quoted_users[lang=en][type=set].pkl")
dump(all_users, f"{save_path}all_users[lang=en][type=set].pkl")

['pkl/all_users[lang=en][type=set].pkl']

## October 10, 2016 to December 19, 2016

In [2]:
# Load the English-only DataFrame from its joblib pickle.
df_en = load("pkl/full_data_potus_election2016[lang=en][type=pd.DataFrame].pkl")

In [3]:
# Rows whose created_at string contains "2016", ordered by timestamp and
# renumbered from 0.
has_2016 = df_en['created_at'].str.contains('2016', case=False)
df_t = (
    df_en[has_2016]
    .sort_values(by='created_at')
    .reset_index(drop=True)
)

In [4]:
# Select the analysis window by timestamp instead of a hand-found row number,
# so the cut stays correct if the upstream data changes. created_at holds
# ISO-8601 strings (e.g. "2016-10-10T13:46:17.000Z"), which compare
# chronologically as plain text; the window is half-open:
# 2016-10-10 <= created_at < 2016-12-20.
in_window = (df_t['created_at'] >= "2016-10-10") & (df_t['created_at'] < "2016-12-20")
df_t[in_window].to_csv("data/2016.10.10_to_12.19_potus_election2016[lang=en].csv", index=False)
# Display the timestamps up to just past the cut-off for a visual check.
df_t.loc[:667440, ['created_at']]

Unnamed: 0,created_at
0,2016-10-10T13:46:17.000Z
1,2016-10-10T13:46:17.000Z
2,2016-10-10T13:46:18.000Z
3,2016-10-10T13:46:19.000Z
4,2016-10-10T13:46:20.000Z
...,...
667436,2016-12-19T23:59:54.000Z
667437,2016-12-19T23:59:54.000Z
667438,2016-12-20T00:00:04.000Z
667439,2016-12-20T00:00:22.000Z


In [5]:
# Restrict df_t to 2016-10-10 <= created_at < 2016-12-20 by comparing the
# ISO-8601 timestamp strings, rather than slicing at a hard-coded row number
# that goes stale if the data changes. The index is renumbered from 0.
in_window = (df_t['created_at'] >= "2016-10-10") & (df_t['created_at'] < "2016-12-20")
df_t = df_t[in_window].reset_index(drop=True)

In [None]:

# Distinct usernames per role in the time-window subset; missing values are
# skipped.
authors = set(df_t['author.username'].dropna())
in_reply_to_users = set(df_t['in_reply_to_username'].dropna())
retweeted_users = set(df_t['retweeted_username'].dropna())
quoted_users = set(df_t['quoted_username'].dropna())

# Every username that occurs in at least one of the four roles.
all_users = authors.union(in_reply_to_users, retweeted_users, quoted_users)

# Report the size of each set.
for _label, _group in (
    ("authors", authors),
    ("in_reply_to_users", in_reply_to_users),
    ("retweeted_users", retweeted_users),
    ("quoted_users", quoted_users),
    ("all_users", all_users),
):
    print(f"{_label}:", len(_group))

authors: 264272
in_reply_to_users: 8957
retweeted_users: 15892
quoted_users: 3431
all_users: 268677


In [None]:

# Persist the time-window username sets with joblib under pkl/.
# NOTE: this rebinds `suffix`, which earlier held the CSV file-name suffix.
save_path = "pkl/"
suffix = "[time=16.10.10-16.12.19][lang=en][type=set]"
dump(authors, f"{save_path}authors{suffix}.pkl")
dump(in_reply_to_users, f"{save_path}in_reply_to_users{suffix}.pkl")
dump(retweeted_users, f"{save_path}retweeted_users{suffix}.pkl")
dump(quoted_users, f"{save_path}quoted_users{suffix}.pkl")
dump(all_users, f"{save_path}all_users{suffix}.pkl")

['pkl/all_users[time=16.10.10-16.12.19][lang=en][type=set].pkl']