In [1]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

In [6]:
# Raise pandas' display limits so wide frames, long frames and (almost) full
# tweet text are shown instead of being truncated; floats print with 2 decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.options.display.float_format = '{:.2f}'.format

# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is a deprecated path.
from IPython.display import display, HTML

# Inject CSS that sets the notebook's `.container` element to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [7]:
# Folder holding one cleaned, labeled tweet file per user.
CLEAN_TWEETS_READ_PATH = r"/arc/project/st-tlemieux-1/data/Cleaned-Labeled-Tweets/"

# This should be a folder containing three folders - one for each type of CT
# tweet. Make sure it ends in a slash: other cells build output paths from it
# by plain string concatenation.
MERGED_TWEET_CHUNK_SAVE_PATH = r"/scratch/st-tlemieux-1/lfrymire/BERT-input/FINAL-INPUTS/ALL-TWEETS/"

# Make the per-type subfolders. exist_ok=True makes re-running this cell a
# no-op and avoids the check-then-create race of os.path.exists + os.makedirs.
for type_of_tweets in ["GEN-CT-Tweets", "COV-CT-Tweets", "NON-COV-CT-Tweets"]:
    os.makedirs(os.path.join(MERGED_TWEET_CHUNK_SAVE_PATH, type_of_tweets), exist_ok=True)

In [8]:
def get_files_in_folder(folder):
    """
    List the files that sit directly inside `folder`.

    input = folder (path-like): directory to list
    output = list of entry names (not full paths) for which os.path.isfile is
             true; sub-directories are skipped and the listing is not recursive
    """
    file_names = []
    for entry in listdir(folder):
        if isfile(join(folder, entry)):
            file_names.append(entry)
    return file_names

In [5]:
#users = get_files_in_folder(CLEAN_TWEETS_READ_PATH)

In [9]:
# Restore `users` from IPython's %store persistence rather than re-listing
# CLEAN_TWEETS_READ_PATH (see the commented-out get_files_in_folder call).
# NOTE(review): no `%store users` is visible in this notebook, so a fresh
# kernel on another machine has nothing to restore - confirm where it is saved.
%store -r users

## Get subset of geolocated users

In [None]:
# Load the geolocated-user table and keep only the entries of `users` that also
# occur in its 'ID' column. np.intersect1d returns the sorted, unique common values.
# NOTE(review): `users` entries are later appended to CLEAN_TWEETS_READ_PATH as
# file names, so this presumes 'ID' holds the same kind of value - confirm.
geolocated_users = pd.read_csv(
    r"/arc/project/st-tlemieux-1/data/FINAL-CLEANED-GEOLOCATED-USERS.csv"
)

users = np.intersect1d(
    np.array(users),
    geolocated_users['ID'],
)

## Clean Tweets

In [3]:
# Words listed as non-CT (per the variable name). Each one is also matched in
# its '#'-prefixed form, added just below.
non_CT_words_and_tags = [
    'essentialworkers',
    'pfizerproud',
    'vaccineswork',
    'sciencewillwin',
    'flattenthecurve',
    'maskssavelives'
]

non_CT_words_and_tags = non_CT_words_and_tags + [f'#{x}' for x in non_CT_words_and_tags]


# Hashtags (without the leading '#') that are stripped from tweets.
general_conspiracy_hashtags = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'nwo',
    'covid1984',
    'plandemia',
    'agenda21',
    'thegreatreset',
    'agenda2030',
    'newworldorder',
    'wakeupamerica',
#     'wakeup',
    'openamericanow',
    'firefauci',
    'wwg1wga',
    'qanon',
    'coronahoax',
    
    'essentialworkers',
    'pfizerproud',
    'vaccineswork',
    'sciencewillwin'
]

# Plain-text keywords that are stripped from tweets; some span several words.
keywords = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'covid hoax',
    'covid1984',
    'plandemia',
    'new world order',
    'wake up america',
    'open america now',
    'fire fauci',
    'wwg1wga',
    'qanon',
    'coronahoax',
    'corona hoax',
]

CT_link_list = ['zerohedge.com', 'infowars.com', 'principia-scientific.com',
'tx.voice-truth.com', 'humansarefree.com', 'activistpost.com',
'gnews.org', 'wakingtimes.com', 'brighteon.com','thewallwillfall.org','sott.net',]


# All lookups are done on upper-cased text, so every set is stored upper-cased.
hashtag_set = {'#' + tag.upper() for tag in general_conspiracy_hashtags}
keyword_set = {x.upper() for x in keywords}

# The list above is lower-case; comparing `token.upper()` against it directly
# can never match, so build an upper-cased set for the lookup.
non_CT_set = {x.upper() for x in non_CT_words_and_tags}

# Every single-token term that must be dropped (O(1) membership test).
_tokens_to_drop = hashtag_set | keyword_set | non_CT_set

# Multi-word keywords as tuples of upper-cased tokens, longest first. A phrase
# such as 'COVID HOAX' can never equal one whitespace-split token, so phrases
# are matched against runs of consecutive tokens instead.
_keyword_phrases = sorted(
    (tuple(phrase.split()) for phrase in keyword_set if len(phrase.split()) > 1),
    key=len,
    reverse=True,
)


def _keyword_phrase_length_at(upper_tokens, start):
    '''Return the token count of the longest keyword phrase starting at `start`, or 0.'''
    for phrase in _keyword_phrases:
        if tuple(upper_tokens[start:start + len(phrase)]) == phrase:
            return len(phrase)
    return 0


def clean_tweet(tweet):
    '''
    Strip links, mentions and the listed keywords/hashtags from a tweet.

    input = tweet (str)
    output = cleaned_tweet(str)

    The tweet is split on whitespace. A token is dropped when it starts with
    'http' or '@', or when (case-insensitively) it is one of the keywords,
    '#'-prefixed conspiracy hashtags or non-CT words/tags. Runs of tokens that
    spell a multi-word keyword (e.g. 'covid hoax') are dropped as a unit.
    '#' characters are removed from the surviving tokens, which are re-joined
    with single spaces.

    Matching is on whole whitespace-delimited tokens, so a term with attached
    punctuation (e.g. 'plandemic.') is not matched. A non-string input (such
    as NaN for a missing tweet) yields ''.
    '''
    if not isinstance(tweet, str):
        return ''

    tokens = tweet.split()
    upper_tokens = [token.upper() for token in tokens]

    kept = []
    i = 0
    while i < len(tokens):
        phrase_length = _keyword_phrase_length_at(upper_tokens, i)
        if phrase_length:
            # Skip the whole phrase at once.
            i += phrase_length
            continue

        token = tokens[i]
        if not (token.startswith(('http', '@')) or upper_tokens[i] in _tokens_to_drop):
            kept.append(token.replace('#', ''))
        i += 1

    return ' '.join(kept)

In [10]:
# Columns read from each per-user tweet file.
cols_to_use = [
    'COVID-SPECIFIC CT Tweet (Dummy)',
    'CT Tweet (Dummy)',
    'NON-COVID CT Tweet (Dummy)',
    'date',
    'Cleaned Tweet',
    'language',
    'retweet',
]

# The same columns grouped by the dtype they are parsed as.
bool_cols = [
    'COVID-SPECIFIC CT Tweet (Dummy)',
    'CT Tweet (Dummy)',
    'NON-COVID CT Tweet (Dummy)',
    'retweet',
]
str_cols = ['date', 'Cleaned Tweet']
float_cols = ['language']

bool_cols_dict = dict.fromkeys(bool_cols, np.bool_)
str_cols_dict = dict.fromkeys(str_cols, str)
float_cols_dict = dict.fromkeys(float_cols, np.float16)

# Combined column -> dtype mapping (bool columns first, then str, then float).
datatypes = dict(bool_cols_dict)
datatypes.update(str_cols_dict)
datatypes.update(float_cols_dict)

In [43]:
# Output subfolder name -> dummy column used as the label for that tweet type.
type_to_label_dict = {"GEN-CT-Tweets":'CT Tweet (Dummy)',
                      "COV-CT-Tweets":'COVID-SPECIFIC CT Tweet (Dummy)',
                      "NON-COV-CT-Tweets": 'NON-COVID CT Tweet (Dummy)'}

def read_and_process_df(USER_ID, READ_PATH, type_of_tweet, bad_list):
    """
    Read one user's tweet file and return its cleaned rows.

    Parameters
    ----------
    USER_ID : value stored in the added 'user_id' column, and appended to
        `bad_list` on failure.
    READ_PATH : path of the CSV to read (columns `cols_to_use`, dtypes `datatypes`).
    type_of_tweet : currently unused; kept so existing calls keep working.
    bad_list : list that collects the USER_ID of every file that could not be
        read or processed (mutated in place).

    Returns
    -------
    DataFrame with only the rows where 'language' == 1, without the 'language'
    column, with a 'user_id' column added and 'Cleaned Tweet' passed through
    `clean_tweet`; or None when reading/processing failed.
    """
    try:
        raw_df = pd.read_csv(READ_PATH, usecols=cols_to_use, dtype=datatypes)

        # .drop() returns a new frame, so the assignments below do not write
        # into a slice of raw_df (no SettingWithCopyWarning).
        df = raw_df.loc[raw_df['language'] == 1].drop(columns='language')

        df['user_id'] = USER_ID

        # The column is named 'Cleaned Tweet' (see cols_to_use); the former
        # 'Cleaned Tweets' raised KeyError and made every call fail.
        df['Cleaned Tweet'] = df['Cleaned Tweet'].apply(clean_tweet)

        return df

    except Exception:
        # Best effort: remember the user and carry on with the others.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
        bad_list.append(USER_ID)
        return None

In [None]:
USERS_PER_CHUNK = 4000

# np.array_split takes the NUMBER of sections, not their size, so convert the
# desired chunk size into a section count (at least 1, so an empty `users`
# still yields one empty chunk instead of an error).
# NOTE: chunk boundaries and indices therefore differ from any files written
# with the previous 4000-way split.
num_chunks = max(1, int(np.ceil(len(users) / USERS_PER_CHUNK)))
user_chunk_splits = np.array_split(users, num_chunks)

# Filled by read_and_process_df with every user whose file could not be processed.
unreadable_users = []

for type_of_tweets in ["COV-CT-Tweets"]: #["GEN-CT-Tweets", "COV-CT-Tweets", "NON-COV-CT-Tweets"]:

    label_column = type_to_label_dict[type_of_tweets]

    # total= is needed because enumerate() has no len() for tqdm to read.
    for i, chunk_users in tqdm(enumerate(user_chunk_splits), total=len(user_chunk_splits)):

        chunk_save_path = fr"{MERGED_TWEET_CHUNK_SAVE_PATH}{type_of_tweets}/MERGED_CHUNK_{i}.csv"

        # Skip chunks already written. The check must use the same name
        # (including '.csv') as the file written below.
        if os.path.exists(chunk_save_path):
            continue

        user_tweet_dfs = [read_and_process_df(USER_ID=user,
                                              READ_PATH=CLEAN_TWEETS_READ_PATH + user,
                                              type_of_tweet=type_of_tweets,
                                              bad_list=unreadable_users) for user in chunk_users]

        # Unreadable users come back as None; pd.concat raises if nothing is left.
        user_tweet_dfs = [user_df for user_df in user_tweet_dfs if user_df is not None]
        if not user_tweet_dfs:
            continue

        merged_chunk = pd.concat(user_tweet_dfs).rename(columns={'Cleaned Tweet':'sentence1', label_column:'label'})
        merged_chunk.to_csv(chunk_save_path, index=False)

In [59]:
# Spot-check the reader on one user's file. Failures go to a throwaway list,
# in which case `test` is whatever read_and_process_df returns on failure.
test = read_and_process_df(
    READ_PATH=r'/arc/project/st-tlemieux-1/data/Cleaned-Labeled-Tweets/2440925667_CLEANED_TWEETS.csv',
    USER_ID=2440925667,
    bad_list=[],
    type_of_tweet='COV-CT-Tweets',
)

In [62]:
# Show only the rows flagged as COVID-specific CT tweets.
test.loc[test['COVID-SPECIFIC CT Tweet (Dummy)'] == 1]

Unnamed: 0,COVID-SPECIFIC CT Tweet (Dummy),CT Tweet (Dummy),NON-COVID CT Tweet (Dummy),date,Cleaned Tweet,retweet,user_id
2426,True,True,False,2021-02-23 10:19:29,"Fauci is a better actor than most of Hollywood! In other news, FireFauci",False,2440925667
10493,True,True,False,2020-07-27 08:29:07,BLM is made up of primarily rich white basement dwellers w daddy's money WakeUpAmerica,False,2440925667
