In [3]:
import pandas as pd
import numpy as np
import time
import os
from os import listdir
from os.path import isfile, join
from ast import literal_eval
import re
from tqdm import tqdm

In [4]:
# Notebook display configuration: widen the page container and relax pandas'
# truncation limits so long tweets and wide dummy-column frames render in full.
#
# `display` / `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` has been deprecated for a long
# time and fails on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:97.5% !important; }</style>"))

pd.set_option('display.max_colwidth', None)  # never truncate tweet text
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# What this file does:

1. Label each tweet as a CT tweet or not (Naive approach)
    * Contains Hashtags?
    * Contains Links?
2. Clean Tweets for BERT
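    * Each tweet is split into whitespace-separated tokens, filtered, and re-joined into a single string
    * `[CLS]` and `[SEP]` tokens are not added by `clean_tweet`; it returns a plain string
    * All CT hashtags are removed
    * All NON-CT hashtags have the pound sign removed
    * All links and @-mentions are removed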
3. Save new .csv files for BERT input (one each for general, COVID-specific, and non-COVID CT tweets)
    * Each file has two columns:
        * `sentence1`: the cleaned tweet
        * `label`: the CT label (1: CT, 0: Not CT)

## Hashtag and Link Lists

In [5]:
# Hashtags (lower-case, without the leading '#') that mark a tweet as
# conspiracy-theory (CT) content.
general_conspiracy_hashtags = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'nwo',
    'covid1984',
    'plandemia',
    'agenda21',
    'thegreatreset',
    'agenda2030',
    'newworldorder',
    'wakeupamerica',
#     'wakeup',
    'openamericanow',
    'firefauci',
    'wwg1wga',
    'qanon',
    'coronahoax'
]

# Free-text phrases searched for (case-insensitively) inside the tweet body.
keywords = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'covid hoax',
    'covid1984',
    'plandemia',
    'new world order',
    'wake up america',
    'open america now',
    'fire fauci',
    'wwg1wga',
    'qanon',
    'coronahoax',
    'corona hoax',
]

# Domains whose presence in a tweet's URLs marks the tweet as CT content.
# One domain per line so a missing comma is easy to spot: adjacent string
# literals are silently concatenated by Python, which previously merged
# 'activistpost.com' and 'gnews.org' into one bogus entry.
CT_link_list = [
    'zerohedge.com',
    'infowars.com',
    'principia-scientific.com',
    'tx.voice-truth.com',
    'humansarefree.com',
    'activistpost.com',
    'gnews.org',
    'wakingtimes.com',
    'brighteon.com',
    'thewallwillfall.org',
    'sott.net',
]


# Set versions for O(1) membership tests; hashtags carry the '#' prefix here
# because they are compared against raw tweet tokens.
hashtag_set = {'#' + tag for tag in general_conspiracy_hashtags}
keyword_set = set(keywords)

In [6]:
# COVID-specific subset of the conspiracy hashtags (no leading '#').
covid_conspiracy_hashtags = [
    'plandemic', 'scamdemic', 'covidhoax', 'covid1984',
    'plandemia', 'firefauci', 'coronahoax',
]

# COVID-specific subset of the free-text keywords.
covid_keywords = [
    'plandemic', 'scamdemic',
    'covidhoax', 'covid hoax',
    'covid1984', 'plandemia',
    'fire fauci',
    'coronahoax', 'corona hoax',
]



In [7]:
# General CT hashtags / keywords that are NOT COVID-specific.
# Built as an order-preserving difference: converting a set difference straight
# to a list yields an ordering that changes between kernel sessions (string
# hashes are randomised), which makes anything derived from these lists
# irreproducible. dict.fromkeys keeps the de-duplication a set would give.
_covid_hashtag_lookup = set(covid_conspiracy_hashtags)
NON_covid_conspiracy_hashtags = list(dict.fromkeys(
    tag for tag in general_conspiracy_hashtags if tag not in _covid_hashtag_lookup
))

_covid_keyword_lookup = set(covid_keywords)
NON_covid_keywords = list(dict.fromkeys(
    word for word in keywords if word not in _covid_keyword_lookup
))

## Columns to aggregate

In [8]:
# Names of every dummy column that feeds the general "CT Tweet" label:
# links first, then hashtags, then keywords.
link_tag_keyword_cols = (
    [f'Link - {link}' for link in CT_link_list]
    + [f'Tag - {hashtag}' for hashtag in general_conspiracy_hashtags]
    + [f'Keyword - {keyword}' for keyword in keywords]
)

# Names of the dummy columns that feed the COVID-specific label.
# Link columns are not included here: only hashtags and keywords.
COVID_tag_keyword_cols = (
    [f'Tag - {hashtag}' for hashtag in covid_conspiracy_hashtags]
    + [f'Keyword - {keyword}' for keyword in covid_keywords]
)

    
    
# IF I WANT DUMMIES BROKEN DOWN BY ('POST', 'RETWEET', 'TOTAL')!!!!!
# for link in CT_link_list:
#     for tweet_type in ('POST', 'RETWEET', 'TOTAL'):
#         link_tag_keyword_cols.append(f'Link - {link} - {tweet_type}')      
# for hashtag in general_conspiracy_hashtags:
#     for tweet_type in ('POST', 'RETWEET', 'TOTAL'):
#         link_tag_keyword_cols.append(f'Tag - {hashtag} - {tweet_type}')


In [9]:
def hashtag_in_list(list_of_hashtags_in_tweet, hashtags=None):
    '''
    Case-insensitively test whether a tweet carries any of the given hashtags.

    input  = list_of_hashtags_in_tweet (iterable of str): hashtags attached to one tweet
             hashtags (iterable of str, optional): hashtags to look for;
             defaults to the module-level general_conspiracy_hashtags
    output = bool: True if at least one entry of `hashtags` appears in the tweet's hashtags
    '''
    if hashtags is None:
        hashtags = general_conspiracy_hashtags

    # Upper-case the tweet's tags once instead of once per candidate hashtag,
    # and use a set so each membership test is O(1).
    tags_in_tweet = {tag.upper() for tag in list_of_hashtags_in_tweet}
    return any(hashtag.upper() in tags_in_tweet for hashtag in hashtags)

## Clean Tweet Function

In [14]:
def clean_tweet(tweet):
    '''
    Strip links, @-mentions and CT markers from a tweet so it can be used as BERT input.

    input = tweet (str); any non-str value (e.g. the NaN pandas uses for a
            missing tweet) yields ''
    output = cleaned_tweet (str): the surviving whitespace-separated tokens,
             with '#' removed, joined by single spaces

    A token is dropped when it starts with 'http' or '@', or when its
    lower-cased form is in keyword_set or hashtag_set.
    '''
    # Explicit type check instead of a bare `except`: the only expected failure
    # is a missing (non-str) tweet. A bare except would also hide real errors --
    # e.g. a NameError from an un-run cell would silently blank every tweet.
    if not isinstance(tweet, str):
        return ''

    kept_tokens = []
    for token in tweet.split():
        if token.startswith(('http', '@')):
            continue
        # Labelling matches hashtags/keywords case-insensitively, so removal
        # must too; otherwise '#Plandemic' would survive in the text that is
        # paired with the label it triggered.
        lowered = token.lower()
        if lowered in keyword_set or lowered in hashtag_set:
            continue
        kept_tokens.append(token.replace('#', ''))

    return ' '.join(kept_tokens)

# Final Function

In [None]:
# Starts empty; nothing in the visible part of this notebook appends to or reads it.
list_of_non_english_users = []

In [None]:
# Columns pulled from each per-user tweet CSV.
relevant_columns = ['date', 'hashtags', 'urls', 'tweet', 'language']

# Explicit dtypes handed to pd.read_csv for the plain (non-date, non-list) columns.
datatypes = dict(urls='str', tweet='str', language='float64')

# Regex alternations matching any keyword / any link, with every term escaped
# so characters such as '.' are treated literally.
re_escape_keywords = '|'.join(map(re.escape, keywords))
re_escape_links = '|'.join(map(re.escape, CT_link_list))

In [None]:

# def label_CT_tweets(df):
    
#     return ( df['tweet'].str.contains(re_escape_keywords, case=False) | df['urls'].str.contains(re_escape_links, case=False) | df['hashtags'].apply(hashtag_in_list)).astype(int)


In [7]:
def _parse_hashtag_cell(cell):
    # Turn one raw 'hashtags' CSV cell (a Python list literal) into a list.
    # literal_eval only accepts literals, so code embedded in a scraped CSV is
    # never executed (eval would run it). An empty cell becomes an empty list.
    cell = cell.strip()
    return literal_eval(cell) if cell else []


def add_CT_dummies(READ_PATH_ROOT, DF_SAVE_PATH_ROOT, BERT_GENERAL_CT_INPUT_SAVE_PATH_ROOT, BERT_COVID_CT_INPUT_SAVE_PATH_ROOT, BERT_NON_COVID_CT_INPUT_SAVE_PATH_ROOT, USER):
    '''
    Label one user's tweets as conspiracy-theory (CT) content and write the results to disk.

    Reads   READ_PATH_ROOT + USER + '_TWEETS.csv'
    Writes  DF_SAVE_PATH_ROOT + USER + '_CLEANED_TWEETS.csv'   (all kept tweets + every dummy column)
            BERT_GENERAL_CT_INPUT_SAVE_PATH_ROOT   + USER + 'GEN_CT_BERT_INPUT.csv'
            BERT_COVID_CT_INPUT_SAVE_PATH_ROOT     + USER + 'COV_CT_BERT_INPUT.csv'
            BERT_NON_COVID_CT_INPUT_SAVE_PATH_ROOT + USER + 'NON_COV_CT_BERT_INPUT.csv'
            (each BERT file: columns 'sentence1', 'label'; only rows with
             language == 1 and the corresponding label == 1)

    input  = five path roots (str, expected to end with a path separator) and USER (str)
    output = None

    Tweets with two words or fewer are dropped. One 0/1 column is created per
    CT link, hashtag and keyword; they are aggregated into three labels:
    'CT Tweet (Dummy)', 'COVID-SPECIFIC CT Tweet (Dummy)' and
    'NON-COVID CT Tweet (Dummy)' (general minus COVID-specific).

    NOTE(review): assumes the 'hashtags' column holds a Python list literal of
    hashtag strings without the leading '#' -- confirm against the scraper output.
    '''
    READ_PATH = READ_PATH_ROOT + USER + '_TWEETS.csv'
    DF_SAVE_PATH = DF_SAVE_PATH_ROOT + USER + '_CLEANED_TWEETS.csv'

    # (label column, output file). The BERT file names have no '_' between the
    # user id and the suffix; kept exactly as before so existing outputs and
    # downstream readers keep matching.
    bert_exports = [
        ('CT Tweet (Dummy)', BERT_GENERAL_CT_INPUT_SAVE_PATH_ROOT + USER + 'GEN_CT_BERT_INPUT.csv'),
        ('COVID-SPECIFIC CT Tweet (Dummy)', BERT_COVID_CT_INPUT_SAVE_PATH_ROOT + USER + 'COV_CT_BERT_INPUT.csv'),
        ('NON-COVID CT Tweet (Dummy)', BERT_NON_COVID_CT_INPUT_SAVE_PATH_ROOT + USER + 'NON_COV_CT_BERT_INPUT.csv'),
    ]

    df = pd.read_csv(
        READ_PATH,
        parse_dates=['date'],
        usecols=relevant_columns,
        converters={'hashtags': _parse_hashtag_cell},
        dtype=datatypes,
    )

    # Clean before fillna: clean_tweet maps a missing tweet to ''.
    df.insert(loc=1, column='Cleaned Tweet', value=df['tweet'].apply(clean_tweet))
    df['tweet'] = df['tweet'].fillna('')

    # Only keep tweets with more than 2 words. `.copy()` makes the filtered
    # frame independent so the column assignments below do not target a view.
    df = df[df['tweet'].str.split().map(len) > 2].copy()

    # One dummy per CT link. na=False: a missing 'urls' value counts as
    # "no match" instead of producing a NaN that astype(int) cannot convert.
    for link in CT_link_list:
        df[f'Link - {link}'] = df['urls'].str.contains(re.escape(link), case=False, na=False).astype(int)

    # One dummy per CT hashtag, each testing for THAT hashtag only.
    # (Using the "any CT hashtag" test for every column would make all tag
    # columns identical, so the COVID-specific label would fire on any CT
    # hashtag and the NON-COVID label would wrongly be 0.)
    tags_per_tweet = df['hashtags'].map(lambda tags: {t.upper() for t in tags})
    for tag in general_conspiracy_hashtags:
        wanted = tag.upper()
        df[f'Tag - {tag}'] = tags_per_tweet.map(lambda present, wanted=wanted: wanted in present).astype(int)

    # One dummy per CT keyword (literal, case-insensitive substring match).
    for keyword in keywords:
        df[f'Keyword - {keyword}'] = df['tweet'].str.contains(re.escape(keyword), case=False, na=False).astype(int)

    # Aggregate labels. Insert positions are kept as-is, giving the column
    # order: COVID-SPECIFIC, general CT, NON-COVID, then the rest.
    df.insert(loc=0, column='CT Tweet (Dummy)', value=(df[link_tag_keyword_cols].sum(axis=1) > 0).astype(int))
    df.insert(loc=0, column='COVID-SPECIFIC CT Tweet (Dummy)', value=(df[COVID_tag_keyword_cols].sum(axis=1) > 0).astype(int))
    # The COVID columns are a subset of the general columns, so this difference is always 0 or 1.
    df.insert(loc=2, column='NON-COVID CT Tweet (Dummy)', value=(df['CT Tweet (Dummy)'] - df['COVID-SPECIFIC CT Tweet (Dummy)']).astype(int))

    # somewhat unrelated - rebuild the 'retweet' flag from the tweet text
    df['retweet'] = df['tweet'].str.startswith('RT').astype(int)

    df.to_csv(DF_SAVE_PATH, index=False)

    # One BERT input file per label: rows with language == 1 and label == 1.
    is_language_one = df['language'] == 1
    for label_col, save_path in bert_exports:
        (
            df.loc[is_language_one & (df[label_col] == 1), ['Cleaned Tweet', label_col]]
            .rename(columns={'Cleaned Tweet': 'sentence1', label_col: 'label'})
            .to_csv(save_path, index=False)
        )

In [None]:
# TEST_PATH = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/New Split Tweet Lookups/Split 10/3254815086_TWEETS.csv"

# TEST_PATH_ROOT = "C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/New Split Tweet Lookups/Split 10/"
# USER = "570927672"

# test = pd.read_csv(TEST_PATH)

# %%time 

# fnc_test = add_CT_dummies(READ_PATH_ROOT=TEST_PATH_ROOT, DF_SAVE_PATH_ROOT="0", BERT_INPUT_SAVE_PATH_ROOT="0", USER=USER)

# fnc_test[fnc_test['CT Tweet (Dummy)']==1]

# Path Root to Create .CSVs for BERT Input

sentence1, label

In [None]:
# BERT_CSV_folder_path = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/BERT/BERT Input CSVs/"

# Apply Final Function

## New ARC Version

In [None]:
ARC_TWEETS_READ_PATH_ROOT = r"/arc/project/st-tlemieux-1/data/Twint-Data/Mikhael/New-Splits/Split 8/" #SPECIFY SUB-FOLDER 
# e.g. /arc/project/st-tlemieux-1/data/Twint-Data/Luke/New-Splits/Split <X>" for X=[0:19]

ARC_CLEANED_WRITE_PATH_ROOT = r"/scratch/st-tlemieux-1/lfrymire/cleaned_and_labeled_tweets/" #SPECIFY SUB-FOLDER

ARC_GEN_CT_BERT_INPUT_SAVE_PATH_ROOT = r"/scratch/st-tlemieux-1/lfrymire/BERT-input/GEN-CT-Tweets/"

ARC_COV_CT_BERT_INPUT_SAVE_PATH_ROOT = r"/scratch/st-tlemieux-1/lfrymire/BERT-input/COV-CT-Tweets/"

ARC_NON_COVID_CT_BERT_INPUT_SAVE_PATH_ROOT = r"/scratch/st-tlemieux-1/lfrymire/BERT-input/NON-COV-CT-Tweets/"

# Create the output folders up front; to_csv does not create missing
# directories, so a fresh scratch area would otherwise fail on the first user.
for _output_dir in (
    ARC_CLEANED_WRITE_PATH_ROOT,
    ARC_GEN_CT_BERT_INPUT_SAVE_PATH_ROOT,
    ARC_COV_CT_BERT_INPUT_SAVE_PATH_ROOT,
    ARC_NON_COVID_CT_BERT_INPUT_SAVE_PATH_ROOT,
):
    os.makedirs(_output_dir, exist_ok=True)

# Collect user ids from files named '<digits>_TWEETS.csv'. Anything else in the
# folder (hidden files, notes, already-processed outputs) is skipped instead of
# crashing int(). Sorted so runs process users in a reproducible order.
_TWEET_FILE_SUFFIX = '_TWEETS.csv'
USERS_WITH_TWEETS = sorted(
    int(f[:-len(_TWEET_FILE_SUFFIX)])
    for f in listdir(ARC_TWEETS_READ_PATH_ROOT)
    if f.endswith(_TWEET_FILE_SUFFIX)
    and f[:-len(_TWEET_FILE_SUFFIX)].isdecimal()
    and isfile(join(ARC_TWEETS_READ_PATH_ROOT, f))
)

for user in tqdm(USERS_WITH_TWEETS):
    add_CT_dummies(READ_PATH_ROOT=ARC_TWEETS_READ_PATH_ROOT,
                   DF_SAVE_PATH_ROOT=ARC_CLEANED_WRITE_PATH_ROOT,
                   BERT_GENERAL_CT_INPUT_SAVE_PATH_ROOT=ARC_GEN_CT_BERT_INPUT_SAVE_PATH_ROOT,
                   BERT_COVID_CT_INPUT_SAVE_PATH_ROOT=ARC_COV_CT_BERT_INPUT_SAVE_PATH_ROOT,
                   BERT_NON_COVID_CT_INPUT_SAVE_PATH_ROOT=ARC_NON_COVID_CT_BERT_INPUT_SAVE_PATH_ROOT,
                   USER=str(user))
    

## Pre-Arc Version

In [None]:
# # folder containing folders of all split user lookups
# OG_READ_PATH_ROOT = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/All Conspiracy Tweeters' Tweets/"

# # folder which will contain folders of all split user lookup AGGREGATIONS (weekly CT activity)
# OG_WRITE_PATH_ROOT = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/Labeled OG Tweets/"

# og_splits = [5,6,7,8,9]

# for split in og_splits:
#     local_read_path_root = OG_READ_PATH_ROOT + f'Split {split}/'
#     local_write_path_root = OG_WRITE_PATH_ROOT + f'Split {split}/'
    
#     # make new folders to store processed tweets
#     if not os.path.exists(local_write_path_root):
#         os.makedirs(local_write_path_root)
    
#     # find all users who have been searched and stored in this folder
#     users_in_this_split = [int(f.split('_')[0]) for f in listdir(local_read_path_root) if isfile(join(local_read_path_root, f))]

#     # aggregate each user's tweets
#     for user in users_in_this_split:
#         print(user)
#         add_CT_dummies(READ_PATH_ROOT=local_read_path_root, DF_SAVE_PATH_ROOT=local_write_path_root, BERT_INPUT_SAVE_PATH_ROOT=BERT_CSV_folder_path, USER=str(user))