In [1]:
import pandas as pd
import numpy as np
import time
import os
from os import listdir
from os.path import isfile, join
from ast import literal_eval

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97.5% !important; }</style>"))

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# What this file does:

1. Label each tweet as a CT tweet or not (Naive approach)
    * Contains Hashtags?
    * Contains Links?
2. Clean Tweets for BERT
    * Strings are converted to lists of words
    * ['CLS'] at the beginning, ['SEP'] at the end
    * All CT hashtags are removed
    * All NON-CT hashtags have the pound sign removed
    * All '@' symbols are removed
3. Save a new .json file for BERT input
    * File has two columns:
        * Cleaned Tweet
        * CT Label (1:CT, 0:Not CT)

## Hashtag and Link Lists

In [3]:
general_conspiracy_hashtags = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'nwo',
    'covid1984',
    'plandemia',
    'agenda21',
    'thegreatreset',
    'agenda2030',
    'newworldorder',
    'wakeupamerica',
#     'wakeup',
    'openamericanow',
    'firefauci',
    'wwg1wga',
    'qanon',
    'coronahoax'
]

keywords = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'covid hoax',
    'covid1984',
    'plandemia',
    'new world order',
    'wake up america',
    'open america now',
    'fire fauci',
    'wwg1wga',
    'qanon',
    'coronahoax',
    'corona hoax',
]

CT_link_list = ['zerohedge.com', 'infowars.com', 'principia-scientific.com',
'tx.voice-truth.com', 'humansarefree.com', 'activistpost.com'
'gnews.org', 'wakingtimes.com', 'brighteon.com','thewallwillfall.org','sott.net',]


hashtag_set = set(['#' + tag for tag in general_conspiracy_hashtags])
keyword_set = set(keywords)

## Columns to aggregate

In [4]:
link_tag_keyword_cols = []

for link in CT_link_list:
    link_tag_keyword_cols.append(f'Link - {link}')      
for hashtag in general_conspiracy_hashtags:
    link_tag_keyword_cols.append(f'Tag - {hashtag}')
for keyword in keywords:
    link_tag_keyword_cols.append(f'Keyword - {keyword}')


# IF I WANT DUMMIES BROKEN DOWN BY ('POST', 'RETWEET', 'TOTAL')!!!!!
# for link in CT_link_list:
#     for tweet_type in ('POST', 'RETWEET', 'TOTAL'):
#         link_tag_keyword_cols.append(f'Link - {link} - {tweet_type}')      
# for hashtag in general_conspiracy_hashtags:
#     for tweet_type in ('POST', 'RETWEET', 'TOTAL'):
#         link_tag_keyword_cols.append(f'Tag - {hashtag} - {tweet_type}')


In [5]:
def hashtag_in_list(list_of_hashtags_in_tweet):
    return any(hashtag.upper() in [tag.upper() for tag in list_of_hashtags_in_tweet] for hashtag in general_conspiracy_hashtags)

## Clean Tweet Function

In [6]:
def clean_tweet(tweet):
    '''
    input = tweet (str)
    output = cleaned_tweet(str)
    '''
    
    return [['CLS']] + [x.replace('#','') for x in tweet.split() if not (x.startswith(('http','@')) or x in keyword_set or x in hashtag_set)] + [['SEP']]

# Final Function

In [7]:
list_of_non_english_users = []

In [19]:
relevant_columns = [
    'date',
    'hashtags',
    'urls',
    'tweet',
    'language'
]

datatypes = {'urls':'str', 'tweet':'str', 'language':'float64'}

re_escape_keywords = '|'.join([re.escape(word) for word in keywords])
re_escape_links = '|'.join([re.escape(link) for link in CT_link_list])

In [9]:
import re

In [22]:
%load_ext Cython

In [21]:

def label_CT_tweets(df):
    
    return ( df['tweet'].str.contains(re_escape_keywords, case=False) | df['urls'].str.contains(re_escape_links, case=False) | df['hashtags'].apply(hashtag_in_list)).astype(int)


In [42]:
%%cython
def label_CT_tweets(df):
    
    return ( df['tweet'].str.contains(re_escape_keywords, case=False) | df['urls'].str.contains(re_escape_links, case=False) | df['hashtags'].apply(hashtag_in_list)).astype(int)



Error compiling Cython file:
------------------------------------------------------------
...
def label_CT_tweets(df):
    
    return ( df['tweet'].str.contains(re_escape_keywords, case=False) | df['urls'].str.contains(re_escape_links, case=False) | df['hashtags'].to_numpy().apply(hashtag_in_list)).astype(int)
                                     ^
------------------------------------------------------------

C:\Users\crackcocaine69xxx\.ipython\cython\_cython_magic_9d5286ecb3440524a65faaf65cb9869c.pyx:3:38: undeclared name not builtin: re_escape_keywords

Error compiling Cython file:
------------------------------------------------------------
...
def label_CT_tweets(df):
    
    return ( df['tweet'].str.contains(re_escape_keywords, case=False) | df['urls'].str.contains(re_escape_links, case=False) | df['hashtags'].to_numpy().apply(hashtag_in_list)).astype(int)
                                                                                               ^
--------------------------

In [39]:
def add_CT_dummies(READ_PATH_ROOT, DF_SAVE_PATH_ROOT, BERT_INPUT_SAVE_PATH_ROOT, USER):
    
    READ_PATH = READ_PATH_ROOT + USER + '_TWEETS.csv'
    DF_SAVE_PATH = DF_SAVE_PATH_ROOT + USER + '_CLEANED_TWEETS.csv'
    BERT_INPUT_SAVE_PATH = BERT_INPUT_SAVE_PATH_ROOT + USER + '_BERT_INPUT.csv'
    
    df = pd.read_csv(READ_PATH, parse_dates=['date'], usecols=relevant_columns, converters={'hashtags': eval}, dtype=datatypes)#.set_index('retweet')
    
#     if df['language'].sum() / len(df) < .1: #if less than 10% of tweets are recognized as English    
#         list_of_non_english_users.append(USER)
    
#     else:

    df.query('language == 1', inplace=True)
    
    df = df[df['tweet'].str.split().map(len) > 2]

    df.insert(loc=1, column='Cleaned Tweet', value=df['tweet'].apply(clean_tweet))

#     df = df[ df['Cleaned Tweet'].map(len) > 4 ] #only keep tweets with more than 4 words

#         del df['Unnamed: 0']

#     df['hashtags'] = df['hashtags'].apply(literal_eval)

    # create dummies for each hashtag and link (in each tweet)    
#     for link in CT_link_list:
#         df[f'Link - {link}'] = df['urls'].str.contains(link, case=False).astype(int)

#     for tag in general_conspiracy_hashtags:
#         df[f'Tag - {tag}'] = df['hashtags'].apply(hashtag_in_list).astype(int)

#     for keyword in keywords:
#         df[f'Keyword - {keyword}'] = df['tweet'].str.contains(keyword, case=False).astype(int)

    df.insert(loc=0, column='CT Tweet (Dummy)', value=(label_CT_tweets(df)))


#     somewhat unrelated - fix broken 'retweet' boolean column
#     df['retweet'] = df['tweet'].str.startswith('RT').astype(int)

    return df

    #df.to_csv(DF_SAVE_PATH, index=False)

    #df[df['language']==1][['Cleaned Tweet', 'CT Tweet (Dummy)']].rename(columns={'Cleaned Tweet':'sentence1', 'CT Tweet (Dummy)':'label'}).to_csv(BERT_INPUT_SAVE_PATH, index=False)

#         return df[['Cleaned Tweet', 'CT Tweet (Dummy)']].rename(columns={'Cleaned Tweet':'sentence1', 'CT Tweet (Dummy)':'label'})

In [28]:
TEST_PATH = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/New Split Tweet Lookups/Split 10/3254815086_TWEETS.csv"

TEST_PATH_ROOT = "C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/New Split Tweet Lookups/Split 10/"
USER = "570927672"

In [29]:
test = pd.read_csv(TEST_PATH)

In [40]:
%%time 

add_CT_dummies(READ_PATH_ROOT=TEST_PATH_ROOT, DF_SAVE_PATH_ROOT="0", BERT_INPUT_SAVE_PATH_ROOT="0", USER=USER)

Wall time: 4.34 s


Unnamed: 0,date,Cleaned Tweet,tweet,language,hashtags,urls
0,2021-06-23 09:05:25,"[[CLS], MSDhoni, spends, some, quality, time, with, daughter, Ziva, and, we, are, just, loving, this, beautiful, click, which, captures, the, father-daughter, bond, perfectly!, [SEP]]",#MSDhoni spends some quality time with daughter #Ziva and we are just loving this beautiful click which captures the father-daughter bond perfectly! https://t.co/shfAwdnYck,1.0,"[msdhoni, ziva]",[]
1,2021-06-23 09:03:17,"[[CLS], The, cabinet, approves, an, important, merger,, to, further, generate, more, employment, opportunities, and, promote, Ease, of, Doing, Business., CabinetDecisions, [SEP]]","The cabinet approves an important merger, to further generate more employment opportunities and promote Ease of Doing Business. #CabinetDecisions https://t.co/GvVBk1QuyP",1.0,[cabinetdecisions],[]
2,2021-06-23 09:03:08,"[[CLS], To, ensure, food, security, for, all, amidst, the, COVID19, pandemic,, Cabinet, approves, an, additional, 204, LMT, of, food, grains, for, a, period, of, further, 5, months, will, be, provided, to, NFSA, beneficiaries., CabinetDecisions, [SEP]]","To ensure food security for all amidst the #COVID19 pandemic, Cabinet approves an additional 204 LMT of food grains for a period of further 5 months will be provided to NFSA beneficiaries. #CabinetDecisions https://t.co/3ikj3PXLiR",1.0,"[covid19, cabinetdecisions]",[]
3,2021-06-23 09:02:52,"[[CLS], Cabinet, approves, a, new, agreement, between, India, and, Saint, Vincent, &amp;, The, Grenadines, to, facilitate, the, exchange, of, information, between, the, two, countries, and, to, provide, assistance, to, each, other, in, the, collection, of, tax, claims., CabinetDecisions, [SEP]]",Cabinet approves a new agreement between India and Saint Vincent &amp; The Grenadines to facilitate the exchange of information between the two countries and to provide assistance to each other in the collection of tax claims. #CabinetDecisions https://t.co/EwDk9qpEG7,1.0,[cabinetdecisions],[]
4,2021-06-23 09:02:42,"[[CLS], Reliance, Industries, Limited, will, host, its, 44th, Annual, General, Meeting, tomorrow., Here, are, some, key, announcements, JP, Morgan, expects, to, be, made, RIL, RILAGM, RILAGM2021, Reliance, [SEP]]",Reliance Industries Limited will host its 44th Annual General Meeting tomorrow. Here are some key announcements JP Morgan expects to be made #RIL #RILAGM #RILAGM2021 #Reliance https://t.co/lY727BLYWQ,1.0,"[ril, rilagm, rilagm2021, reliance]",['https://www.cnbctv18.com/business/companies/reliance-agm-here-are-some-announcements-jp-morgan-expects-to-be-made-on-june-24-9756341.htm']
...,...,...,...,...,...,...
181666,2019-01-01 03:53:15,"[[CLS], India, VIX, fell, 4.22, per, cent, to, 15.32, level., VIX, has, to, hold, below, 16, to, get, the, scope, to, surpass, its, immediate, barrier, at, 10,985., [SEP]]","India VIX fell 4.22 per cent to 15.32 level. VIX has to hold below 16 to get the scope to surpass its immediate barrier at 10,985.",1.0,[],[]
181667,2019-01-01 03:53:01,"[[CLS], There, was, fresh, Put, writing, was, at, strike, price, 10,800, while, Call, writing, was, seen, at, 11,100., The, option, band, signified, a, broader, trading, range, between, 10,777, and, 11,100, levels., [SEP]]","There was fresh Put writing was at strike price 10,800 while Call writing was seen at 11,100. The option band signified a broader trading range between 10,777 and 11,100 levels.",1.0,[],[]
181668,2019-01-01 03:52:57,"[[CLS], On, the, options, front,, maximum, Put, open, interest, was, at, 10,500, followed, by, 10,000, while, maximum, Call, OI, was, at, 11,200, followed, by, 11,000., [SEP]]","On the options front, maximum Put open interest was at 10,500 followed by 10,000 while maximum Call OI was at 11,200 followed by 11,000.",1.0,[],[]
181669,2019-01-01 03:52:32,"[[CLS], HIGHLIGHTS, On, the, options, front,, maximum, Put, open, interest, was, at, 10,500., Bank, Nifty, managed, to, hold, 27,000, and, witnessed, strong, momentum, Nifty, futures, closed, positive, at, 10,962, with, a, 0.42, per, cent, gain., [SEP]]","HIGHLIGHTS On the options front, maximum Put open interest was at 10,500. Bank Nifty managed to hold 27,000 and witnessed strong momentum Nifty futures closed positive at 10,962 with a 0.42 per cent gain.",1.0,[],[]


In [76]:
%timeit OLD_add_CT_dummies(READ_PATH_ROOT=TEST_PATH_ROOT, DF_SAVE_PATH_ROOT="0", BERT_INPUT_SAVE_PATH_ROOT="0", USER=USER)

20.5 s ± 153 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
# def OLD_add_CT_dummies(READ_PATH_ROOT, DF_SAVE_PATH_ROOT, BERT_INPUT_SAVE_PATH_ROOT, USER):
    
#     READ_PATH = READ_PATH_ROOT + USER + '_TWEETS.csv'
#     DF_SAVE_PATH = DF_SAVE_PATH_ROOT + USER + '_CLEANED_TWEETS.csv'
#     BERT_INPUT_SAVE_PATH = BERT_INPUT_SAVE_PATH_ROOT + USER + '_BERT_INPUT.csv'
    
#     df = pd.read_csv(READ_PATH, parse_dates=['date'], converters={'hashtags': eval})#.set_index('retweet')
    
#     if df['language'].sum() / len(df) < .1: #if less than 10% of tweets are recognized as English
    
#         list_of_non_english_users.append(USER)
    
#     else:
        
#         df.insert(loc=1, column='Cleaned Tweet', value=df['tweet'].apply(clean_tweet))
        
#         df = df[ df['Cleaned Tweet'].map(len) > 4 ] #only keep tweets with more than 4 words
        
#         del df['Unnamed: 0']

#     #     df['hashtags'] = df['hashtags'].apply(literal_eval)

#         # create dummies for each hashtag and link (in each tweet)    
#         for link in CT_link_list:
#             df[f'Link - {link}'] = df['urls'].str.contains(link, case=False).astype(int)

#         for tag in general_conspiracy_hashtags:
#             df[f'Tag - {tag}'] = df['hashtags'].apply(hashtag_in_list).astype(int)

#         for keyword in keywords:
#             df[f'Keyword - {keyword}'] = df['tweet'].str.contains(keyword, case=False).astype(int)

#         df.insert(loc=0, column='CT Tweet (Dummy)', value=(df[link_tag_keyword_cols].sum(axis=1) > 0).astype(int))

        

#         # somewhat unrelated - fix broken 'retweet' boolean column
#         df['retweet'] = df['tweet'].str.startswith('RT').astype(int)

#         return df

# #         df.to_csv(DF_SAVE_PATH, index=False)

# #         df[df['language']==1][['Cleaned Tweet', 'CT Tweet (Dummy)']].rename(columns={'Cleaned Tweet':'sentence1', 'CT Tweet (Dummy)':'label'}).to_csv(BERT_INPUT_SAVE_PATH, index=False)

# #         return df[['Cleaned Tweet', 'CT Tweet (Dummy)']].rename(columns={'Cleaned Tweet':'sentence1', 'CT Tweet (Dummy)':'label'})

# Path Root to Create .CSVs for BERT Input

sentence1, label

In [140]:
BERT_CSV_folder_path = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/BERT/BERT Input CSVs/"

# Apply Final Function

In [144]:
# folder containing folders of all split user lookups
OG_READ_PATH_ROOT = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/All Conspiracy Tweeters' Tweets/"

# folder which will contain folders of all split user lookup AGGREGATIONS (weekly CT activity)
OG_WRITE_PATH_ROOT = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Looking Up All Conspiracy Hashtag User Tweets/Labeled OG Tweets/"

og_splits = [5,6,7,8,9]

for split in og_splits:
    local_read_path_root = OG_READ_PATH_ROOT + f'Split {split}/'
    local_write_path_root = OG_WRITE_PATH_ROOT + f'Split {split}/'
    
    # make new folders to store processed tweets
    if not os.path.exists(local_write_path_root):
        os.makedirs(local_write_path_root)
    
    # find all users who have been searched and stored in this folder
    users_in_this_split = [int(f.split('_')[0]) for f in listdir(local_read_path_root) if isfile(join(local_read_path_root, f))]

    # aggregate each user's tweets
    for user in users_in_this_split:
        print(user)
        add_CT_dummies(READ_PATH_ROOT=local_read_path_root, DF_SAVE_PATH_ROOT=local_write_path_root, BERT_INPUT_SAVE_PATH_ROOT=BERT_CSV_folder_path, USER=str(user))

1000027408508977152
1000104162292617221
1000269175401541632
1000372897
10003862
1000459433762279425
1000563162016432128
1000736389246472193
1000743191308484608


NameError: name 'list_of_non_english_USERs' is not defined