In [1]:
import os
import time
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd

In [2]:
# Notebook display setup: widen the page container and relax pandas' truncation limits.
# `display` / `HTML` are imported from the public `IPython.display` module; the old
# `IPython.core.display` import path for them is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:97.5% !important; }</style>"))

# Show up to 500 rows / 500 columns and allow 1000-character-wide frame output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Objective of this file:

1. Count total *CT* tweets per day per user
    * Hashtags
    * Links
2. Count total tweets per day per user
3. Use (1) and (2) to get CT tweets per day

In [3]:
# File containing list of users and their locations
# NOTE(review): the path below ends in `2270718426_TWEETS.csv`, which looks like a single
# user's tweet export rather than a user/location list — confirm which file is intended.
# This constant is not referenced by any other code visible in this notebook.
USER_LOC_PATH = r"C:/Users/mikha/OneDrive/Desktop/Dropbox/MIKHAEL NEW/mikhael school/Grad School/Master's/594/Data/Twint Data/2270718426_TWEETS.csv"


## Hashtag and Link Lists

In [18]:
# Hashtags treated as conspiracy-theory (CT) markers.
# The aggregation matches these as case-insensitive substrings of each tweet's `hashtags`
# field, so a shorter entry also fires on any longer tag containing it (for example
# 'wakeup' also matches 'wakeupamerica'). Order is kept stable because it determines
# the order of the generated 'Tag - ...' columns.
general_conspiracy_hashtags = [
    'plandemic', 'scamdemic', 'covidhoax', 'nwo',
    'covid1984', 'plandemia', 'agenda21', 'thegreatreset',
    'agenda2030', 'newworldorder', 'wakeupamerica', 'wakeup',
    'openamericanow', 'firefauci', 'wwg1wga', 'qanon',
    'coronahoax',
]

# CHANGE THESE LATER
CT_link_list = ['zerohedge.com', 'infowars.com', 'principia-scientific.com',
'tx.voice-truth.com', 'humansarefree.com', 'activistpost.com'
'gnews.org', 'wakingtimes.com', 'brighteon.com','thewallwillfall.org','sott.net',]


## Columns to aggregate

In [None]:
link_tag_cols = ['date',
 'Link - zerohedge.com - POST',
 'Link - zerohedge.com - RETWEET',
 'Link - zerohedge.com - TOTAL',
 'Link - infowars.com - POST',
 'Link - infowars.com - RETWEET',
 'Link - infowars.com - TOTAL',
 'Link - principia-scientific.com - POST',
 'Link - principia-scientific.com - RETWEET',
 'Link - principia-scientific.com - TOTAL',
 'Link - tx.voice-truth.com - POST',
 'Link - tx.voice-truth.com - RETWEET',
 'Link - tx.voice-truth.com - TOTAL',
 'Link - humansarefree.com - POST',
 'Link - humansarefree.com - RETWEET',
 'Link - humansarefree.com - TOTAL',
 'Link - activistpost.comgnews.org - POST',
 'Link - activistpost.comgnews.org - RETWEET',
 'Link - activistpost.comgnews.org - TOTAL',
 'Link - wakingtimes.com - POST',
 'Link - wakingtimes.com - RETWEET',
 'Link - wakingtimes.com - TOTAL',
 'Link - brighteon.com - POST',
 'Link - brighteon.com - RETWEET',
 'Link - brighteon.com - TOTAL',
 'Tag - plandemic - POST',
 'Tag - plandemic - RETWEET',
 'Tag - plandemic - TOTAL',
 'Tag - scamdemic - POST',
 'Tag - scamdemic - RETWEET',
 'Tag - scamdemic - TOTAL',
 'Tag - covidhoax - POST',
 'Tag - covidhoax - RETWEET',
 'Tag - covidhoax - TOTAL',
 'Tag - nwo - POST',
 'Tag - nwo - RETWEET',
 'Tag - nwo - TOTAL',
 'Tag - covid1984 - POST',
 'Tag - covid1984 - RETWEET',
 'Tag - covid1984 - TOTAL',
 'Tag - plandemia - POST',
 'Tag - plandemia - RETWEET',
 'Tag - plandemia - TOTAL',
 'Tag - agenda21 - POST',
 'Tag - agenda21 - RETWEET',
 'Tag - agenda21 - TOTAL',
 'Tag - thegreatreset - POST',
 'Tag - thegreatreset - RETWEET',
 'Tag - thegreatreset - TOTAL',
 'Tag - agenda2030 - POST',
 'Tag - agenda2030 - RETWEET',
 'Tag - agenda2030 - TOTAL',
 'Tag - newworldorder - POST',
 'Tag - newworldorder - RETWEET',
 'Tag - newworldorder - TOTAL',
 'Tag - wakeupamerica - POST',
 'Tag - wakeupamerica - RETWEET',
 'Tag - wakeupamerica - TOTAL',
 'Tag - wakeup - POST',
 'Tag - wakeup - RETWEET',
 'Tag - wakeup - TOTAL',
 'Tag - openamericanow - POST',
 'Tag - openamericanow - RETWEET',
 'Tag - openamericanow - TOTAL',
 'Tag - firefauci - POST',
 'Tag - firefauci - RETWEET',
 'Tag - firefauci - TOTAL',
 'Tag - wwg1wga - POST',
 'Tag - wwg1wga - RETWEET',
 'Tag - wwg1wga - TOTAL',
 'Tag - qanon - POST',
 'Tag - qanon - RETWEET',
 'Tag - qanon - TOTAL',
 'Tag - coronahoax - POST',
 'Tag - coronahoax - RETWEET',
 'Tag - coronahoax - TOTAL']

## Column subsets (computed once here, outside the function, so they are not rebuilt for every user)

In [32]:
def _cols_matching(prefix, suffix):
    """Return the entries of `link_tag_cols` that start with `prefix` and end with `suffix`."""
    return [name for name in link_tag_cols if name.startswith(prefix) and name.endswith(suffix)]


# Per-hashtag indicator columns, split by original posts vs. retweets.
hashtagged_post_cols = _cols_matching('Tag', 'POST')
hashtagged_retweet_cols = _cols_matching('Tag', 'RETWEET')

# Per-link indicator columns, split the same way.
link_post_cols = _cols_matching('Link', 'POST')
link_retweet_cols = _cols_matching('Link', 'RETWEET')

# Summary columns placed first in the saved output, followed by every individual
# indicator column (everything in `link_tag_cols` except the 'date' key).
front_cols = [
    'TOTAL CT Activity',
    'TOTAL Tags',
    'TOTAL Post Tags',
    'TOTAL Retweet Tags',
    'TOTAL Links',
    'TOTAL Post Links',
    'TOTAL Retweet Links',
]
back_cols = [name for name in link_tag_cols if name != 'date' and name not in front_cols]

# Final Function

In [54]:
def agg_by_week(READ_PATH, SAVE_PATH, freq='1W'):
    """Aggregate one user's tweets into per-period conspiracy-theory (CT) activity counts.

    Reads a tweets CSV, flags every tweet that mentions one of the domains in
    `CT_link_list` (in its `urls` field) or one of the hashtags in
    `general_conspiracy_hashtags` (in its `hashtags` field), sums those flags per
    time period, adds totalling columns, and writes the result to `SAVE_PATH`.

    Matching is a naive, case-insensitive *literal substring* test, so a short term
    also fires on longer strings that contain it (e.g. 'wakeup' matches
    'wakeupamerica'); a tweet can therefore contribute to several columns.

    Parameters
    ----------
    READ_PATH : str
        CSV with at least the columns 'date', 'urls', 'hashtags' and 'retweet'.
    SAVE_PATH : str
        Destination CSV; the period end date is written as the index column.
    freq : str, default '1W'
        pandas offset alias used to bin the 'date' column. The default keeps the
        weekly aggregation this function is named after.

    Returns
    -------
    None
        The aggregated table is only written to disk.
    """
    tweets = pd.read_csv(READ_PATH, parse_dates=['date'])

    # Explicit comparisons (rather than `~`) so rows with a missing 'retweet' value
    # are counted as neither a post nor a retweet.
    is_retweet = tweets['retweet'] == True
    is_post = tweets['retweet'] == False

    # Build every 0/1 indicator column first and attach them in one concat; inserting
    # dozens of columns one by one fragments the frame and is noticeably slower.
    indicators = {}
    term_sources = (
        ('Link', 'urls', CT_link_list),
        ('Tag', 'hashtags', general_conspiracy_hashtags),
    )
    for prefix, source_col, terms in term_sources:
        for term in terms:
            # regex=False: domains contain '.', which would otherwise match any character.
            # na=False: a missing urls/hashtags value is simply "no match".
            mentions = tweets[source_col].str.contains(term, case=False, regex=False, na=False)
            posts = (mentions & is_post).astype(int)
            retweets = (mentions & is_retweet).astype(int)
            indicators[f'{prefix} - {term} - POST'] = posts
            indicators[f'{prefix} - {term} - RETWEET'] = retweets
            indicators[f'{prefix} - {term} - TOTAL'] = posts + retweets

    flagged = pd.concat(
        [tweets[['date']], pd.DataFrame(indicators, index=tweets.index)],
        axis=1,
    )

    # Sum the indicators within each time period.
    df = flagged[link_tag_cols].groupby(pd.Grouper(key='date', freq=freq)).sum()

    # Totalling columns.
    df['TOTAL Post Tags'] = df[hashtagged_post_cols].sum(axis=1)
    df['TOTAL Retweet Tags'] = df[hashtagged_retweet_cols].sum(axis=1)
    df['TOTAL Tags'] = df['TOTAL Post Tags'] + df['TOTAL Retweet Tags']

    df['TOTAL Post Links'] = df[link_post_cols].sum(axis=1)
    df['TOTAL Retweet Links'] = df[link_retweet_cols].sum(axis=1)
    df['TOTAL Links'] = df['TOTAL Post Links'] + df['TOTAL Retweet Links']

    df['TOTAL CT Activity'] = df['TOTAL Links'] + df['TOTAL Tags']

    # Put the summary columns first; purely cosmetic.
    df = df[front_cols + back_cols]

    df.to_csv(SAVE_PATH, index=True)

# Apply Final Function

In [None]:
# folder containing folders of all split user lookups
OG_READ_PATH_ROOT = r"C:\Users\crackcocaine69xxx\Python Stuff\594\Twint\Looking Up All Conspiracy Hashtag User Tweets\All Conspiracy Tweeters' Tweets"

# folder which will contain folders of all split user lookup AGGREGATIONS (weekly CT activity)
OG_WRITE_PATH_ROOT = r"C:\Users\crackcocaine69xxx\Python Stuff\594\Twint\Looking Up All Conspiracy Hashtag User Tweets\OG Processed Tweets (naive counting CT activity)"

og_splits = [5, 6, 7, 8, 9]

# Input files are named '<user id>_TWEETS.csv'; anything else in the folder is ignored.
TWEETS_SUFFIX = '_TWEETS.csv'

for split in og_splits:
    # Join with a real path separator: the roots above have no trailing slash, so plain
    # string concatenation would produce "...TweetsSplit 5/" instead of ".../Split 5".
    local_read_path_root = join(OG_READ_PATH_ROOT, f'Split {split}')
    local_write_path_root = join(OG_WRITE_PATH_ROOT, f'Split {split}')

    # make new folders to store processed tweets (no-op if they already exist)
    os.makedirs(local_write_path_root, exist_ok=True)

    # find all users who have been searched and stored in this folder; the id is the
    # file name with the suffix stripped, and sorting makes the processing order stable
    users_in_this_split = sorted(
        fname[:-len(TWEETS_SUFFIX)]
        for fname in listdir(local_read_path_root)
        if fname.endswith(TWEETS_SUFFIX) and isfile(join(local_read_path_root, fname))
    )

    # aggregate each user's tweets
    for user in users_in_this_split:
        agg_by_week(
            READ_PATH=join(local_read_path_root, f'{user}{TWEETS_SUFFIX}'),
            SAVE_PATH=join(local_write_path_root, f'{user}_AGGREGATED.csv'),
        )