In [82]:
import pandas as pd
import numpy as np
import os
import time

In [3]:
# File containing list of users and their locations
USER_LOC_PATH = r"C:/Users/lukea/Documents/GitHub/Twitter-Conspiracies/Pulling Twitter Data/All User IDs.csv"

# Top level folder containing tweets by user
TWEETS_PATH = r"G:/Twitter-Conspiracies/user-tweets"

# CHANGE THIS to reflect the splits you are using. Note that range(0,5) gives 0,1,2,3,4
splits = range(0,5)

In [67]:
general_conspiracy_hashtags = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'nwo',
    'covid1984',
    'plandemia',
    'agenda21',
    'thegreatreset',
    'agenda2030',
    'newworldorder',
    'wakeupamerica',
    'wakeup',
    'openamericanow',
    'firefauci',
    'wwg1wga',
    'qanon',
    'coronahoax'
]
nan_value = float("NaN")

In [308]:
def process_hashtags(df):
    '''
    This removes extra characters from the hashtag column and splits multiple hashtags into list items
    ---
    df: csv file pulled from Twint
    '''
    df_new = df.copy(deep=True)
    df_clean = pd.DataFrame(row.replace("'",'').replace("[",'').replace("]",'').replace("]",'').replace(" ",'') for row in df_new['hashtags'])
    df_new['hashtags'] = df_clean
    df_split = split_mults(df_new)
    return df_split

In [307]:
def split_mults(df):
    '''
    Splits multiple hashtags into list items
    ---
    df: csv file pulled from Twint
    '''
    
    df_new = df.copy(deep=True)
    df_split = [row.split(",") for row in df_new['hashtags']]
    df_new['hashtags'] = df_split
    
    return df_new

In [304]:
def drop_non_CT_hashtags(df,hashlist):
    '''
    Generates a new dataframe with only the rows that include tweets from our list of CT hashtags
    ---
    df: csv file pulled from Twint including all of a user's tweets
    hashlist: list of CT hashtags
    '''
    
    df = process_hashtags(df)
    df_new = df.copy(deep=True).drop(df.index) #empty df to be populated (done this way so that indices are correct)
    
    # ~NOTE~ I think this part is quite slow. It would be faster to do this with list comprehension.
    for row in range(0,len(df['hashtags'])):
        if any(item in df['hashtags'][row] for item in hashlist):
            df_new = df_new.append(pd.DataFrame(df.loc[row]).transpose())
            
    return df_new

In [127]:
def get_daily_count(df):
    '''
    Generates a dataframe with the number of tweets on a given day from a user. 
    Can be used with either the full dataframe of tweets, or the dataframe of CT tweets generated by drop_non_CT_hashtags()
    ---
    df: csv file pulled from Twint including some or all of a user's tweets
    '''
    
    df['day'] = pd.DataFrame(row.split(' ')[0] for row in df['date']) # Drop time information
    dates = pd.DataFrame(df['day']).drop_duplicates() # get unique dates
    dates['tweet_count'] = 0
    
    for day in dates['day']:
        dates.loc[dates['day']==day, 'tweet_count'] = len(df.loc[df['day'] == day]) # set tweet count for each unique date according to total number of tweets from that day
    
    return dates

The following chunk measures processing time for a single user

In [353]:
USER_PATH = r'G:/Twitter-Conspiracies/user-tweets/2/14125938_TWEETS.csv'

start = time.perf_counter()

user_tweets = pd.read_csv(USER_PATH)
all_tweet_count = get_daily_count(user_tweets)
CT_tweet_count = get_daily_count(drop_non_CT_hashtags(user_tweets,general_conspiracy_hashtags).reset_index())

stop = time.perf_counter()

stop - start

-7.84382110000297

The above result indicates that we should probably improve efficiency before actually running this. At nearly 8 seconds per user, we'll only get through about 10k per day. This is most likely a CPU bottleneck, not a network thing, so running parallel won't help as much.

The likely culprits are:
1. looping through each row to check the hashtags in drop_non_CT_tweets()
    - Can improve this somewhat by dropping tweets with no hashtags at all before we loop, if nothing else
2. sloppy code that copies the dataframe several times more than is probably necessary
    - Need to go through and figure out where this can be pared down

The next step is to create a dataframe to hold all of the tweet count data, and then to loop through all users and add them to it.

Finally, we need to aggregate users by county.

In [None]:
# skeleton of loop code
'''
for i in splits:
    SUB_PATH = TWEETS_PATH + "/" + str(i)
    
    for filename in os.listdir(SUB_PATH):
        user_tweets = pd.read_csv(SUB_PATH + filename)
        all_tweet_count = get_daily_count(user_tweets)
        CT_tweet_count = get_daily_count(drop_non_CT_hashtags(split_mults(remove_extra_chars(user_tweets)), general_conspiracy_hashtags))
        
'''