In [1]:
# pip install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint

In [2]:
import numpy as np
import pandas as pd
import twint

import time
from datetime import datetime
from datetime import timedelta, date

from os import listdir
from os.path import isfile, join

from collections import Counter
from ast import literal_eval

import re

# Notebook-wide pandas display settings: show up to 1000 columns/rows, up to
# 199 characters per cell, and floats with two decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.options.display.float_format = '{:.2f}'.format

# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

# nest_asyncio patches asyncio so an event loop can be re-entered.
# Presumably needed so twint.run.Search works inside the notebook's
# already-running loop -- TODO confirm.
import nest_asyncio
nest_asyncio.apply()

In [3]:
def clean_tweets(df):
    '''
    Return a trimmed copy of a tweets DataFrame with 'date' parsed to datetime.

    df = pandas DataFrame of tweets (in this notebook: twint.storage.panda.Tweets_df)

    returns a new pandas DataFrame; df itself is left untouched, so the
    function can be called again on the same frame
    '''
    redundant_cols = [
        'created_at', 'timezone', 'cashtags', 'user_id_str', 'photos', 'video',
        'thumbnail', 'translate', 'trans_src', 'trans_dest', 'name', 'search',
    ]

    # drop redundant cols
    # drop() without inplace returns a new frame, so the caller's DataFrame is
    # not modified. errors='ignore' skips columns that are absent (e.g. a
    # frame that has already been cleaned once).
    local_df = df.drop(columns=redundant_cols, errors='ignore')

    # convert date to datetime
    if 'date' in local_df.columns:
        local_df['date'] = pd.to_datetime(local_df['date'])

    return local_df

In [4]:
def find_tweets_in_date_range(search_terms, start_date, end_date, LOCAL_SAVE_PATH_ROOT=None):
    '''
    Search Twitter through twint for tweets containing any of the search terms.

    search_terms = list of strings (or a single string) of search terms that will be in the tweet
    start_date = lower bound of the search, assigned to twint's Since
    end_date = upper bound of the search, assigned to twint's Until
    LOCAL_SAVE_PATH_ROOT = not used by this function; kept (now optional) so existing calls keep working

    returns a pandas dataframe of tweets, passed through clean_tweets
    '''

    # A single term must be wrapped in a list first: str.join iterates over
    # its argument, so a bare string would be joined character by character
    # ("abc" -> "a OR b OR c").
    if isinstance(search_terms, str):
        search_terms = [search_terms]

    search_string = " OR ".join(search_terms)

    c = twint.Config()

    # date range of search
    c.Since = start_date
    c.Until = end_date

    c.Search = search_string

    # c.Limit is deliberately not set, so the number of tweets is not capped here

    # Don't print output
    c.Hide_output = True

    # find shadow-banned accounts too - this apparently slows things down considerably
    c.Profile_full = True

    # keep results in memory (twint.storage.panda.Tweets_df) instead of writing a file
    c.Pandas = True

    twint.run.Search(c)

    return clean_tweets(twint.storage.panda.Tweets_df)

#believescience


In [5]:
# Search terms for the non-CT collection: each term appears once as a plain
# word and once as a hashtag (all plain words first, then all hashtags).
non_CT_words_and_tags = [
    prefix + term
    for prefix in ('', '#')
    for term in ('essentialworkers', 'pfizerproud', 'vaccineswork', 'sciencewillwin')
]

# Alias passed to find_tweets_in_date_range as search_terms.
test_search_terms = non_CT_words_and_tags

In [6]:
# Run the search for test_search_terms with start_date='2020-01-01' and
# end_date='2021-07-07'. LOCAL_SAVE_PATH_ROOT=0 is only a placeholder:
# find_tweets_in_date_range never reads that argument.
test = find_tweets_in_date_range(search_terms=test_search_terms, start_date='2020-01-01', end_date='2021-07-07', LOCAL_SAVE_PATH_ROOT=0)

RefreshTokenException: Could not find the Guest token in HTML

In [None]:
# Save the collected tweets to the working directory; index=False keeps the
# DataFrame index out of the CSV.
test.to_csv(r"#essentialworkers, #pfizerproud, #vaccineswork, #sciencewillwin.csv", index=False)

# Clean Non-CT Tweets

## Remove Users Who Are Also Flagged for CT

In [None]:
def label_overlapped_users(df1, df2):
    '''
    Flag the rows of each DataFrame whose index value also occurs in the other.

    Both DataFrames must have user ID (or something like that) as index

    Adds an 'In CT and Non-CT DFs' column to df1 and to df2 in place:
    1 = the row's index value is present in both DataFrames, 0 = otherwise.

    returns (df1, df2)
    '''
    overlap_col = 'In CT and Non-CT DFs'

    # Index.isin gives one boolean per row and copes with duplicated index values.
    df1_overlaps = df1.index.isin(df2.index)
    df2_overlaps = df2.index.isin(df1.index)

    # Assign whole columns instead of `df[col].loc[rows] = 1`: that chained
    # assignment writes to an intermediate Series, which raises
    # SettingWithCopyWarning and leaves the frame unchanged under copy-on-write.
    df1[overlap_col] = df1_overlaps.astype(int)
    df2[overlap_col] = df2_overlaps.astype(int)

    return df1, df2
    

In [None]:
# NOTE(review): absolute path on one specific machine -- the notebook cannot
# be re-run elsewhere without editing this line.
MASTER_USER_PATH = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Geo Cross Referencing/Master User List - GEOLOCATED.csv"

# Master user list, indexed by its 'ID' column.
ALL_CT_USERS = pd.read_csv(MASTER_USER_PATH).set_index('ID')

In [None]:
# Reload the tweets from the CSV written by the collection step (same file name).
non_CT_tweets_2 = pd.read_csv(r"#essentialworkers, #pfizerproud, #vaccineswork, #sciencewillwin.csv")

# NOTE(review): label_overlapped_users asks for a user-ID index, while 'id'
# here looks like the tweet ID rather than the author's ID -- confirm.
non_CT_tweets_2 = non_CT_tweets_2.set_index('id')

In [None]:
# Add the 'In CT and Non-CT DFs' flag column to both frames (1 = index value
# found in both).
non_CT_tweets_2, ALL_CT_USERS = label_overlapped_users(non_CT_tweets_2, ALL_CT_USERS)

In [None]:
# Preview the first two rows.
non_CT_tweets_2.head(2)

## Clean and Label Tweets

### Keyword, hashtag, and link lists

In [None]:
# Hashtags (without the leading '#') that mark a tweet as CT; compared against
# each tweet's hashtag list by hashtag_in_list.
general_conspiracy_hashtags = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'nwo',
    'covid1984',
    'plandemia',
    'agenda21',
    'thegreatreset',
    'agenda2030',
    'newworldorder',
    'wakeupamerica',
#     'wakeup',
    'openamericanow',
    'firefauci',
    'wwg1wga',
    'qanon',
    'coronahoax'
]

# Words / phrases searched for in the tweet text by label_CT_tweets.
keywords = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'covid hoax',
    'covid1984',
    'plandemia',
    'new world order',
    'wake up america',
    'open america now',
    'fire fauci',
    'wwg1wga',
    'qanon',
    'coronahoax',
    'corona hoax',
]

# Domains searched for in the tweet's urls column by label_CT_tweets.
# One entry per line, each with its own trailing comma: adjacent string
# literals without a comma are silently concatenated into one entry
# (previously 'activistpost.com' and 'gnews.org' were merged that way, so
# neither domain could match).
CT_link_list = [
    'zerohedge.com',
    'infowars.com',
    'principia-scientific.com',
    'tx.voice-truth.com',
    'humansarefree.com',
    'activistpost.com',
    'gnews.org',
    'wakingtimes.com',
    'brighteon.com',
    'thewallwillfall.org',
    'sott.net',
]


# Set forms used by clean_tweet for per-token lookups.
hashtag_set = set(['#' + tag for tag in general_conspiracy_hashtags])
keyword_set = set(keywords)

# Regex alternations ("a|b|c") with every entry escaped, so characters such as
# '.' in the domains are matched literally.
re_escape_keywords = '|'.join([re.escape(word) for word in keywords])
re_escape_links = '|'.join([re.escape(link) for link in CT_link_list])

## Define Functions for Cleaning

In [None]:
def hashtag_in_list(list_of_hashtags_in_tweet):
    '''
    Tell whether a tweet carries any hashtag from general_conspiracy_hashtags.

    list_of_hashtags_in_tweet = list of hashtag strings, or the text form of
        such a list (e.g. "['a', 'b']", which is what a list column becomes
        after being written to CSV and read back)

    The comparison ignores case and a leading '#'. Missing or unparsable
    values count as "no hashtags".

    returns bool
    '''
    tags = list_of_hashtags_in_tweet

    if isinstance(tags, str):
        # Iterating the raw text would compare single characters against the
        # hashtag list and never match, so parse it back into a list first.
        try:
            tags = literal_eval(tags)
        except (ValueError, SyntaxError):
            return False

    try:
        # Built once instead of once per conspiracy hashtag.
        tags_in_tweet = {str(tag).lstrip('#').upper() for tag in tags}
    except TypeError:
        # None / NaN / other non-iterable cell values
        return False

    return any(hashtag.upper() in tags_in_tweet for hashtag in general_conspiracy_hashtags)

In [None]:
def clean_tweet(tweet):
    '''
    Strip links, mentions and label-revealing terms from a tweet.

    input = tweet (str)
    output = cleaned_tweet(str)

    Whitespace-separated tokens are dropped when they start with 'http' or
    '@', or when they equal (ignoring case) an entry of keyword_set or
    hashtag_set. '#' characters are removed from the tokens that remain, and
    the result is re-joined with single spaces.
    '''
    kept_tokens = []

    for token in tweet.split():
        if token.startswith(('http', '@')):
            continue

        # Lower-case before the lookup: both sets hold lower-case entries and
        # label_CT_tweets matches with case=False, so '#Plandemic' or 'QAnon'
        # must be removed here as well.
        lowered = token.lower()
        if lowered in keyword_set or lowered in hashtag_set:
            continue

        kept_tokens.append(token.replace('#', ''))

    # NOTE: multi-word keywords (e.g. 'covid hoax') span several tokens and are
    # therefore not removed by this token-wise filter.
    return ' '.join(kept_tokens)

In [None]:
def label_CT_tweets(df):
    '''
    Label each tweet as CT (1) or not (0).

    df = pandas DataFrame with 'tweet', 'urls' and 'hashtags' columns

    A tweet is labelled 1 when its text matches re_escape_keywords, its urls
    match re_escape_links (both case-insensitive), or hashtag_in_list returns
    True for its hashtags.

    returns a pandas Series of ints aligned with df's index
    '''
    # na=False: a missing tweet/urls value counts as "no match" instead of
    # putting NaN into the boolean OR and the int cast below.
    keyword_hit = df['tweet'].str.contains(re_escape_keywords, case=False, na=False)
    link_hit = df['urls'].str.contains(re_escape_links, case=False, na=False)
    hashtag_hit = df['hashtags'].apply(hashtag_in_list)

    return (keyword_hit | link_hit | hashtag_hit).astype(int)


In [None]:
# Label every tweet: 1 when label_CT_tweets finds a CT keyword, link or
# hashtag, else 0.
non_CT_tweets_2['CT Tweet'] = label_CT_tweets(non_CT_tweets_2)

In [None]:
# Keep only the rows whose 'language' column equals 'en'.
non_CT_tweets_2 = non_CT_tweets_2[non_CT_tweets_2['language']=='en']

In [None]:
# Checkpoint the labelled, English-only tweets. No index=False here, so the
# index is written to the CSV as well.
non_CT_tweets_2.to_csv(r"Semi-Cleaned #essentialworkers, #pfizerproud, #vaccineswork, #sciencewillwin.csv")

In [None]:
# Reload the checkpoint. No index_col is given, so the saved index comes back
# as an ordinary column and the frame gets a fresh default index.
non_CT_tweets_2 = pd.read_csv(r"Semi-Cleaned #essentialworkers, #pfizerproud, #vaccineswork, #sciencewillwin.csv")

# Final Cleaning

In [None]:
# Apply clean_tweet to every tweet text and store the result in 'Cleaned Tweet'.
non_CT_tweets_2['Cleaned Tweet'] = non_CT_tweets_2['tweet'].apply(clean_tweet)

In [None]:
# Export the cleaned text and its label for rows passing the length filter.
# NOTE(review): map(len) on the string column counts characters, whereas the
# ratio computed in the final cell counts whitespace-separated words -- confirm
# which "> 4" threshold is intended.
non_CT_tweets_2[(non_CT_tweets_2['Cleaned Tweet'].map(len) > 4)][['Cleaned Tweet', 'CT Tweet']].to_csv(r'#essentialworkers, #pfizerproud, #vaccineswork, #sciencewillwin - CLEANED FOR BERT.csv')

In [None]:
# Fraction of tweets whose cleaned text has more than 4 whitespace-separated words.
(non_CT_tweets_2['Cleaned Tweet'].str.split().map(len) > 4).sum() / len(non_CT_tweets_2)