In [1]:
import numpy as np
import pandas as pd
import twint

import time
from datetime import datetime
from datetime import timedelta, date

from os import listdir
from os.path import isfile, join

from collections import Counter
from ast import literal_eval

import re

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.options.display.float_format = '{:.2f}'.format

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

import nest_asyncio
nest_asyncio.apply()

# PATHS

In [2]:
SAVE_PATH_ROOT = "C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Non-CT Training Data/Non-CT Tweets by Date Range/"    #folder where non-CT tweets from date ranges will be saved

# Functions

In [49]:
def clean_tweets(df):
    local_df = df
    
    #drop redundant cols
    local_df.drop(['created_at', 'timezone', 'cashtags', 'user_id_str', 'photos', 'video', 'thumbnail', 'translate', 'trans_src', 'trans_dest', 'name', 'search'], axis=1, inplace=True)
    
    #convert date to datetime
    local_df['date'] = pd.to_datetime(local_df['date'])
    
    return local_df

In [4]:
def find_tweets_in_date_range(search_terms, start_date, end_date, LOCAL_SAVE_PATH_ROOT):
    '''    
    search_terms = list of strings (or singleton) of search terms that will be in the tweet
    
    returns a pandas dataframe of tweets
    '''
    
    search_string = " OR ".join(search_terms)
    
    c = twint.Config()
    
#     date range of search
    c.Since = start_date
    c.Until = end_date
    
    c.Search = search_string
    
    # only collect a certain number of tweets
#     c.Limit = num_tweets

    # Don't print output
    c.Hide_output = True
    
    # find shadow-banned accounts too - this apparently slows things down considerably
    c.Profile_full = True

#     c.Output = f"{num_tweets} tweets - {start_date} to {end_date}.csv"
    c.Pandas = True

    twint.run.Search(c)

    clean_tweets(twint.storage.panda.Tweets_df).to_csv(fr"{LOCAL_SAVE_PATH_ROOT}{start_date} to {end_date}.csv")

## Create Date Range

In [5]:
date_ranges = []

sixdays = timedelta(days=6)

today_str = date.today().strftime('%m-%d-%Y')

for date in pd.date_range(start='2021-01-09', end=today_str, freq='1W'):
    date_tuple = ((date.strftime('%Y-%m-%d'), (date+sixdays).strftime('%Y-%m-%d') ))
    
    print(date_tuple)
    
    date_ranges.append(date_tuple)

('2021-01-10', '2021-01-16')
('2021-01-17', '2021-01-23')
('2021-01-24', '2021-01-30')
('2021-01-31', '2021-02-06')
('2021-02-07', '2021-02-13')
('2021-02-14', '2021-02-20')
('2021-02-21', '2021-02-27')
('2021-02-28', '2021-03-06')
('2021-03-07', '2021-03-13')
('2021-03-14', '2021-03-20')
('2021-03-21', '2021-03-27')
('2021-03-28', '2021-04-03')
('2021-04-04', '2021-04-10')
('2021-04-11', '2021-04-17')
('2021-04-18', '2021-04-24')
('2021-04-25', '2021-05-01')
('2021-05-02', '2021-05-08')
('2021-05-09', '2021-05-15')
('2021-05-16', '2021-05-22')
('2021-05-23', '2021-05-29')
('2021-05-30', '2021-06-05')
('2021-06-06', '2021-06-12')
('2021-06-13', '2021-06-19')
('2021-06-20', '2021-06-26')
('2021-06-27', '2021-07-03')
('2021-07-04', '2021-07-10')


# Find associated hashtags

## LIST OF NON-CT KEYWORDS/HASHTAGS

In [20]:
non_CT_words_and_tags = [
    'essentialworkers',
    'pfizerproud',
    'vaccineswork',
    'sciencewillwin'
]

non_CT_words_and_tags = non_CT_words_and_tags + [f'#{x}' for x in non_CT_words_and_tags]

# Apply Searching Function

In [36]:
for date_tup in date_ranges:
    print(date_tup)
    find_tweets_in_date_range(search_terms=non_CT_words_and_tags, start_date=date_tup[0], end_date=date_tup[1], LOCAL_SAVE_PATH_ROOT=SAVE_PATH_ROOT)

('2021-01-10', '2021-01-16')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-01-17', '2021-01-23')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-01-24', '2021-01-30')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-01-31', '2021-02-06')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-02-07', '2021-02-13')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-02-14', '2021-02-20')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-02-21', '2021-02-27')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-02-28', '2021-03-06')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-03-07', '2021-03-13')
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
('2021-03-

# Get All Users

In [42]:
non_CT_tweets_by_date_filenames = [f for f in listdir(SAVE_PATH_ROOT) if isfile(join(SAVE_PATH_ROOT, f))]

In [70]:
def get_hashtags_list(series):
    
    local_series = series.apply(literal_eval)
    
    local_series = local_series.explode()
    
    return local_series[pd.notna(local_series)]

In [71]:
user_ids = Counter({})
hashtags = Counter({})
num_tweets = 0

for filename in non_CT_tweets_by_date_filenames:
    try:
        full_path = fr'{SAVE_PATH_ROOT}{filename}'
        df = pd.read_csv(full_path)
        
        id_count = Counter(df['user_id'].value_counts().to_dict())
        user_ids = user_ids + id_count
        
        
        hashtag_list = get_hashtags_list(df['hashtags'])
        hashtag_count = Counter(hashtag_list.value_counts().to_dict())
        hashtags = hashtag_count + hashtags
        
        num_tweets += len(df)
        
        del df    
        
    except Exception as e:
        print(Exception)
        print(full_path)

In [73]:
ALL_NON_CT_USERS = pd.DataFrame.from_dict(user_ids, orient='index', columns=['Num NON-CT Tweets'])

ALL_NON_CT_USERS = ALL_NON_CT_USERS.reset_index()

ALL_NON_CT_USERS = ALL_NON_CT_USERS.rename(columns={'index': 'ID'})

ALL_NON_CT_USERS['Searched'] = 0

ALL_NON_CT_USERS.sort_values('Num NON-CT Tweets', ascending=False, inplace=True)

## Remove users who we flagged as CT users

### From Geolocated Master CT User List

In [104]:
MASTER_USER_PATH = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Geo Cross Referencing/Master User List - GEOLOCATED.csv"

ALL_CT_USERS = pd.read_csv(MASTER_USER_PATH).set_index('ID')

intersection_of_CT_and_NON_CT_users = np.intersect1d(ALL_CT_USERS.index, ALL_NON_CT_USERS['ID'])

ALL_CT_USERS['In CT & Non-CT Groups'] = 0

ALL_CT_USERS['In CT & Non-CT Groups'].loc[intersection_of_CT_and_NON_CT_users] = 1

ALL_CT_USERS.to_csv(MASTER_USER_PATH)

### From Non-CT User List (the one we just made in this notebook)

In [None]:
ALL_NON_CT_USERS.set_index('ID', inplace=True)

ALL_NON_CT_USERS['In CT & Non-CT Groups'] = 0

ALL_NON_CT_USERS['In CT & Non-CT Groups'].loc[intersection_of_CT_and_NON_CT_users] = 1

In [130]:
ALL_NON_CT_USERS.to_csv(r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Non-CT Training Data/Non-CT Users.csv")

# NEXT: Look Up All Tweets from Non-CT Users (if not in both datasets)

In [6]:
test = pd.read_csv(r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Non-CT Training Data/Non-CT Tweets by Date Range/2021-04-25 to 2021-05-01.csv")

In [50]:
def find_tweets_in_date_range(search_terms, start_date, end_date, LOCAL_SAVE_PATH_ROOT):
    '''    
    search_terms = list of strings (or singleton) of search terms that will be in the tweet
    
    returns a pandas dataframe of tweets
    '''
    
    search_string = " OR ".join(search_terms)
    
    c = twint.Config()
    
#     date range of search
    c.Since = start_date
    c.Until = end_date
    
    c.Search = search_string
    
    # only collect a certain number of tweets
#     c.Limit = num_tweets

    # Don't print output
    c.Hide_output = True
    
    # find shadow-banned accounts too - this apparently slows things down considerably
    c.Profile_full = True

#     c.Output = f"{num_tweets} tweets - {start_date} to {end_date}.csv"
    c.Pandas = True

    twint.run.Search(c)

    return clean_tweets(twint.storage.panda.Tweets_df)

#believescience


In [51]:
test_search_terms = ['#flattenthecurve', '#maskssavelives']

In [52]:
test = find_tweets_in_date_range(search_terms=test_search_terms, start_date='2020-01-01', end_date='2021-07-07', LOCAL_SAVE_PATH_ROOT=0)

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.


In [53]:
test.to_csv(r"#flattenthecurve, #maskssavelives.csv", index=False)

# Clean Non-CT Tweets

## Remove Users Who Are Also Flagged for CT

In [54]:
def label_overlapped_users(df1, df2):
    '''
    Both DataFrames must have user ID (or something like that) as index
    '''
    
    intersection_of_users = np.intersect1d(df1.index, df2.index)
    
    df1['In CT and Non-CT DFs'] = 0
    df2['In CT and Non-CT DFs'] = 0
    
    df1['In CT and Non-CT DFs'].loc[intersection_of_users] = 1
    df2['In CT and Non-CT DFs'].loc[intersection_of_users] = 1

    return df1, df2
    

In [55]:
MASTER_USER_PATH = r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Geo Cross Referencing/Master User List - GEOLOCATED.csv"

ALL_CT_USERS = pd.read_csv(MASTER_USER_PATH).set_index('ID')

In [56]:
flattenthecurve = pd.read_csv(r"#flattenthecurve, #maskssavelives.csv")

flattenthecurve = flattenthecurve.set_index('id')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [57]:
flattenthecurve, ALL_CT_USERS = label_overlapped_users(flattenthecurve, ALL_CT_USERS)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [58]:
flattenthecurve.head(2)

Unnamed: 0_level_0,conversation_id,date,place,tweet,language,hashtags,user_id,username,day,hour,link,urls,retweet,nlikes,nreplies,nretweets,quote_url,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,In CT and Non-CT DFs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1412197378707267585,1412197378707267585,2021-07-05 16:51:07,,#COVID19_Kitakyushucity 新規感染者数　6人 #StayHome #StaySafe #FlattenTheCurve #BeatCovid19 #StopCovid19,ja,"['covid19_kitakyushucity', 'stayhome', 'staysafe', 'flattenthecurve', 'beatcovid19', 'stopcovid19']",267059579,boyackey_badmen,1,16,https://twitter.com/boyackey_badmen/status/1412197378707267585,[],False,0,0,0,https://twitter.com/tokki_kitaq/status/1412017867910828042,,,,,,,[],,0
1412182760186421251,1412182760186421251,2021-07-05 15:53:01,,#UprootTheSystem #nomoreemptypromises #climatestrike #FridaysForFuture #schoolstrike4climate #workstrike4climate #flattenthecurve #FaceTheClimateEmergency #FightClimateInjustice https://t.co/z...,und,"['uprootthesystem', 'nomoreemptypromises', 'climatestrike', 'fridaysforfuture', 'schoolstrike4climate', 'workstrike4climate', 'flattenthecurve', 'facetheclimateemergency', 'fightclimateinjustice']",88551890,EricBourgouin,1,15,https://twitter.com/EricBourgouin/status/1412182760186421251,[],False,5,0,3,,,,,,,,[],,0


## Clean and Label Tweets

### Keyword lists and such

In [8]:
general_conspiracy_hashtags = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'nwo',
    'covid1984',
    'plandemia',
    'agenda21',
    'thegreatreset',
    'agenda2030',
    'newworldorder',
    'wakeupamerica',
#     'wakeup',
    'openamericanow',
    'firefauci',
    'wwg1wga',
    'qanon',
    'coronahoax'
]

keywords = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'covid hoax',
    'covid1984',
    'plandemia',
    'new world order',
    'wake up america',
    'open america now',
    'fire fauci',
    'wwg1wga',
    'qanon',
    'coronahoax',
    'corona hoax',
]

CT_link_list = ['zerohedge.com', 'infowars.com', 'principia-scientific.com',
'tx.voice-truth.com', 'humansarefree.com', 'activistpost.com'
'gnews.org', 'wakingtimes.com', 'brighteon.com','thewallwillfall.org','sott.net',]


hashtag_set = set(['#' + tag for tag in general_conspiracy_hashtags])
keyword_set = set(keywords)

re_escape_keywords = '|'.join([re.escape(word) for word in keywords])
re_escape_links = '|'.join([re.escape(link) for link in CT_link_list])

## Define Functions for Cleaning

In [2]:
def hashtag_in_list(list_of_hashtags_in_tweet):
    return any(hashtag.upper() in [tag.upper() for tag in list_of_hashtags_in_tweet] for hashtag in general_conspiracy_hashtags)

In [3]:
def clean_tweet(tweet):
    '''
    input = tweet (str)
    output = cleaned_tweet(str)
    '''
    
    return [['CLS']] + [x.replace('#','') for x in tweet.split() if not (x.startswith(('http','@')) or x in keyword_set or x in hashtag_set)] + [['SEP']]

In [4]:
def label_CT_tweets(df):
    
    return ( df['tweet'].str.contains(re_escape_keywords, case=False) | df['urls'].str.contains(re_escape_links, case=False) | df['hashtags'].apply(hashtag_in_list)).astype(int)


In [64]:
flattenthecurve['CT Tweet'] = label_CT_tweets(flattenthecurve)

In [70]:
flattenthecurve = flattenthecurve[flattenthecurve['language']=='en']

In [72]:
flattenthecurve.to_csv(r"Semi-Cleaned #flattenthecurve, #maskssavelives.csv")

In [5]:
flattenthecurve = pd.read_csv(r"Semi-Cleaned #flattenthecurve, #maskssavelives.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Final Cleaning

In [9]:
flattenthecurve['Cleaned Tweet'] = flattenthecurve['tweet'].apply(clean_tweet)

In [20]:
flattenthecurve[(flattenthecurve['Cleaned Tweet'].map(len) > 4)][['Cleaned Tweet', 'CT Tweet']].to_csv(r'#flattenthecurve, #maskssavelives - CLEANED FOR BERT.csv')

In [18]:
(flattenthecurve['Cleaned Tweet'].map(len) > 4).sum() / len(flattenthecurve)

0.9924777969562723