In [1]:
# conda install git

In [2]:
# pip install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint

In [3]:
import numpy as np
import pandas as pd
import twint
import time
from datetime import datetime

In [4]:
# Notebook-wide pandas display settings: show up to 1000 columns/rows, allow
# cell text up to 199 characters, and render floats with two decimals.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 199
pd.set_option('display.float_format', '{:.2f}'.format)

# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS that widens the notebook's `.container` element to 95% of the page.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [5]:
# NOTE(review): presumably required because twint runs its own asyncio event
# loop while the notebook kernel already has one running — confirm.
import nest_asyncio
nest_asyncio.apply()

In [6]:
import nltk
from nltk.tokenize import ToktokTokenizer
toktok = ToktokTokenizer()
from nltk.corpus import stopwords
# lemma = nltk.wordnet.WordNetLemmatizer()

In [7]:
# Fetch the NLTK "stopwords" corpus used to build the stopword set below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\crackcocaine69xxx\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Build the set of tokens that `remove_junk_from_str` filters out.
#
# The corpus reader is imported here under a private alias so that binding the
# name `stopwords` to a set does not depend on `stopwords` still being the
# imported corpus reader. The original cell overwrote that import and raised
# AttributeError ('set' object has no attribute 'words') when re-run.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = set(_stopwords_corpus.words("english"))
# "who" is deliberately NOT treated as a stopword.
# NOTE(review): presumably so mentions of the WHO survive cleaning — confirm.
stopwords -= {'who'}
stopwords.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation 

In [9]:
def remove_junk_from_str(string):
    """Tokenize `string` and re-join it without the unwanted tokens.

    A token is discarded when its lower-cased form is in the module-level
    `stopwords` set, when it starts with '#', or when its first three
    characters are 'htt'. The surviving tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in toktok.tokenize(string):
        if token.lower() in stopwords:
            continue
        if token[0] == '#' or token[:3] == 'htt':
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# NumPy-vectorized wrapper around the cleaner above.
vec_remove_junk = np.vectorize(remove_junk_from_str)

In [10]:
def clean_tweets(df):
    """Return a cleaned copy of a tweets DataFrame.

    Steps:
      1. drop the columns listed in `redundant_cols`,
      2. convert the `date` column with `pd.to_datetime`,
      3. run `vec_remove_junk` over every entry of the `tweet` column.

    The input frame is NOT modified. The previous implementation aliased `df`
    and dropped columns with `inplace=True`, which altered the caller's frame
    (e.g. `twint.storage.panda.Tweets_df`) and raised KeyError if the same
    frame was cleaned a second time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in `redundant_cols` plus `date` and `tweet`;
        a missing column raises KeyError.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining columns.
    """
    redundant_cols = ['created_at', 'timezone', 'language', 'cashtags', 'user_id_str', 'urls', 'photos', 'video', 'thumbnail', 'translate', 'trans_src', 'trans_dest', 'name', 'search']

    # drop() and assign() both return new frames, so `df` is left untouched.
    return df.drop(columns=redundant_cols).assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        # Applied element by element, exactly as before.
        tweet=lambda frame: [vec_remove_junk(text) for text in frame['tweet']],
    )


## Collect tweets by hashtag

## Fetch tweets within a date range

In [19]:
def find_tweets_in_date_range(search_terms, start_date, end_date,
                              output_prefix=r"C:/Users/crackcocaine69xxx/Python Stuff/594/Twint/Twint Output"):
    '''
    Search twint for the given terms within a date range, clean the results
    with clean_tweets and write them to a CSV file.

    search_terms = list of strings, or a single string, of search terms;
                   the terms are joined with " OR " into one query
    start_date = passed to twint's Since setting
    end_date = passed to twint's Until setting
    output_prefix = leading part of the output path; the file is written to
                    "<output_prefix><start_date> to <end_date>.csv"
                    (the default reproduces the previously hard-coded location)

    returns None - the cleaned tweets are only written to disk
    '''

    # A bare string would otherwise be joined character by character
    # ("abc" -> "a OR b OR c"), so wrap it in a list first.
    if isinstance(search_terms, str):
        search_terms = [search_terms]

    search_string = " OR ".join(search_terms)

    c = twint.Config()

    # date range of search
    c.Since = start_date
    c.Until = end_date

    c.Search = search_string

    # Don't print output
    c.Hide_output = True

    # find shadow-banned accounts too - this apparently slows things down considerably
    c.Profile_full = True

    # store results in twint.storage.panda.Tweets_df
    c.Pandas = True

    twint.run.Search(c)

    # NOTE(review): Tweets_df is a module-level global in twint; confirm it is
    # reset when a search finds nothing, otherwise a previous search's tweets
    # could be written under this date range.
    clean_tweets(twint.storage.panda.Tweets_df).to_csv(f"{output_prefix}{start_date} to {end_date}.csv")

### Fetch tweets up to a number limit

In [12]:
def find_tweets_by_limit(num_tweets, search_terms):
    '''
    Search twint for the given terms, stopping at a tweet limit, and return
    the results cleaned with clean_tweets.

    num_tweets = passed to twint's Limit setting
    search_terms = list of strings, or a single string, of search terms;
                   the terms are joined with " OR " into one query

    returns a pandas dataframe (the output of clean_tweets)
    '''

    # A bare string would otherwise be joined character by character
    # ("abc" -> "a OR b OR c"), so wrap it in a list first.
    if isinstance(search_terms, str):
        search_terms = [search_terms]

    search_string = " OR ".join(search_terms)

    c = twint.Config()

    c.Search = search_string

    # only collect a certain number of tweets
    c.Limit = num_tweets

    # Don't print output
    c.Hide_output = True

    # find shadow-banned accounts too - this apparently slows things down considerably
    c.Profile_full = True

    # store results in twint.storage.panda.Tweets_df
    c.Pandas = True

    twint.run.Search(c)

    # NOTE(review): Tweets_df is a module-level global in twint; confirm it is
    # reset when a search finds nothing, otherwise a previous search's tweets
    # could be returned here.
    tweets_df = twint.storage.panda.Tweets_df

    return clean_tweets(tweets_df)

## Find most common associated hashtags

In [13]:
def find_associated_hashtags(series):
    '''
    Count how often each hashtag occurs across all observations.

    series = pd series (or any iterable) where each observation is a python
             list of hashtags

    returns a dict mapping hashtag -> number of occurrences, ordered from most
    to least frequent; hashtags with equal counts keep first-seen order
    (previously their order came from set iteration and could change between
    runs)
    '''
    from collections import Counter

    # Single pass over the data; Counter remembers first-seen order.
    tag_counts = Counter(tag for list_of_tags in series for tag in list_of_tags)

    # most_common() is a stable descending sort by count.
    return dict(tag_counts.most_common())


## List of Hashtags to Search

In [14]:
# general_conspiracy_hashtags lists hashtags that I think are used by people who
# don't believe that covid is real (or who believe that it's overblown).
# Note: some conspiracy theorists believe it does exist but was intentionally
# planted by China etc. - those are NOT the group I want here.
#
# TODO: should I try to separate the two groups? They might not even be separate.

general_conspiracy_hashtags = [
    'plandemic',
    'scamdemic',
    'covidhoax',
    'nwo',
    'covid1984',
    'plandemia',
    'agenda21',
    'thegreatreset',
    'agenda2030',
    'newworldorder',
    'wakeupamerica',
    'wakeup',
    'openamericanow',
    'firefauci',
    'wwg1wga',
    'qanon',
    'coronahoax'
]

# Append a '#'-prefixed variant of every term, after the plain terms.
general_conspiracy_hashtags += [f'#{tag}' for tag in general_conspiracy_hashtags]

# covid_denier_hashtags = [
#     'plandemic',
#     'scamdemic',
#     'covidhoax',    
# ]

## Loop to find all tweets

### Create Date Ranges

In [15]:
from datetime import timedelta, date

In [16]:
# One-week span. NOTE(review): not referenced in the cells visible here.
week = timedelta(weeks=1)
# Six-day offset: first day of a weekly window -> last day of that window.
sixdays = timedelta(days=6)

In [17]:
# Build (start, end) date-string pairs, one per week from the start of 2020
# up to today, for use as twint Since/Until values.
date_ranges = []

# ISO format (YYYY-MM-DD) is parsed unambiguously by pandas. The previous
# '%d-%m-%Y' string was read month-first whenever the day was <= 12
# (e.g. 05-06-2021 -> May 6th), which silently cut the range short.
today_str = date.today().strftime('%Y-%m-%d')

# freq='1W' yields Sundays, so the first window starts on 2020-01-05.
# The loop variable is `week_start` (not `date`) so it does not shadow
# `datetime.date`.
for week_start in pd.date_range(start='2020-01-01', end=today_str, freq='1W'):
    week_end = week_start + pd.Timedelta(days=6)
    date_tuple = (week_start.strftime('%Y-%m-%d'), week_end.strftime('%Y-%m-%d'))

    print(date_tuple, '\n')

    date_ranges.append(date_tuple)
    
# date_ranges    

('2020-01-05', '2020-01-11') 

('2020-01-12', '2020-01-18') 

('2020-01-19', '2020-01-25') 

('2020-01-26', '2020-02-01') 

('2020-02-02', '2020-02-08') 

('2020-02-09', '2020-02-15') 

('2020-02-16', '2020-02-22') 

('2020-02-23', '2020-02-29') 

('2020-03-01', '2020-03-07') 

('2020-03-08', '2020-03-14') 

('2020-03-15', '2020-03-21') 

('2020-03-22', '2020-03-28') 

('2020-03-29', '2020-04-04') 

('2020-04-05', '2020-04-11') 

('2020-04-12', '2020-04-18') 

('2020-04-19', '2020-04-25') 

('2020-04-26', '2020-05-02') 

('2020-05-03', '2020-05-09') 

('2020-05-10', '2020-05-16') 

('2020-05-17', '2020-05-23') 

('2020-05-24', '2020-05-30') 

('2020-05-31', '2020-06-06') 

('2020-06-07', '2020-06-13') 

('2020-06-14', '2020-06-20') 

('2020-06-21', '2020-06-27') 

('2020-06-28', '2020-07-04') 

('2020-07-05', '2020-07-11') 

('2020-07-12', '2020-07-18') 

('2020-07-19', '2020-07-25') 

('2020-07-26', '2020-08-01') 

('2020-08-02', '2020-08-08') 

('2020-08-09', '2020-08-15') 

('2020-0

## Final Loop

In [None]:
# Run one search per (start, end) pair; each call writes its own CSV file.
for date_tuple in date_ranges:
    find_tweets_in_date_range(general_conspiracy_hashtags, *date_tuple)

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.


In [None]:
# Run the search for a single week on its own; the result is written to CSV
# by the function rather than returned.
find_tweets_in_date_range(general_conspiracy_hashtags, '2021-05-23', '2021-05-29')

In [155]:
# Search for the single term 'mask' with the tweet limit set to 100,000.
mask_tweets_100k = find_tweets_by_limit(100000, ['mask'])

## Trash

### Make "general_conspiracy_400k"

In [166]:
# Search for all conspiracy terms (OR-ed together) with the tweet limit set to 400,000.
general_conspiracy_400k = find_tweets_by_limit(400000, general_conspiracy_hashtags)

In [172]:
# Save to a relative path, omitting the DataFrame index column.
general_conspiracy_400k.to_csv(r"400k Conspiracy Tweets.csv", index=False)

In [167]:
# Hashtag -> occurrence count over the 'hashtags' column, most frequent first.
find_associated_hashtags(general_conspiracy_400k['hashtags'])

{'wakeup': 85607,
 'agenda2030': 53037,
 'plandemia': 51217,
 'thegreatreset': 47530,
 'nwo': 43913,
 'wakeupamerica': 37713,
 'plandemic': 28670,
 'scamdemic': 27545,
 'covid19': 26953,
 'covid1984': 26680,
 'newworldorder': 17401,
 'agenda21': 13270,
 'greatreset': 11285,
 'covid': 10442,
 'immusic': 9620,
 'buildbackbetter': 9353,
 'covidhoax': 8718,
 'ods': 6956,
 'wef': 6822,
 'coronavirus': 6340,
 'truth': 6018,
 'maga': 4915,
 'lockdown': 4546,
 'freedom': 3967,
 'vaccine': 3857,
 'covid_19': 3760,
 'yonomevacuno': 3479,
 'klausschwab': 3275,
 'sdgs': 3116,
 'coronatimo': 3103,
 'votefraud': 2940,
 'america': 2874,
 'trump': 2866,
 'noalbozal': 2710,
 'usa': 2680,
 'billgates': 2447,
 'news': 2282,
 'crimesagainsthumanity': 2249,
 'corona': 2239,
 'enoughisenough': 2220,
 'covidvaccine': 2216,
 'noalavacuna': 2162,
 'elections': 2081,
 'china': 2071,
 'electionresults2020': 2057,
 'thefirstontv': 2037,
 'plandemie': 1999,
 'genocidio': 1995,
 'biden': 1992,
 'prepper': 1978,
 'l

### Find tweets associated with fauci

In [None]:
# NOTE(review): the list holds ONE string, 'fauci, #firefauci', so it is sent
# as a single search term rather than two OR-ed terms — confirm whether
# ['fauci', '#firefauci'] was intended. The result is also not assigned to
# any variable.
find_tweets_by_limit(100, ['fauci, #firefauci'])

In [190]:
# Search for '#thegreatawakening' with the tweet limit set to 10,000.
thegreatawakening_10k = find_tweets_by_limit(10000, ['#thegreatawakening'])

In [191]:
# Hashtag -> occurrence count over the 'hashtags' column, most frequent first.
find_associated_hashtags(thegreatawakening_10k['hashtags'])

{'thegreatawakening': 10011,
 'wwg1wga': 1679,
 'maga': 1010,
 'thegreatreset': 819,
 'godwins': 677,
 'qanon': 626,
 'deepstate': 559,
 'kag': 533,
 'wakeupamerica': 488,
 'trump2020': 487,
 'tyranny': 463,
 'covid19': 415,
 'fightback': 411,
 'trump': 341,
 'stopthecoup': 340,
 'america': 335,
 'wearethenewsnow': 332,
 'darktolight': 320,
 'savethechildren': 305,
 'q': 302,
 'fakenews': 291,
 'thestorm': 288,
 'china': 275,
 'saveamerica': 272,
 'freedom': 271,
 'obamagate': 268,
 'tryants': 253,
 'greatawakening': 252,
 'vaccines': 248,
 'ccpvirus': 246,
 'uprising': 246,
 'lockdown': 243,
 'pushback': 237,
 'virus': 236,
 'draintheswamp': 235,
 'sheepnomore': 235,
 'wakeup': 227,
 'civildisobedience': 222,
 'treason': 222,
 'truth': 208,
 'kolr10news': 206,
 'ky3': 205,
 'kolr10': 200,
 'stopthesteal': 194,
 'ozarks': 193,
 'ky3news': 193,
 'ozarkmo': 191,
 'ksprnews': 190,
 'shadowgate': 189,
 'kspr': 189,
 'ozarksfox': 186,
 'thegreatawakeningworldwide': 182,
 'wethepeople': 179,

In [187]:
# NOTE(review): `fauci_100k` is not assigned in any cell visible here; on a
# fresh kernel this line would raise NameError unless it is defined elsewhere.
find_associated_hashtags(fauci_100k['hashtags'])

{'firefauci': 90992,
 'covid19': 3590,
 'coronavirus': 2623,
 'firebirx': 2143,
 'openamericanow': 1772,
 'fauci': 1712,
 'obamagate': 1196,
 'plandemic': 1140,
 'faucifraud': 1085,
 'billgates': 1083,
 'firefaucinow': 1078,
 'faucithefraud': 906,
 'trump2020': 850,
 'maga': 817,
 'trump': 800,
 'firetrump': 759,
 'covid': 725,
 'fakenews': 706,
 'covid_19': 678,
 'cdc': 627,
 'who': 608,
 'arrestbillgates': 607,
 'fireredfield': 584,
 'wwg1wga': 557,
 'coronahoax': 552,
 'vaccines': 540,
 'hydroxychloroquine': 538,
 'reopenamerica': 531,
 'draintheswamp': 519,
 'fakepandemic': 495,
 'foxnews': 449,
 'endthelockdown': 441,
 'lockdown': 431,
 'scamdemic': 420,
 'billgatesisevil': 413,
 'crimesagainsthumanity': 399,
 'vaccine': 398,
 'jailgates': 379,
 'arrestfauci': 359,
 'deepstate': 355,
 'donaldtrump': 355,
 'covidiots': 354,
 'masksoff': 354,
 'qanon': 353,
 'anthonyfauci': 351,
 'nomasks': 349,
 'drfauci': 348,
 'covidhoax': 335,
 'covidー19': 322,
 'kag': 316,
 'thegreatawakening':