In [4]:
import datetime
import itertools
import json
import os

from Script_Processing.tweet_collection_wrapper import TweetCollector
from Script_Processing.tweet_io import TweetDataConverter

## Build Queries

In [5]:
# Anchor clause(s): each one is paired with every keyword clause when the
# queries are built.
covid_anchor = [
    "(covid OR coronavirus OR covid19)",
]

# Topic clause(s) -- fill in here, one parenthesised OR-group per entry.
keywords = [
    "(hydroxychloroquine OR plaquenil)",
]

In [6]:
# One query string per (anchor, keyword) combination; the two clauses are
# separated by a single space.
queries = [
    " ".join(clause_pair)
    for clause_pair in itertools.product(covid_anchor, keywords)
]

## Set Start/End dates

In [8]:
def get_search_date_tuple(start_month, start_day, start_year, end_month, end_day, end_year):
    """Return the search window as a ``(start, end)`` tuple of datetimes.

    Both bounds are timezone-aware (UTC, zero offset) and fall at midnight
    of the given month/day/year.
    """
    utc = datetime.timezone.utc
    window_start = datetime.datetime(start_year, start_month, start_day, tzinfo=utc)
    window_end = datetime.datetime(end_year, end_month, end_day, tzinfo=utc)
    return window_start, window_end

In [9]:
# Collection window: 3 Sep 2020 to 7 Sep 2020.
start_date, end_date = get_search_date_tuple(
    start_month=9, start_day=3, start_year=2020,
    end_month=9, end_day=7, end_year=2020,
)
print(start_date, end_date)

2020-09-03 00:00:00+00:00 2020-09-07 00:00:00+00:00


## Perform Collection

In [11]:
# Twitter API credentials are read from environment variables so the secrets
# never live in the notebook file or its saved outputs. Export the four
# variables below before starting the kernel; a missing one raises KeyError
# here instead of failing later inside the collector.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_key = os.environ["TWITTER_ACCESS_KEY"]
access_secret = os.environ["TWITTER_ACCESS_SECRET"]


In [12]:
# Build the collector from the four credential values.
collector = TweetCollector(
    consumer_key,
    consumer_secret,
    access_key,
    access_secret,
)

In [13]:
# Run the collection for every query string between start_date and end_date.
# NOTE(review): the saved output shows the windows walking backwards from
# end_date and then a connection error, so this run appears to have stopped
# early -- confirm what `result` and `log` hold in that case.
result, log = collector.reverse_chronological_collection(queries, start_date, end_date)

Reverse collecting for (covid OR coronavirus OR covid19) (hydroxychloroquine OR plaquenil)
Collected 95 from 2020-09-06-19-21 GMT to 2020-09-07-00-00 GMT for query: (covid OR coronavirus OR covid19) (hydroxychloroquine OR plaquenil)
Collected 80 from 2020-09-06-16-37 GMT to 2020-09-06-19-21 GMT for query: (covid OR coronavirus OR covid19) (hydroxychloroquine OR plaquenil)
Collected 83 from 2020-09-06-14-44 GMT to 2020-09-06-16-37 GMT for query: (covid OR coronavirus OR covid19) (hydroxychloroquine OR plaquenil)
Uh-oh, error:
Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /1.1/search/tweets.json?q=%28covid+OR+coronavirus+OR+covid19%29+%28hydroxychloroquine+OR+plaquenil%29+-filter%3Aretweets+lang%3Aen+until%3A1599403488+since%3A1599091200&tweet_mode=extended&lang=en&count=100 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f9a14d4b8d0>: Failed to establish a new connection: [Errno 8] no

### Save Tweet Jsons:
You may want to set up a local folder to store the logs and the tweet JSONs.

In [3]:
# Output location prefix. Keep the trailing slash: the save cell builds its
# paths by plain string concatenation (folder + name).
folder = "tmp/"

In [13]:
# Persist the collected tweet JSONs and the collection logs under `folder`.
# NOTE(review): the loading cell reads folder + "tweet_jsons.json", so
# save_tweets_jsons presumably appends the ".json" extension -- confirm.
collector.save_tweets_jsons(folder+"tweet_jsons")
collector.save_logs(folder+"logs")

# Get Tweets into a Dataframe

DataFrames keep only the relevant tweet attributes and make the tweets easier to work with.

In [2]:
# Same prefix as the save step; the trailing slash matters because the path
# below is built by string concatenation.
folder = "tmp/"

# Read the saved tweets back in. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open(folder + "tweet_jsons.json") as tweet_json_file:
    tweet_jsons = json.load(tweet_json_file)

In [3]:
# Wrap the loaded tweet JSONs in the project's converter.
converter = TweetDataConverter(tweet_jsons)

In [4]:
# Convert the tweets to a DataFrame. The call also prints the number of
# duplicate tweets (see the cell output); the preview below shows columns
# prefixed with "status." and "author.".
df = converter.to_dataframe()

number of duplicate tweets = 0


In [5]:
df.shape

(9726, 22)

In [6]:
df.head()

Unnamed: 0,status.full_text,status.created_at,status.id_str,status.quoted_status_id_str,status.quoted_status,status.in_reply_to_status_id_str,status.in_reply_to_user_id_str,status.retweet_count,status.favorite_count,status.entities,...,author.id_str,author.name,author.description,author.location,author.verified,author.followers_count,author.friends_count,author.listed_count,author.favourites_count,author.statuses_count
0,@NikolovScience Background: https://t.co/8DHdq...,2020-08-19 08:10:00+00:00,1295996421699756033,,,1295927472450543616,884455440293269505,0,0,"{'hashtags': [], 'symbols': [], 'user_mentions...",...,58244415,Hans-Petter Bekeng,Human Being @ Planet Earth. Thinker and Ponder...,Planet Earth,False,256,2729,0,39,7623
1,@stonecold2050 Trump asked Pillow Guy to bring...,2020-08-19 08:10:40+00:00,1295996591401205760,,,1295925220205682688,780955609394884608,2,4,"{'hashtags': [], 'symbols': [], 'user_mentions...",...,864868527249010689,I'm an Extremely Stable Genius 2!🌊🌊🌊,I care about my country. I want to leave it be...,,False,5087,5550,4,83970,66405
2,@mungojelly Have you not noticed that you are ...,2020-08-19 08:12:00+00:00,1295996926056435712,,,1295940662563606529,5488202,0,0,"{'hashtags': [], 'symbols': [], 'user_mentions...",...,21335196,Ruth Heasman 🌷🦚🐉,"Anti-authoritarian, Business owner, Biohacker ...",england,False,1778,3821,28,34983,15047
3,@guardian The media is very reliable at not le...,2020-08-19 08:12:57+00:00,1295997167526645760,,,1295987751809175555,87818409,1,1,"{'hashtags': [{'text': 'Covid_19', 'indices': ...",...,198522559,Celio,"Randomly opinionated by default, don't take it...",🌍,False,735,634,11,10109,13537
4,@Cameron_Davis86 @jshell9985 @Itsyab0y04 @stil...,2020-08-19 08:13:45+00:00,1295997365581688832,,,1295905899031203841,828346273862410243,0,0,"{'hashtags': [{'text': 'HCQ', 'indices': [119,...",...,2706445465,Jordan,Für #Menschenrechte und #Rechtsstaatlichkeit. ...,,False,114,628,0,7833,14758


# Preprocess Tweets

Two main steps here:

1. Cleaning up the tweet texts - handle things like hashtags, mentions, punctuation, urls, etc.

2. Stemming and lemmatizing tweets, both of which map a word to a common form (usually its root/base form).

### Cleaning up Tweets

In [7]:
import pandas as pd
import pickle
from Script_Processing.preprocessing_custom import full_preprocess, bio_preprocess

In [8]:
df.shape

(9726, 22)

Dump tweet IDs to a text file

In [9]:
# Write every tweet ID on its own line (newline-separated, no trailing newline).
with open("study_tweet_ids.txt", "w") as id_file:
    id_column = df['status.id_str']
    id_file.write("\n".join(id_column))

Continue Preprocessing

In [9]:
# Run the project's full_preprocess helper over each tweet's full text and
# store the result in a new "clean_tweet" column.
df["clean_tweet"] = df["status.full_text"].apply(full_preprocess)

In [10]:
def concat_texts(*texts):
    """Join all given texts into one string, separated by single spaces.

    Truthy entries are converted with ``str``; falsy entries (``None``,
    ``""``, ``0``, ...) are replaced by a one-space placeholder.
    """
    return " ".join(str(text) if text else " " for text in texts)

In [13]:
# -- We have no article data, so we don't do this
#df["clean_article"] = [
#    concat_texts(*x) for x in zip(
#        df["headlines"].apply(full_preprocess), df["descriptions"].apply(full_preprocess)
#    )
#]

In [None]:
#df["clean_tweet_article"] = [concat_texts(*x) for x in zip(df["clean_tweet"], df["clean_article"])]

In [None]:
#df["clean_tweet_all_texts"] = [
#    concat_texts(*x) for x in zip(
#        df["clean_tweet_article"], df["replied_text"].apply(full_preprocess), df["quoted_text"].apply(full_preprocess)
#    )
#]

In [11]:
# Clean each author's profile description with the project's bio_preprocess
# helper and store it in a new "clean_bio" column.
df["clean_bio"] = df["author.description"].apply(bio_preprocess)

In [12]:
# Sanity check: two more columns than after conversion (clean_tweet, clean_bio).
df.shape

(9726, 24)

### Stemming and Lemmatizing

In [13]:
from Script_Processing.stem_and_lemmatize import StemLemmaWrapper

In [14]:
# Single wrapper instance reused by the stem / lemmatize cells below.
sl_wrapper = StemLemmaWrapper()

In [15]:
# Stemmed variant. Despite the "_all_texts" name, the input is clean_tweet
# only: the combined article/reply/quote text columns are commented out above.
df["stemmed_all_texts"] = df["clean_tweet"].apply(sl_wrapper.stem)

In [16]:
# Lemmatized variant, also computed from clean_tweet only.
df["lemmatized_all_texts"] = df["clean_tweet"].apply(sl_wrapper.lemmatize)

In [17]:
# Lemmatize-then-stem variant, computed from clean_tweet only. This is the
# column the topic-analysis section uses as its corpus.
df["lemmatize_stem_all_texts"] = df["clean_tweet"].apply(sl_wrapper.lemmatize_then_stem)

# Export Dataframe

In [1]:
# Pickle file used to hand the preprocessed frame over to the analysis section.
path = "temp_df.pkl"
# Export step, currently disabled: uncomment to write `df` to `path`.
#df.to_pickle(path)
# NOTE(review): pandas is re-imported here, presumably so this cell can run on
# a fresh kernel (its execution count is 1) -- confirm.
import pandas as pd
# Reload the frame from disk. Unpickling can execute arbitrary code, so only
# load pickle files you created yourself.
df = pd.read_pickle(path)

## Starter Code for Topic Analysis

In [3]:
import Script_Processing.stop_words_creation as swc

In [10]:
# Domain-specific stop words; they largely mirror the search terms used to
# collect the tweets.
# NOTE(review): "hydroxychloroquin" looks like a stemmed form -- confirm it
# matches the tokens produced by the stemming step.
domain_stopwords = [
    "covid19",
    "covid",
    "19",
    "coronavirus",
    "virus",
    "hydroxychloroquin",
]

In [5]:
# Build the stop-word collection that is handed to LDA_Modeler below.
# NOTE(review): save=True presumably persists the result to disk -- confirm in
# Script_Processing.stop_words_creation.
domain_stop_word_dict = swc.create_and_get_domain_stop_words(domain_stopwords, save = True)

In [6]:
from Script_Processing.LDA_wrapper import LDA_Modeler

In [7]:
# One document per row, taken from the lemmatize-then-stem text column.
corpus = list(df["lemmatize_stem_all_texts"])

In [8]:
# NOTE(review): "lemma_stem" presumably labels the preprocessing variant of
# the corpus -- confirm against LDA_Modeler.
lda_modeler = LDA_Modeler(corpus, domain_stop_word_dict, "lemma_stem")

In [9]:
# Fit the topic model; the call prints the top terms per topic (see output).
# NOTE(review): 20 is presumably the number of topics -- confirm against
# LDA_Modeler.perform_topic_modeling.
model = lda_modeler.perform_topic_modeling(20)

Topic 1: fda treat unlik_effect hospitalis_patient doctor dr_fauci question open_letter trump dr_anthoni
Topic 2: life_save 1_2 3_4 medic_librarian risch_4 support_dr document_articl coordin_suppress hcq_az total_crimin
Topic 3: think risch dr_harvey convalesc_plasma treatment effect help suppress particip research
Topic 4: march fda_revok emerg_author plasma patient clinic_trial treat peopl drug god
Topic 5: azithromycin_zinc efficaci treat patient trump plaquenil doctor scandal medic_opinion lab
Topic 6: peopl studi trial thousand effect hcq medium_suppress reason treatment work
Topic 7: patient prevent treat trump drug doctor cure peopl help life
Topic 8: trump kill promot drug patient presid_donald time signific fauci prescrib
Topic 9: work doctor peopl drug peopl_die trump like risk_death patient want
Topic 10: studi patient low_dose hospit_patient effect treat drug trump infecti_diseas trial
Topic 11: trump know work drug rate person doctor fda_approv plasma patient
Topic 12: tre