In [0]:
# This notebook uses a modified copy of the GetOldTweets python3 library
# that adds the ability to search within a location.

In [0]:
from GetOldTweets_python import got3
from datetime import datetime, timedelta
import pandas as pd
import urllib
import time, csv, os
from collections import Counter
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
import matplotlib.pyplot as plt
import seaborn as sns
import re
import api_keys
import twitter

In [0]:
# Relative directory that get_tweets joins with each CSV file name it writes
data_dir = "data"

In [0]:
# Per-state lookup tables used to build the location queries.
#
# Every state is listed exactly once as (state, capital city, postal
# abbreviation, area) so the three dictionaries derived below always share
# the same keys in the same order.
# NOTE(review): the area figures are used as square miles by the search-radius
# computation further down; their original source is not recorded here.
_STATE_TABLE = [
    ('Alabama', 'Montgomery', 'AL', 50750),
    ('Alaska', 'Juneau', 'AK', 570641),
    ('Arizona', 'Phoenix', 'AZ', 113642),
    ('Arkansas', 'Little Rock', 'AR', 52075),
    ('California', 'Sacramento', 'CA', 155973),
    ('Colorado', 'Denver', 'CO', 103730),
    ('Connecticut', 'Hartford', 'CT', 4845),
    ('Delaware', 'Dover', 'DE', 1955),
    ('Florida', 'Tallahassee', 'FL', 53997),
    ('Georgia', 'Atlanta', 'GA', 57919),
    ('Hawaii', 'Honolulu', 'HI', 6423),
    ('Idaho', 'Boise', 'ID', 82751),
    ('Illinois', 'Springfield', 'IL', 55593),
    ('Indiana', 'Indianapolis', 'IN', 35870),
    ('Iowa', 'Des Moines', 'IA', 55875),  # was misspelled 'Des Monies'
    ('Kansas', 'Topeka', 'KS', 81823),
    ('Kentucky', 'Frankfort', 'KY', 39732),
    ('Louisiana', 'Baton Rouge', 'LA', 43566),
    ('Maine', 'Augusta', 'ME', 30865),
    ('Maryland', 'Annapolis', 'MD', 9775),
    ('Massachusetts', 'Boston', 'MA', 7838),
    ('Michigan', 'Lansing', 'MI', 56539),
    ('Minnesota', 'St. Paul', 'MN', 79617),
    ('Mississippi', 'Jackson', 'MS', 46914),
    ('Missouri', 'Jefferson City', 'MO', 68898),
    ('Montana', 'Helena', 'MT', 145556),
    ('Nebraska', 'Lincoln', 'NE', 76878),
    ('Nevada', 'Carson City', 'NV', 109806),
    ('New Hampshire', 'Concord', 'NH', 8969),
    ('New Jersey', 'Trenton', 'NJ', 7419),
    ('New Mexico', 'Santa Fe', 'NM', 121365),
    ('New York', 'Albany', 'NY', 47224),
    ('North Carolina', 'Raleigh', 'NC', 48718),
    ('North Dakota', 'Bismarck', 'ND', 68994),
    ('Ohio', 'Columbus', 'OH', 40953),
    ('Oklahoma', 'Oklahoma City', 'OK', 68679),
    ('Oregon', 'Salem', 'OR', 96003),
    ('Pennsylvania', 'Harrisburg', 'PA', 44820),
    ('Rhode Island', 'Providence', 'RI', 1034),
    ('South Carolina', 'Columbia', 'SC', 30111),
    ('South Dakota', 'Pierre', 'SD', 75898),
    ('Tennessee', 'Nashville', 'TN', 41220),
    ('Texas', 'Austin', 'TX', 261914),
    ('Utah', 'Salt Lake City', 'UT', 82168),
    ('Vermont', 'Montpelier', 'VT', 9249),
    ('Virginia', 'Richmond', 'VA', 39598),
    ('Washington', 'Olympia', 'WA', 66582),
    ('West Virginia', 'Charleston', 'WV', 24087),
    ('Wisconsin', 'Madison', 'WI', 54314),
    ('Wyoming', 'Cheyenne', 'WY', 97105),
]

# state name -> capital city (used as the centre of the location search)
states = {name: capital for name, capital, _, _ in _STATE_TABLE}
# state name -> two-letter postal abbreviation
us_state_abbrev = {name: abbrev for name, _, abbrev, _ in _STATE_TABLE}
# state name -> area (treated as square miles when deriving the search radius)
us_state_size = {name: area for name, _, _, area in _STATE_TABLE}

In [0]:
# Column names for the CSV rows written by get_tweets, in write order
_TWEET_COLUMNS = ['id', 'permalink', 'username', 'text', 'date', 'retweets',
                  'favorites', 'mentions', 'hashtags', 'geo', 'location',
                  'search_term']


def _period_boundaries(start_date, end_date, interval):
    """Return 'YYYY-MM-DD' boundaries that tile start_date..end_date."""
    first = pd.Timestamp(start_date)
    last = pd.Timestamp(end_date)

    marks = [first]
    # Skip a generated boundary equal to the start date; it would otherwise
    # produce an empty (start, start) period.
    marks.extend(ts for ts in pd.date_range(start=first, end=last, freq=interval)
                 if ts > first)
    # pd.date_range stops at the last boundary that fits (e.g. a month end or
    # year end); without this the remainder up to end_date is never queried.
    if marks[-1] < last:
        marks.append(last)

    return [ts.strftime('%Y-%m-%d') for ts in marks]


# Function to scrape tweets and write them to a csv file
def get_tweets(start_date, end_date, interval, search_term, location, within,
               nTweets, ftag="", pause=10):
    """Scrape tweets period by period and append them to a CSV in data_dir.

    The range start_date..end_date is split into consecutive periods using
    the pandas frequency ``interval``; one query is issued per period and its
    results are appended to
    ``<data_dir>/<start_date>_<end_date>[_<ftag>]_tweets.csv``.
    A header row is written when that file is new or empty.

    Parameters
    ----------
    start_date, end_date : str
        Bounds of the scrape, e.g. '2015-1-1'. Used verbatim in the file name.
    interval : str
        Frequency string handed to ``pd.date_range`` (e.g. 'M', 'Y').
    search_term : str
        Query passed to ``setQuerySearch``.
    location : str or None
        Passed to ``setNear`` when truthy; otherwise no location filter is set.
    within : str or None
        Passed to ``setWithin``; only used when ``location`` is truthy.
    nTweets : int
        Passed to ``setMaxTweets`` for every period, so it caps each period
        rather than the whole scrape.
    ftag : str, optional
        Extra tag inserted into the file name.
    pause : int or float, optional
        Seconds to sleep before each query (default 10, the previous
        hard-coded delay).
    """
    # Generate csv file name
    if len(ftag) > 0:
        ftag = "_" + ftag

    # Create the output directory on first use instead of failing in open()
    os.makedirs(data_dir, exist_ok=True)
    fname = os.path.join(data_dir,
                         start_date + "_" + end_date + ftag + "_tweets.csv")

    # Consecutive (since, until) pairs covering the requested range
    boundaries = _period_boundaries(start_date, end_date, interval)
    periods = list(zip(boundaries[:-1], boundaries[1:]))

    print('Scraping data from {}'.format(location))

    for i, (period_start, period_end) in enumerate(periods):
        print('Processing {}/{} periods'.format(i+1, len(periods)))

        time.sleep(pause)
        # Create tweetCriteria object
        tweetCriteria = got3.manager.TweetCriteria()
        # if location is specified
        if location:
            tweetCriteria.setNear(location).setWithin(within)

        tweetCriteria.setQuerySearch(search_term).setMaxTweets(nTweets)
        tweetCriteria.setSince(period_start).setUntil(period_end)
        results = got3.manager.TweetManager.getTweets(tweetCriteria)

        # print number of results returned
        print('{} tweets found'.format(len(results)))

        # Must be decided before open(): append mode creates the file, so an
        # existence check made afterwards is always true.
        needs_header = (not os.path.isfile(fname)
                        or os.path.getsize(fname) == 0)

        # newline='' is what the csv module expects; utf-8 keeps non-ASCII
        # tweet text independent of the platform default encoding.
        with open(fname, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(_TWEET_COLUMNS)
            for result in results:
                date = result.date.strftime('%Y-%m-%d')
                values = [result.id, result.permalink, result.username,
                          result.text, date, result.retweets,
                          result.favorites, result.mentions, result.hashtags,
                          result.geo, location, search_term]
                writer.writerow(values)

In [0]:
# Function to write rows to csv
def csv_writer(writer, values, file_exist, header=None):
    """Write one row, preceded by a header row when the file is new.

    Parameters
    ----------
    writer : csv.writer or csv.DictWriter
        Writer bound to the output file.
    values : sequence or dict
        The row handed to ``writer.writerow``.
    file_exist : bool
        False when the target file did not exist before it was opened, which
        is the signal to emit the header first.
    header : sequence of str, optional
        Column names for plain ``csv.writer`` objects. Ignored for writers
        that supply their own ``writeheader`` (``csv.DictWriter``).
    """
    if not file_exist:
        if hasattr(writer, 'writeheader'):
            # DictWriter knows its own field names
            writer.writeheader()
        elif header is not None:
            # Plain csv.writer objects have no writeheader(); calling it
            # raised AttributeError, so the header goes out as a normal row.
            writer.writerow(header)
    writer.writerow(values)

In [0]:
# Candidate queries for the tweet scraper; later cells select entries by index
search_terms = [
    '#flushot',
    'flu shot',
    'flu shots',
    'flu',
    'flu vaccine',
    'flu vaccines',
    'influenza',
    'flu vaccinations',
]

In [0]:
# Due to time concerns, only 3 of the search terms are scraped for now:
# '#flushot', 'flu' and 'flu vaccine'.
for search_term in (search_terms[0], search_terms[3], search_terms[4]):
    print('query for {}'.format(search_term))
    for state, city in states.items():
        # Quoted "City, ST" string centred on the state capital
        location = '"' + city + ', ' + us_state_abbrev[state] + '"'
        # Search radius in miles = square root of the state's area figure
        # NOTE(review): presumably approximates the state as a square so the
        # radius is roughly its side length -- confirm this is the intent.
        radius = str(int(us_state_size[state] ** (0.5))) + 'mi'
        get_tweets('2015-1-1', '2018-11-1', 'M', search_term,
                   location, radius, 500)


In [0]:
# Since most tweets do not have a geotag, scrape tweets without a location
# filter as well. Each entry is (search term, period frequency, max tweets
# per period); all results go to the file tagged 'noLocation'.
# NOTE(review): the last query uses yearly periods and a 10x larger cap than
# the others -- confirm this is intentional.
no_location_queries = [
    ('#gotmyflushot', 'M', 3000),
    ('got my flu shot', 'M', 3000),
    ('flu symptom', 'M', 3000),
    ('flu sick', 'M', 3000),
    ('get your flu shot', 'Y', 30000),
]

for search_term, interval, max_tweets in no_location_queries:
    print('query for {}'.format(search_term))
    get_tweets('2015-1-1', '2018-11-1', interval, search_term,
               location=None, within=None, nTweets=max_tweets,
               ftag='noLocation')