# Retrieve tweets using Tweepy and store in a Pandas dataframe

In [None]:
import tweepy as tw
import pandas as pd
import json

In [None]:
from assertpy import assert_that    # Better assertions

A prerequisite is to set up developer access for your Twitter account and then create a Twitter application, which generates the API credentials used below to access the Twitter API.

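The code below assumes you have stored the credentials in a "keys.json" file in the working directory, structured as `{"consumer": {"api": ..., "secret": ...}, "access": {"token": ..., "secret": ...}}`.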

In [None]:
# Load the credentials stored in keys.json as parsed JSON.
with open("keys.json", "r") as f:
    keys = json.loads(f.read())

In [None]:
# OAuth credentials, taken from the nested "consumer" and "access" sections of the parsed keys.
consumer_key, consumer_secret = keys["consumer"]["api"], keys["consumer"]["secret"]
access_token, access_token_secret = keys["access"]["token"], keys["access"]["secret"]

Create a function that authenticates you with Twitter and returns a tweepy.API object. I've put this in a function because I will refresh the session with Twitter intermittently. This is probably not needed, but a blog post I read suggested that long-running sessions tend to slow down after a while.

In [None]:
def tw_api():
    """
    Authenticate with Twitter and return a fresh tweepy API client.

    Uses the module-level consumer key/secret and access token/secret.
    The client is created with wait_on_rate_limit=True.
    """
    handler = tw.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_token_secret)
    return tw.API(handler, wait_on_rate_limit=True)

Read in search terms from a file

In [None]:
# One search term per line. Strip all surrounding whitespace (not only "\n") so that
# Windows line endings and stray spaces do not end up inside the queries, and skip
# blank lines so that an empty search term is never queried.
with open("keywords.txt", "r") as f:
    keywords = [line.strip() for line in f if line.strip()]

Function to create the query string we will use for a given search term

In [None]:
def search_query(search_string):
    """
    Build the Twitter search query for a search term.

    The query consists of the term itself, followed by its hashtag form
    (the term's words joined together and prefixed with '#'), followed by
    "-filter:retweets" to filter out any retweets. A term made of several
    words is wrapped in double quotes.

    Whitespace is normalised before the query is built: leading and trailing
    whitespace is dropped and any run of whitespace between words becomes a
    single space. A keyword carrying a stray space or line ending therefore
    produces the same query as the clean keyword.

    search_string: the search term (one or more words).
    Returns the query string.

    Examples:
        "wildfires"  -> 'wildfires #wildfires -filter:retweets'
        "body bag"   -> '"body bag" #bodybag -filter:retweets'
        " body  bag" -> '"body bag" #bodybag -filter:retweets'
    """
    words = search_string.split()
    # Rebuild the phrase from the split words so the whitespace is normalised.
    phrase = " ".join(words)
    if len(words) > 1:
        phrase = '"' + phrase + '"'
    hashtag = "#" + "".join(words)
    return " ".join([phrase, hashtag, "-filter:retweets"])


Some quick tests

In [None]:
# Expected query for a single-word term and for a multi-word term.
expected_queries = {
    "wildfires": "wildfires #wildfires -filter:retweets",
    "body bag": '"body bag" #bodybag -filter:retweets',
}
for term, expected in expected_queries.items():
    assert_that(search_query(term)).is_equal_to(expected)

Iterate over the keywords: for each one, create the search query, perform the search, and store the results in a pandas DataFrame.

Every 10 keywords, refresh the session with Twitter (see above).

In [None]:
# One DataFrame per keyword; they are combined in a later cell.
dataframes = []
for index, keyword in enumerate(keywords):
    # Re-authenticate before the 1st, 11th, 21st, ... keyword.
    if index % 10 == 0:
        api = tw_api()
    print("(%d/%d) Querying for %s" % (index + 1, len(keywords), keyword))
    tweets = tw.Cursor(
        api.search,
        q=search_query(keyword),
        count=100,
        tweet_mode='extended',
        lang="en",
    ).items(100)
    rows = []
    for tweet in tweets:
        # Drop every non-ASCII character from the tweet text.
        ascii_text = tweet.full_text.encode("ascii", "ignore").decode()
        rows.append([ascii_text, keyword])
    dataframes.append(pd.DataFrame(columns=["text", "search_term"], data=rows))

Concatenate the keyword specific dataframes into one uber dataframe

In [None]:
# Stack the per-keyword frames into a single frame with a fresh, continuous index.
tweets_df = pd.concat(objs=dataframes, axis=0, ignore_index=True)
tweets_df.head()

In [None]:
# Print a summary of the combined frame (columns, non-null counts, dtypes).
tweets_df.info()

Strip any extraneous whitespace and drop any duplicate tweets

In [None]:
# Trim leading/trailing whitespace with the vectorised string accessor instead of a
# Python-level lambda; unlike the lambda, it also tolerates missing values.
tweets_df["text"] = tweets_df["text"].str.strip()

In [None]:
# Keep only the first occurrence of each distinct tweet text.
tweets_df = tweets_df.drop_duplicates(subset=["text"])

In [None]:
# Print the frame summary again to see what is left after de-duplication.
tweets_df.info()

Write to file

In [None]:
# Save the cleaned tweets to tweets.csv in the working directory.
# With to_csv's default arguments the DataFrame index is written as the first column.
tweets_df.to_csv("tweets.csv")