This script uses Tweepy, a Python wrapper for the Twitter API, to pull tweets mentioning movies that are currently in theaters or about to be released in theaters.

The script looks in the folder storing previously pulled Twitter data to find the most recent tweet retrieved in the previous session, and stops making API calls once it reaches that tweet.

For each movie, this script outputs a CSV file containing the tweet texts and their respective timestamps, combining the tweets retrieved in this session with those from all previous sessions, newest tweets on top.

An additional text file is saved for each movie to track the id of the most recent tweet retrieved mentioning that movie.

In [None]:
# Dependencies
import tweepy
import pandas as pd
import numpy as np
from datetime import datetime
from twitter_hashtag import singletag, doubletag
import time

# Twitter API Keys
from twitter_config import (consumer_key, consumer_secret, access_token, access_token_secret)

# Setup Tweepy API Authentication
# Build the OAuth handler from the app credentials, then attach the user access token.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# JSONParser is used so that API calls hand back parsed JSON; the rest of this
# notebook indexes search results like a dict (e.g. tweets["statuses"]).
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

In [None]:
def pulldata(movie, stopat, poster, text, raw_time_list, id_list, max_failures=5):
    """Page backwards through recent ``#movie`` tweets and collect the new ones.

    Each request asks for up to 100 recent tweets (newest first) whose id is at
    most ``oldest_tweet``; paging continues until a tweet with id <= ``stopat``
    is seen, the search returns no more tweets, or too many requests fail in a row.

    Args:
        movie: Hashtag to search for, without the leading "#".
        stopat: Id of the most recent tweet retrieved in the previous session;
            tweets with an id less than or equal to this are not collected.
        poster: Screen names already archived this session. A user already in
            this list is skipped; newly archived users are appended.
        text: List that receives the text of each archived tweet.
        raw_time_list: List that receives the ``created_at`` value of each
            archived tweet.
        id_list: List that receives the id of each archived tweet.
        max_failures: Number of consecutive failed requests after which the
            function gives up and returns what it has collected so far.

    Returns:
        The tuple ``(poster, text, raw_time_list, id_list)`` - the same list
        objects that were passed in, extended in place.
    """
    oldest_tweet = None  # max_id for the next query; None on the first request
    keeprunning = True
    repetition = 0  # a counter to display how many requests have completed
    failures = 0  # consecutive failed requests
    # Set copy for O(1) "already archived" checks; `poster` itself stays a list
    # because callers pass it on and receive it back.
    seen_posters = set(poster)

    while keeprunning:
        # Only the network call is guarded. `except Exception` (instead of a
        # bare except) lets KeyboardInterrupt through, and the retry is paced
        # and bounded so a persistent error cannot spin forever.
        try:
            tweets = api.search("#" + movie, count=100, result_type="recent", max_id=oldest_tweet)
        except Exception as err:
            failures += 1
            print(f"request for {movie} failed ({err!r}), attempt {failures} of {max_failures}")
            if failures >= max_failures:
                print(f"giving up on {movie} after {max_failures} consecutive failed requests")
                break
            time.sleep(5.1)  # respect the rate limit before retrying
            continue
        failures = 0

        statuses = tweets.get("statuses") or []
        if not statuses:
            # Nothing older is available from the search, so the stop tweet
            # will never be reached; end the loop instead of polling forever.
            keeprunning = False

        for t in statuses:
            if t["id"] <= stopat:
                # Results are newest first, so everything after this tweet was
                # already covered by the previous session.
                keeprunning = False
                break

            oldest_tweet = t["id"] - 1  # next query starts just below this tweet

            user = t["user"]
            screen_name = user["screen_name"]
            # Keep accounts with a moderate number of statuses, English account
            # language, and no tweet archived yet this session.
            # NOTE(review): "lang" may be missing from the user object; .get()
            # then filters the tweet out instead of raising - confirm the field
            # is still delivered by the API.
            if (5 < user["statuses_count"] < 20000
                    and user.get("lang") == "en"
                    and screen_name not in seen_posters):
                seen_posters.add(screen_name)
                poster.append(screen_name)
                raw_time_list.append(t["created_at"])
                id_list.append(t["id"])
                text.append(t["text"])

        repetition += 1
        now = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
        print(f"the time is now {now}, finished request {repetition} of {movie}. I'm still running, please don't close me.")
        time.sleep(5.1)  # pause for 5.1 seconds. twitter api has a rate limit of 180 calls per 15 min
    print("all loops done")
    return poster, text, raw_time_list, id_list

In [None]:
def tweevie(movie):
    """Archive the tweets posted for a single-hashtag movie since the last session.

    Reads ``rawtweets/{movie}.txt`` for the id of the most recent tweet
    retrieved previously, pulls newer tweets with ``pulldata``, puts them on
    top of the rows already in ``rawtweets/{movie}.csv`` (columns ``text`` and
    ``rawtime``), and stores the newest retrieved tweet id back in the text file.
    When no new tweets were retrieved, neither file is modified.

    Args:
        movie: Hashtag of the movie without the leading "#"; also the base
            name of the files in ``rawtweets/``.

    Raises:
        FileNotFoundError: If ``rawtweets/{movie}.txt`` does not exist.
        ValueError: If that file does not contain an integer.
    """
    id_path = f"rawtweets/{movie}.txt"
    csv_path = f"rawtweets/{movie}.csv"

    with open(id_path, 'r') as id_file:
        latest_tweet = int(id_file.read())  # id where this session's retrieval stops

    poster, text, raw_time_list, id_list = pulldata(movie, latest_tweet, [], [], [], [])

    if not id_list:
        # Nothing new this session: the archive and the stored id are already current.
        return

    df = pd.DataFrame({"text": text, "rawtime": raw_time_list})
    df = df[["text", "rawtime"]]  # order the columns

    try:
        old_df = pd.read_csv(csv_path)  # data previously saved for this movie
    except FileNotFoundError:
        # First archive for this movie: keep the freshly pulled tweets rather
        # than losing them to a crash after the whole retrieval has run.
        combined_df = df
    else:
        combined_df = pd.concat([df, old_df]).reset_index(drop=True)  # new tweets on top
    combined_df.to_csv(csv_path, index=False, header=True)

    # The largest id is the most recent tweet; next session stops there.
    with open(id_path, 'w') as id_file:
        id_file.write(str(max(id_list)))



In [None]:
def tweevie2(movie, movie_alt):
    """Archive new tweets for a movie that has two official hashtags.

    Works like the single-hashtag flow, but calls ``pulldata`` once per hashtag
    with the same lists, so a user archived for the first hashtag is not
    archived again for the second. The new rows are sorted by tweet id (newest
    first) before being placed on top of ``rawtweets/{movie}.csv``; the id
    column itself is not stored. Both files are named after the main hashtag.
    When no new tweets were retrieved, neither file is modified.

    Args:
        movie: Main hashtag without the leading "#"; also the base name of the
            files in ``rawtweets/``.
        movie_alt: Alternate hashtag without the leading "#".

    Raises:
        FileNotFoundError: If ``rawtweets/{movie}.txt`` does not exist.
        ValueError: If that file does not contain an integer.
    """
    id_path = f"rawtweets/{movie}.txt"
    csv_path = f"rawtweets/{movie}.csv"

    with open(id_path, 'r') as id_file:
        latest_tweet = int(id_file.read())  # id where this session's retrieval stops

    poster, text, raw_time_list, id_list = [], [], [], []
    for hashtag in (movie, movie_alt):
        # Feed the lists returned for the first hashtag into the second call
        # so both result sets accumulate together.
        poster, text, raw_time_list, id_list = pulldata(
            hashtag, latest_tweet, poster, text, raw_time_list, id_list)

    if not id_list:
        # Nothing new this session: the archive and the stored id are already current.
        return

    df = pd.DataFrame({"text": text, "rawtime": raw_time_list, "id": id_list})
    # Sort by id so tweets from the two hashtags interleave, most recent first.
    df = df.sort_values(["id"], ascending=False).reset_index(drop=True)
    df = df[["text", "rawtime"]]  # the id column is not stored in the csv

    try:
        old_df = pd.read_csv(csv_path)  # data previously saved for this movie
    except FileNotFoundError:
        # First archive for this movie: keep the freshly pulled tweets rather
        # than losing them to a crash after the whole retrieval has run.
        combined_df = df
    else:
        combined_df = pd.concat([df, old_df]).reset_index(drop=True)  # new tweets on top
    combined_df.to_csv(csv_path, index=False, header=True)

    # The largest id is the most recent tweet; next session stops there.
    with open(id_path, 'w') as id_file:
        id_file.write(str(max(id_list)))



In [None]:
# Archive new tweets for every movie listed in singletag (one hashtag per movie).
for movie in singletag:
    tweevie(movie)

In [None]:
# Archive new tweets for every movie listed in doubletag. Each entry is a
# tuple: the main hashtag first, the alternate hashtag second.
for movie_tuple in doubletag:
    movie, movie_alt = movie_tuple[0], movie_tuple[1]
    tweevie2(movie, movie_alt)