This script uses the twitter api wrapper Tweepy to pull tweets mentioning movies currently in theater or about to be released in theater. 

The script looks into the folder storing previously pulled twitter data for what's the most recent tweet retrieved in the previous session, and stop the api call when reaching that tweet.

For each movie, this script outputs a csv file containing the tweet text messages and their respective timestamp, aggregating tweets retrieved this session and all previous sessions, newest tweets on top.

An additional text file is saved for each movie, to track the id of the most tweet retrieved mentioning that movie.

In [1]:
# Dependencies
import tweepy
import pandas as pd
import numpy as np
from datetime import datetime
from hashtags import singletag, doubletag
import time

# Twitter API Keys
from config import (consumer_key, consumer_secret, access_token, access_token_secret)

# Setup Tweepy API Authentication
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

In [2]:
def pulldata(movie, stopat, poster, text, raw_time_list, id_list):
#input argument is the movie hashtag string, and the most recent tweet id previusly retrieved for this movie

    oldest_tweet = None
    keeprunning = True
    repetition = 0 #a counter to display how many iterations have passed

    while(keeprunning == True): #keep running until the end condition is met and changes keeprunning false
        try:
            tweets = api.search("#"+movie, count=100, result_type="recent", max_id=oldest_tweet) #query 100 tweets each iteration, newest first

            for t in tweets["statuses"]: #goes through each of the 100 tweets returned
                if t["id"] <= stopat: #once the tweet id is less than (older) than the most recent tweet id from the last sesson, set keep running to false to stop the while loop
                    keeprunning = False
                else:
                    if(t["user"]["statuses_count"] > 5 
                        and t["user"]["statuses_count"] < 20000 #filter twitter account with too few or too many messages
                        and t["user"]["lang"] == "en" #only keep english accounts
                        and t["user"]["screen_name"] not in poster): #and only keep tweets from users not archived in this session

                        poster.append(t["user"]["screen_name"]) #append the user name to the list to prevent the same person's message from being archived this session
                        raw_time_list.append(t["created_at"]) 
                        id_list.append(t["id"]) 
                        message = t["text"] 
                        text.append(message)

                oldest_tweet = t["id"] - 1 #update oldest_tweet for the next query call
        except:
            continue

        repetition += 1
        now = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
        print(f"the time is now {now}, finished request {repetition} of {movie}. I'm still running, please don't close me.")
        time.sleep(5.1) #pause for 5.1 seconds. twitter api has a rate limit of 180 calls per 15 min
    print("all loops done")
    return poster, text, raw_time_list, id_list

In [3]:
def tweevie(movie): 
    with open(f"rawtweets/{movie}.txt", 'r') as text:
        latest_tweet = int(text.read()) #open the text file recording the id of the most recent tweet retrieved previously.
    
    poster=[]
    text=[]
    raw_time_list=[]
    id_list=[]
    
    (poster, text, raw_time_list, id_list) = pulldata(movie, latest_tweet, poster, text, raw_time_list, id_list) 
    #pass most recent tweet as an argument into the function, current sessions retrieval stops when reaching this id

    df = pd.DataFrame({"text": text, "rawtime": raw_time_list}) #make the data frame
    df = df[["text","rawtime"]] #order the columns

    old_df = pd.DataFrame(pd.read_csv(f"rawtweets/{movie}.csv")) #load data previously saved for this movie
    combined_df = pd.concat([df, old_df]).reset_index(drop = True) #concatenate the new df on top of the old one
    combined_df.to_csv(f"rawtweets/{movie}.csv", index=False, header=True)
    
    try: #include a try-except clause in case there were no new tweets (list is empty)
        latest_tweet = np.max(id_list) #max id from the tweets will be the most recent tweet id. Next retrieval session will stop at this id
    except:
        pass
    with open(f"rawtweets/{movie}.txt", 'w') as text:
        text.write(str(latest_tweet)) #write the latest tweet id to the text file for next session's use



In [4]:
def tweevie2(movie, movie_alt): #for handling movies with two official hashtags
    with open(f"rawtweets/{movie}.txt", 'r') as text:
        latest_tweet = int(text.read())
    
    poster=[]
    text=[]
    raw_time_list=[]
    id_list=[]
    
    (poster, text, raw_time_list, id_list) = pulldata(movie, latest_tweet, poster, text, raw_time_list, id_list)
    (poster, text, raw_time_list, id_list) = pulldata(movie_alt, latest_tweet, poster, text, raw_time_list, id_list) 
    #include returned lists from the first hashtag to append
    
    df = pd.DataFrame({"text": text, "rawtime": raw_time_list, "id":id_list}) #make the data frame, include id this time
    df = df[["text","rawtime","id"]] 
    df = df.sort_values(["id"], ascending=False).reset_index(drop=True) #sort by id so the data retrieved for the two hashtags are ordered by time, most recent first
    del df["id"] #drop the id column because it's not stored in the csv

    old_df = pd.DataFrame(pd.read_csv(f"rawtweets/{movie}.csv")) 
    combined_df = pd.concat([df, old_df]).reset_index(drop = True) 
    combined_df.to_csv(f"rawtweets/{movie}.csv", index=False, header=True)
    
    try:
        latest_tweet = np.max(id_list)
    except:
        pass
    with open(f"rawtweets/{movie}.txt", 'w') as text:
        text.write(str(latest_tweet)) 



In [5]:
for movie in singletag:
    tweevie(movie)

the time is now 08-04-2018 16:37:02, finished request 1 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:08, finished request 2 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:14, finished request 3 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:19, finished request 4 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:25, finished request 5 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:31, finished request 6 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:36, finished request 7 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:42, finished request 8 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:37:48, finished request 9 of InfinityWar. I'm still runnin

the time is now 08-04-2018 16:43:52, finished request 73 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:43:58, finished request 74 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:44:04, finished request 75 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:44:09, finished request 76 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:44:15, finished request 77 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:44:21, finished request 78 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:44:26, finished request 79 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:44:32, finished request 80 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:44:38, finished request 81 of InfinityWar. I'm sti

the time is now 08-04-2018 16:50:38, finished request 144 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:50:43, finished request 145 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:50:49, finished request 146 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:50:55, finished request 147 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:51:01, finished request 148 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:51:06, finished request 149 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:51:12, finished request 150 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:51:17, finished request 151 of InfinityWar. I'm still running, please don't close me.
the time is now 08-04-2018 16:51:23, finished request 152 of InfinityWar

the time is now 08-04-2018 16:57:19, finished request 6 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:57:24, finished request 7 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:57:30, finished request 8 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:57:36, finished request 9 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:57:41, finished request 10 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:57:47, finished request 11 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:57:52, finished request 12 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:57:58, finished request 13 of SorryToBotherYou. I'm still running, please don't close me.
the time is now 08-04-2018 16:58:04, finishe

the time is now 08-04-2018 17:03:59, finished request 42 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:05, finished request 43 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:10, finished request 44 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:16, finished request 45 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:21, finished request 46 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:27, finished request 47 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:33, finished request 48 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:38, finished request 49 of MammaMia2. I'm still running, please don't close me.
the time is now 08-04-2018 17:04:44, finished request 50 of MammaMia2. I'm still running, please

the time is now 08-04-2018 17:10:37, finished request 32 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:10:43, finished request 33 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:10:49, finished request 34 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:10:54, finished request 35 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:11:00, finished request 36 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:11:06, finished request 37 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:11:11, finished request 38 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:11:17, finished request 39 of MissionImpossible. I'm still running, please don't close me.
the time is now 08-04-2018 17:11

the time is now 08-04-2018 17:17:02, finished request 27 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:07, finished request 28 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:13, finished request 29 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:18, finished request 30 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:24, finished request 31 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:30, finished request 32 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:35, finished request 33 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:41, finished request 34 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:17:47, fin

the time is now 08-04-2018 17:23:32, finished request 96 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:23:38, finished request 97 of ChristopherRobin. I'm still running, please don't close me.
the time is now 08-04-2018 17:23:43, finished request 98 of ChristopherRobin. I'm still running, please don't close me.
all loops done
the time is now 08-04-2018 17:23:49, finished request 1 of DarkestMinds. I'm still running, please don't close me.
the time is now 08-04-2018 17:23:55, finished request 2 of DarkestMinds. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:01, finished request 3 of DarkestMinds. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:06, finished request 4 of DarkestMinds. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:12, finished request 5 of DarkestMinds. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:17, finished requ

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




the time is now 08-04-2018 17:24:29, finished request 1 of nevergoinback. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:35, finished request 2 of nevergoinback. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:40, finished request 3 of nevergoinback. I'm still running, please don't close me.
all loops done
the time is now 08-04-2018 17:24:46, finished request 1 of TheSpyWhoDumpedMe. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:52, finished request 2 of TheSpyWhoDumpedMe. I'm still running, please don't close me.
the time is now 08-04-2018 17:24:57, finished request 3 of TheSpyWhoDumpedMe. I'm still running, please don't close me.
the time is now 08-04-2018 17:25:03, finished request 4 of TheSpyWhoDumpedMe. I'm still running, please don't close me.
the time is now 08-04-2018 17:25:09, finished request 5 of TheSpyWhoDumpedMe. I'm still running, please don't close me.
the time is now 08-04-2018 17:25:14, 

In [6]:
for movie_tuple in doubletag:
    movie = movie_tuple[0] #each item in the list is a tuple, first item in the tuple is the main hashtag, second item is the alt hashtag
    movie_alt = movie_tuple[1]
    tweevie2(movie, movie_alt) 

the time is now 08-04-2018 17:30:25, finished request 1 of AntMan. I'm still running, please don't close me.
the time is now 08-04-2018 17:30:31, finished request 2 of AntMan. I'm still running, please don't close me.
the time is now 08-04-2018 17:30:37, finished request 3 of AntMan. I'm still running, please don't close me.
the time is now 08-04-2018 17:30:42, finished request 4 of AntMan. I'm still running, please don't close me.
all loops done
the time is now 08-04-2018 17:30:48, finished request 1 of AntManAndTheWasp. I'm still running, please don't close me.
the time is now 08-04-2018 17:30:54, finished request 2 of AntManAndTheWasp. I'm still running, please don't close me.
the time is now 08-04-2018 17:30:59, finished request 3 of AntManAndTheWasp. I'm still running, please don't close me.
the time is now 08-04-2018 17:31:05, finished request 4 of AntManAndTheWasp. I'm still running, please don't close me.
the time is now 08-04-2018 17:31:10, finished request 5 of AntManAndTheWa