In [43]:
# ### Mount Drive

# from google.colab import drive
# drive.mount('/content/drive/')

In [44]:
### Imports

import os, requests, json
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from sklearn import preprocessing
import sys

In [45]:
### Constants

WEIGHTS = {"retweet" : 3, "like" : 0.5 ,"quote" : 4 ,"reply" : 1}

In [46]:
bearer_token = "AAAAAAAAAAAAAAAAAAAAAGXdTwEAAAAAr2%2BC9Wi6GHR8%2Bk%2FiDL2AIHaC1I8%3D86fg9nIXAt2MFp0QP1sXU0q1VFKHAGaD1da68qG4X0glvGSh4D"

def response_health(r):
  if r.status_code != 200:
    raise Exception(
    "Request returned an error: {} {}".format(
      r.status_code, r.text
    )
  )
    
def bearer_oauth(r):
  r.headers["Authorization"] = f"Bearer {bearer_token}"
  return r

def send_request(url, params=None, print_status=False):
  '''Send Request (url) with optional params. Returns json'''
  # https://2.python-requests.org/en/master/api/#requests.request
  if params == None:
    response = requests.request("GET", url, auth=bearer_oauth)
  else:
    response = requests.request("GET", url, auth=bearer_oauth, params=params)
  if print_status: print("Request response status: ", response.status_code)
  response_health(response)
  return response.json()

In [157]:
def get_user_data(name):
  # data dictionary scroll down to response fields https://developer.twitter.com/en/docs/twitter-api/users/lookup/api-reference/get-users-by-username-username

  userFields = {"user.fields":"created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld".replace(" ", "")}
  user_json = send_request(f"https://api.twitter.com/2/users/by/username/{name}",params=userFields)
  user_json = user_json["data"]

  outputDict = {}
  outputDict['following_count'] = user_json['public_metrics']['following_count']
  outputDict['tweet_count'] = user_json['public_metrics']['tweet_count']
  outputDict['followers_count'] = user_json['public_metrics']['followers_count']
  outputDict['listed_count'] = user_json['public_metrics']['listed_count']
  outputDict['handle'] = user_json['username']
  outputDict['name'] = user_json['name']
  outputDict['id'] = user_json['id']
  outputDict['verified'] = user_json['verified']
  outputDict['protected'] = user_json['protected']
  outputDict['created_at'] = user_json['created_at']
  outputDict['description'] = user_json['description']

  try:
    test = user_json['pinned_tweet_id']
    outputDict['hasPinnedTweet'] = True
  except:
    outputDict['hasPinnedTweet'] = False
    pass
  try:
    outputDict['urlsInDescription'] = len(user_json['entities']['description']['urls'])
  except:
    outputDict['urlsInDescription'] = 0
    pass
  try:
    outputDict['hashtagsInDescription'] = len(user_json['entities']['description']['hashtags'])
  except:
    outputDict['hashtagsInDescription'] = 0
    pass

  try:
    outputDict['userWebsitesAdded'] = len(user_json['entities']['url']['urls'])
  except:
    outputDict['userWebsitesAdded'] = 0
    pass

  try:
    outputDict['cashtagsInDescription'] = len(user_json['entities']['description']['cashtags'])
  except:
    outputDict['cashtagsInDescription'] = 0
    pass

  try:
    outputDict['mentionsInDescription'] = len(user_json['entities']['description']['mentions'])
  except:
    outputDict['mentionsInDescription'] = 0
    pass

  
  return outputDict

In [170]:
def get_tweets_user(id, numTweets = 10, tweetsPerPage = 10, replies = False, weights=WEIGHTS, paginationToken = None):
    import math
    
    # (there are 10 results returned per page by default)
    if numTweets < tweetsPerPage:
        print("numTweets must be greater than or equal to the number of tweets per page.")
        return

    # to see data dictionary, click url and scroll down to response fields https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets
    expansions = {"expansions":"author_id, attachments.poll_ids, attachments.media_keys, entities.mentions.username, geo.place_id, in_reply_to_user_id, referenced_tweets.id,referenced_tweets.id.author_id".replace(" ", "")}
    tweetFields = {"tweet.fields":"attachments, author_id, context_annotations, conversation_id, created_at, entities, geo, id, in_reply_to_user_id, lang, public_metrics, possibly_sensitive, referenced_tweets, reply_settings, source, text, withheld".replace(" ", "")}
    userFields = {"user.fields":"public_metrics,username"}
    replyFields = {"exclude": "replies"}

    outputDict = {'id':[],'handle':[],'followers':[], 'text':[], 'lang':[],'possibly_sensitive':[],'retweet_count':[],'reply_count':[],'like_count':[],'quote_count':[]
        ,'reply_settings':[],'source':[],'created_at':[],'is_retweet':[],'contains_quote':[],'is_reply':[],'num_referenced_tweets':[],
        'url_image':[],'num_hashtags':[],'text_first_hashtag':[],'num_mentions':[],'num_cashtags':[],'num_polls':[]}

    if not replies:
        outputDict['interaction_score'] = []

    token = ""
    
    if paginationToken != None: 
        token = paginationToken
        
    # for each page of results
    for i in range(math.ceil(numTweets/tweetsPerPage)): 
        if i != 0 or paginationToken != None:
            if replies:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields,**userFields , **{"pagination_token":token}}
            else:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields,**userFields , **{"pagination_token":token}, **replyFields}
        else:
            if replies:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields, **userFields}
            else:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields, **userFields, **replyFields}


        tweet_json = (send_request(f"https://api.twitter.com/2/users/{id}/tweets", params=params))

        # handle cases where there is no next page of tweets to grab
        try:
            token = tweet_json['meta']['next_token']
        except: 
            token = None # in this case, there is no page to grab next.
            pass
        if token == None:
            break

        username = tweet_json['includes']['users'][0]['username']
        followers = tweet_json['includes']['users'][0]['public_metrics']['followers_count']
        
        tweetData = tweet_json['data']
        for tweet in tweetData:
            outputDict['handle'].append(username)
            outputDict['followers'].append(followers)
            outputDict['id'].append(tweet['id'])
            outputDict['text'].append(tweet['text'])
            outputDict['lang'].append(tweet['lang'])
            outputDict['possibly_sensitive'].append(tweet['possibly_sensitive'])

            outputDict['retweet_count'].append(tweet['public_metrics']['retweet_count'])
            outputDict['reply_count'].append(tweet['public_metrics']['reply_count'])
            outputDict['like_count'].append(tweet['public_metrics']['like_count'])
            outputDict['quote_count'].append(tweet['public_metrics']['quote_count'])
            outputDict['reply_settings'].append(tweet['reply_settings'])
            outputDict['source'].append(tweet['source'])
            outputDict['created_at'].append(tweet['created_at'])

            if not replies:
                outputDict['interaction_score'].append((weights['retweet'] *outputDict['retweet_count'][-1] + weights['like'] *outputDict['like_count'][-1]+
                                                        weights['reply'] *outputDict['reply_count'][-1] + weights['quote'] *outputDict['quote_count'][-1])/followers)


            # referenced tweets: quotes, replies, and retweets
            try:
                refdTweets = tweet['referenced_tweets']
                outputDict['num_referenced_tweets'].append(len(refdTweets))

                rtweet = False
                reply = False
                quote = False
                # there may be multiple referenced tweets, apparently. So it could be a reply and contain a quote, I guess
                for t in refdTweets:
                    typ = t['type']
                    if typ == 'retweeted':
                        outputDict['is_retweet'].append(True)
                        rtweet = True
                    elif typ == 'quoted':
                        outputDict['contains_quote'].append(True)
                        quote = True
                    elif typ == 'replied_to':
                        outputDict['is_reply'].append(True)
                        reply = True
                        
                if not rtweet:
                        outputDict['is_retweet'].append(False)
                if not reply:
                        outputDict['is_reply'].append(False)
                if not quote:
                        outputDict['contains_quote'].append(False)

            except:
                outputDict['num_referenced_tweets'].append(0)
                outputDict['is_retweet'].append(False)
                outputDict['contains_quote'].append(False)
                outputDict['is_reply'].append(False)
                pass


            # image
            try:
                outputDict['url_image'].append(tweet['entities']['urls'][0]['images'][0]['url'])  #just grabbing the first image in the first url
            except:
                outputDict['url_image'].append("")  
                pass

            # hashtags
            try:
                outputDict['num_hashtags'].append(len(tweet['entities']['hashtags']))
                # grabbing just the first hashtag
                outputDict['text_first_hashtag'].append(tweet['entities']['hashtags'][0]['tag'])
            except:
                outputDict['num_hashtags'].append(0) 
                outputDict['text_first_hashtag'].append("")   
                pass

            # mentions
            try:
                outputDict['num_mentions'].append(len(tweet['entities']['mentions']))
            except:
                outputDict['num_mentions'].append(0) 
                pass

            # cashtags
            try:
                outputDict['num_cashtags'].append(len(tweet['entities']['cashtags']))
            except:
                outputDict['num_cashtags'].append(0) 
                pass
            
            # polls
            try:
                outputDict['num_polls'].append(len(tweet['attachments']['poll_ids']))
            except:
                outputDict['num_polls'].append(0) 
                pass
        
    df = pd.DataFrame(outputDict)


    return df, token


In [173]:
def get_api_data(usernames, replies = False, numTweets = 10, tweetsPerPage = 10, print_status = False):
    import time
    import traceback
    usersDict =  {'following_count':[], 'tweet_count':[], 'followers_count':[], 'listed_count':[], 'handle':[], 'name':[], 'id':[], 
    'verified': [], 'protected': [],'created_at': [],'description': [], 'hasPinnedTweet':[], 'urlsInDescription':[], 'hashtagsInDescription':[],
    'userWebsitesAdded':[], 'cashtagsInDescription':[],'mentionsInDescription':[]}

    idDict = {}

    for username in usernames:
        # get data related to user account
        userData = get_user_data(username)
        
        # create id - username mapping dictionary for use in next loop
        idDict[userData['handle']] = userData['id']
        
        # build the user data dataframe
        for k, v in userData.items():
            usersDict[k].append(v)

    df_user = pd.DataFrame(usersDict)
    
    if not replies:
        #if we are excluding replies, we want to keep pulling until we have at least 500 images
        numImages = 0
        tokenDict = {}
        stopList = []
        nextToken = ""
        firstIteration = True
        loopCount = 1
        df_tweets = None
        while (numImages < 500):
            for username in usernames:
                # get data from tweets of the user

                # handle cases where there are no more tweets to pull 
                if set(stopList) == set(usernames):
                    df_tweets.set_index('id', inplace=True)
                    df_user.set_index('handle', inplace=True)
                    df_tweets.sort_values('handle', inplace=True)
                    return df_tweets, df_user
                elif stopList.count(username) > 0:
                    continue

                # if we left off on a page, then jump to the next page of tweet results
                try:
                    nextToken = tokenDict[username]
                    try:
                        df_sub,token = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies, paginationToken=nextToken)
                    except Exception:
                        traceback.print_exc()
                except: 
                    df_sub,token = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies)

                tokenDict[username] = token

                # handle cases where there are no more tweets to pull 
                if token == None:
                    stopList.append(username)

                if not firstIteration:
                    df_tweets = pd.concat([df_tweets, df_sub])
                else:
                    df_tweets = df_sub
                    firstIteration = False
                if print_status: 
                    num_tweets = len(df_sub)
                    print(f"Retrieved {num_tweets} tweets for: {username}")
            numImages = len(df_tweets[df_tweets.url_image != ""])
            if print_status: print(f"{loopCount} iterations through while loop. {numImages} images retrieved.")
            if loopCount % 3 == 0: # if it is a multiple of 3 
                # we have a limit of 900 requests per 15 minute window.
                print("waiting 5 minutes...")
                time.sleep(60*5) # wait 5 minutes
            loopCount += 1
    else:
        df_tweets = None
        firstIteration = True
        for username in usernames:
            # get data from tweets of the user
            df_sub = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies)[0]

            if not firstIteration:
                df_tweets = pd.concat([df_tweets, df_sub])
            else:
                df_tweets = df_sub
                firstIteration = False
            if print_status: 
                num_tweets = len(df_sub)
                print(f"Retrieved {num_tweets} tweets for: {username}")
        

    df_tweets.set_index('id', inplace=True)
    df_user.set_index('handle', inplace=True)
    df_tweets.sort_values('handle', inplace=True)

    return df_tweets, df_user



In [150]:
df = pd.read_csv('fast-food-chains - Sheet1.csv')

df.drop(df.columns[0], axis=1, inplace=True)
df.loc[df.shape[0]] = df.columns
df.columns = ["handle"]
twitter_handles = df[df.columns[0]].tolist()
print(len(twitter_handles))
twitter_handles

96


['Schlotzskys',
 'AuntieAnnes',
 'SaltgrassSteak',
 'redlobster',
 'Hardees',
 'RuthsChris',
 'LongHornSteaks',
 'FiveGuys',
 'Applebees',
 'DelTaco',
 'PFChangs',
 'BonefishGrill',
 'Charleys',
 'EinsteinBros',
 'qdoba',
 'torchystacos',
 'raisingcanes',
 'Cheesecake',
 'WaffleHouse',
 'CheckersRallys',
 'SmoothieKing',
 'CaptainDs',
 'WhiteCastle',
 'papamurphys',
 'caferio',
 'ChuysRestaurant',
 'ChurchsChicken',
 'IHOP',
 'BaskinRobbins',
 'TSmoothieCafe',
 'MODPizza',
 'calpizzakitchen',
 'FreddysUSA',
 'SteaknShake',
 'tacojohns',
 'Dickeys',
 'krispykreme',
 'ElPolloLoco',
 'ColdStone',
 'Whataburger',
 'Hooters',
 'Maggianos',
 'hungryhowies',
 'noodlescompany',
 'Carrabbas',
 'shakeshack',
 'jimmyjohns',
 'portilloshotdog',
 'culvers',
 'redrobinburgers',
 'goldencorral',
 'eatatjacks',
 'McAlistersDeli',
 'rubytuesday',
 'BobEvansFarms',
 'CHWinery',
 'jasonsdeli',
 'longjohnsilvers',
 'TGIFridays',
 'Potbelly',
 'wingstop',
 'JambaJuice',
 'cheddarskitchen',
 'CapitalGrille'

In [174]:
reply_tweetDf, userDf = get_api_data(twitter_handles, replies = True, numTweets = 80, tweetsPerPage = 80, print_status=True)
noreply_tweetDf, userDf = get_api_data(twitter_handles, replies = False, numTweets = 300, tweetsPerPage = 100,print_status=True)

# reply_tweetDf, userDf = get_api_data(['McDonalds'], replies = True, numTweets = 80, tweetsPerPage = 80, print_status=True)
# noreply_tweetDf, userDf = get_api_data(['McDonalds', 'RuthsChris'], replies = False, numTweets = 300, tweetsPerPage = 100,print_status=True)

Retrieved 80 tweets for: Schlotzskys
Retrieved 80 tweets for: AuntieAnnes
Retrieved 80 tweets for: SaltgrassSteak
Retrieved 80 tweets for: redlobster
Retrieved 80 tweets for: Hardees
Retrieved 80 tweets for: RuthsChris
Retrieved 80 tweets for: LongHornSteaks
Retrieved 80 tweets for: FiveGuys
Retrieved 80 tweets for: Applebees
Retrieved 80 tweets for: DelTaco
Retrieved 80 tweets for: PFChangs
Retrieved 80 tweets for: BonefishGrill
Retrieved 80 tweets for: Charleys
Retrieved 80 tweets for: EinsteinBros
Retrieved 80 tweets for: qdoba
Retrieved 80 tweets for: torchystacos
Retrieved 80 tweets for: raisingcanes
Retrieved 80 tweets for: Cheesecake
Retrieved 80 tweets for: WaffleHouse
Retrieved 80 tweets for: CheckersRallys
Retrieved 80 tweets for: SmoothieKing
Retrieved 80 tweets for: CaptainDs
Retrieved 80 tweets for: WhiteCastle
Retrieved 80 tweets for: papamurphys
Retrieved 80 tweets for: caferio
Retrieved 80 tweets for: ChuysRestaurant
Retrieved 80 tweets for: ChurchsChicken
Retrieved 80 

In [175]:
noreply_tweetDf.duplicated().sum()

0

In [176]:
noreply_tweetDf.shape[0]

76244

In [177]:
noreply_tweetDf

Unnamed: 0_level_0,handle,followers,text,lang,possibly_sensitive,retweet_count,reply_count,like_count,quote_count,reply_settings,...,contains_quote,is_reply,num_referenced_tweets,url_image,num_hashtags,text_first_hashtag,num_mentions,num_cashtags,num_polls,interaction_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1111396547353341952,Applebees,599599,I’ll never forget the friends I made at happy ...,en,False,10,3,52,2,everyone,...,False,False,0,,0,,0,0,0,0.000112
1134206851682377728,Applebees,599599,mimosas are the champagne of OJ,en,False,6,1,53,1,everyone,...,False,False,0,,0,,0,0,0,0.000083
1134208864180736001,Applebees,599599,If there had been a boneless wings club in hig...,en,False,29,10,109,10,everyone,...,False,False,0,,0,,0,0,0,0.000319
1134211884570296320,Applebees,599599,Beer should start a podcast,en,False,9,6,54,3,everyone,...,False,False,0,,0,,0,0,0,0.000120
1134214148538142725,Applebees,599599,best shot/worst shot?,en,False,1,18,40,9,everyone,...,False,False,0,,0,,0,0,0,0.000128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250441241340162048,wingstop,263463,Take advantage of that free delivery fam at ht...,en,False,2,8,28,0,everyone,...,True,False,1,,0,,0,0,0,0.000106
1250499200804179979,wingstop,263463,Free Wingstop delivery has no competition. htt...,en,False,67,16,339,12,everyone,...,False,False,0,,0,,0,0,0,0.001649
1250544447798104067,wingstop,263463,Wingstop + _________. Fill in the blank for ho...,en,False,6,15,39,10,everyone,...,False,False,0,,1,WingingitFromHome,0,0,0,0.000351
1289274555794677760,wingstop,263463,Salute to ya (^-^)ゝ https://t.co/fhdPMcRFO8,en,False,252,8,1034,36,everyone,...,False,False,0,,0,,0,0,0,0.005409


In [178]:
noreply_tweetDf.to_csv('noReplies.csv')
reply_tweetDf.to_csv('replies.csv')
userDf.to_csv('userData.csv')

In [None]:
len(noreply_tweetDf[noreply_tweetDf.url_image != ""])

251