In [34]:
# ### Mount Drive

# from google.colab import drive
# drive.mount('/content/drive/')

In [35]:
### Imports

import os, requests, json
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from sklearn import preprocessing
import sys

In [36]:
### Constants

WEIGHTS = {"retweet" : 3, "like" : 0.5 ,"quote" : 4 ,"reply" : 1}

In [37]:
bearer_token = "AAAAAAAAAAAAAAAAAAAAAGXdTwEAAAAAr2%2BC9Wi6GHR8%2Bk%2FiDL2AIHaC1I8%3D86fg9nIXAt2MFp0QP1sXU0q1VFKHAGaD1da68qG4X0glvGSh4D"

def response_health(r):
  if r.status_code != 200:
    raise Exception(
    "Request returned an error: {} {}".format(
      r.status_code, r.text
    )
  )
    
def bearer_oauth(r):
  r.headers["Authorization"] = f"Bearer {bearer_token}"
  return r

def send_request(url, params=None, print_status=False):
  '''Send Request (url) with optional params. Returns json'''
  # https://2.python-requests.org/en/master/api/#requests.request
  if params == None:
    response = requests.request("GET", url, auth=bearer_oauth)
  else:
    response = requests.request("GET", url, auth=bearer_oauth, params=params)
  if print_status: print("Request response status: ", response.status_code)
  response_health(response)
  return response.json()

In [38]:
def get_user_data(name):
  # data dictionary scroll down to response fields https://developer.twitter.com/en/docs/twitter-api/users/lookup/api-reference/get-users-by-username-username

  userFields = {"user.fields":"created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld".replace(" ", "")}
  user_json = send_request(f"https://api.twitter.com/2/users/by/username/{name}",params=userFields)
  user_json = user_json["data"]

  outputDict = {}
  outputDict['following_count'] = user_json['public_metrics']['following_count']
  outputDict['tweet_count'] = user_json['public_metrics']['tweet_count']
  outputDict['followers_count'] = user_json['public_metrics']['followers_count']
  outputDict['listed_count'] = user_json['public_metrics']['listed_count']
  outputDict['username'] = user_json['username']
  outputDict['name'] = user_json['name']
  outputDict['id'] = user_json['id']
  outputDict['verified'] = user_json['verified']
  outputDict['protected'] = user_json['protected']
  outputDict['created_at'] = user_json['created_at']
  outputDict['description'] = user_json['description']

  try:
    test = user_json['pinned_tweet_id']
    outputDict['hasPinnedTweet'] = True
  except:
    outputDict['hasPinnedTweet'] = False
    pass
  try:
    outputDict['urlsInDescription'] = len(user_json['entities']['description']['urls'])
  except:
    outputDict['urlsInDescription'] = 0
    pass
  try:
    outputDict['hashtagsInDescription'] = len(user_json['entities']['description']['hashtags'])
  except:
    outputDict['hashtagsInDescription'] = 0
    pass

  try:
    outputDict['userWebsitesAdded'] = len(user_json['entities']['url']['urls'])
  except:
    outputDict['userWebsitesAdded'] = 0
    pass

  try:
    outputDict['cashtagsInDescription'] = len(user_json['entities']['description']['cashtags'])
  except:
    outputDict['cashtagsInDescription'] = 0
    pass

  try:
    outputDict['mentionsInDescription'] = len(user_json['entities']['description']['mentions'])
  except:
    outputDict['mentionsInDescription'] = 0
    pass

  
  return outputDict

In [39]:
def get_tweets_user(id, numTweets = 10, tweetsPerPage = 10, replies = False, weights=WEIGHTS, paginationToken = None):
    import math
    
    # (there are 10 results returned per page by default)
    if numTweets < tweetsPerPage:
        print("numTweets must be greater than or equal to the number of tweets per page.")
        return

    # to see data dictionary, click url and scroll down to response fields https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets
    expansions = {"expansions":"author_id, attachments.poll_ids, attachments.media_keys, entities.mentions.username, geo.place_id, in_reply_to_user_id, referenced_tweets.id,referenced_tweets.id.author_id".replace(" ", "")}
    tweetFields = {"tweet.fields":"attachments, author_id, context_annotations, conversation_id, created_at, entities, geo, id, in_reply_to_user_id, lang, public_metrics, possibly_sensitive, referenced_tweets, reply_settings, source, text, withheld".replace(" ", "")}
    userFields = {"user.fields":"public_metrics,username"}
    replyFields = {"exclude": "replies"}

    outputDict = {'id':[],'handle':[],'followers':[], 'text':[], 'lang':[],'possibly_sensitive':[],'retweet_count':[],'reply_count':[],'like_count':[],'quote_count':[]
        ,'reply_settings':[],'source':[],'created_at':[],'is_retweet':[],'contains_quote':[],'is_reply':[],'num_referenced_tweets':[],
        'url_image':[],'num_hashtags':[],'text_first_hashtag':[],'num_mentions':[],'num_cashtags':[],'num_polls':[]}

    if not replies:
        outputDict['interaction_score'] = []
    
    if paginationToken != None: 
        nextToken = paginationToken
        
    # for each page of results
    for i in range(math.ceil(numTweets/tweetsPerPage)): 
        if i != 0 or paginationToken != None:
            if replies:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields,**userFields , **{"pagination_token":nextToken}}
            else:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields,**userFields , **{"pagination_token":nextToken}, **replyFields}
        else:
            if replies:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields, **userFields}
            else:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields, **userFields, **replyFields}


        tweet_json = (send_request(f"https://api.twitter.com/2/users/{id}/tweets", params=params))
        
        tweetData = tweet_json['data']

        username = tweet_json['includes']['users'][0]['username']
        followers = tweet_json['includes']['users'][0]['public_metrics']['followers_count']

        for tweet in tweetData:
            outputDict['handle'].append(username)
            outputDict['followers'].append(followers)
            outputDict['id'].append(tweet['id'])
            outputDict['text'].append(tweet['text'])
            outputDict['lang'].append(tweet['lang'])
            outputDict['possibly_sensitive'].append(tweet['possibly_sensitive'])

            outputDict['retweet_count'].append(tweet['public_metrics']['retweet_count'])
            outputDict['reply_count'].append(tweet['public_metrics']['reply_count'])
            outputDict['like_count'].append(tweet['public_metrics']['like_count'])
            outputDict['quote_count'].append(tweet['public_metrics']['quote_count'])
            outputDict['reply_settings'].append(tweet['reply_settings'])
            outputDict['source'].append(tweet['source'])
            outputDict['created_at'].append(tweet['created_at'])

            if not replies:
                outputDict['interaction_score'].append((weights['retweet'] *outputDict['retweet_count'][-1] + weights['like'] *outputDict['like_count'][-1]+
                                                        weights['reply'] *outputDict['reply_count'][-1] + weights['quote'] *outputDict['quote_count'][-1])/followers)

            # referenced tweets: quotes, replies, and retweets
            try:
                refdTweets = tweet['referenced_tweets']
                outputDict['num_referenced_tweets'].append(len(refdTweets))

                rtweet = False
                reply = False
                quote = False
                # there may be multiple referenced tweets, apparently. So it could be a reply and contain a quote, I guess
                for t in refdTweets:
                    typ = t['type']
                    if typ == 'retweeted':
                        outputDict['is_retweet'].append(True)
                        rtweet = True
                    elif typ == 'quoted':
                        outputDict['contains_quote'].append(True)
                        quote = True
                    elif typ == 'replied_to':
                        outputDict['is_reply'].append(True)
                        reply = True
                        
                if not rtweet:
                        outputDict['is_retweet'].append(False)
                if not reply:
                        outputDict['is_reply'].append(False)
                if not quote:
                        outputDict['contains_quote'].append(False)
            except:
                outputDict['num_referenced_tweets'].append(0)
                outputDict['is_retweet'].append(False)
                outputDict['contains_quote'].append(False)
                outputDict['is_reply'].append(False)


            # image
            try:
                outputDict['url_image'].append(tweet['entities']['urls'][0]['images'][0]['url'])  #just grabbing the first image in the first url
            except:
                outputDict['url_image'].append("")

            # hashtags
            try:
                outputDict['num_hashtags'].append(len(tweet['entities']['hashtags']))
                # grabbing just the first hashtag
                outputDict['text_first_hashtag'].append(tweet['entities']['hashtags'][0]['tag'])
            except:
                outputDict['num_hashtags'].append(0) 
                outputDict['text_first_hashtag'].append("")

            # mentions
            try: outputDict['num_mentions'].append(len(tweet['entities']['mentions']))
            except: outputDict['num_mentions'].append(0)

            # cashtags
            try: outputDict['num_cashtags'].append(len(tweet['entities']['cashtags']))
            except: outputDict['num_cashtags'].append(0)
            
            # polls
            try: outputDict['num_polls'].append(len(tweet['attachments']['poll_ids']))
            except: outputDict['num_polls'].append(0)


        nextToken = tweet_json['meta']['next_token']
    
    df = pd.DataFrame(outputDict)

    tokenDict = {outputDict['handle'][-1] : nextToken}

    return df, tokenDict


In [45]:
def get_api_data(usernames, replies = False, numTweets = 10, tweetsPerPage = 10, print_status = False):
    import time
    usersDict =  {'following_count':[], 'tweet_count':[], 'followers_count':[], 'listed_count':[], 'username':[], 'name':[], 'id':[], 
    'verified': [], 'protected': [],'created_at': [],'description': [], 'hasPinnedTweet':[], 'urlsInDescription':[], 'hashtagsInDescription':[],
    'userWebsitesAdded':[], 'cashtagsInDescription':[],'mentionsInDescription':[]}

    idDict = {}

    for username in usernames:
        # get data related to user account
        userData = get_user_data(username)
        
        # create id - username mapping dictionary for use in next loop
        idDict[userData['username']] = userData['id']
        
        # build the user data dataframe
        for k, v in userData.items():
            usersDict[k].append(v)

    df_user = pd.DataFrame(usersDict)
    
    if not replies:
        #if we are excluding replies, we want to keep pulling until we have at least 500 images
        numImages = 0
        tokenDict = {}
        firstIteration = True
        loopCount = 1
        df_tweets = None
        while (numImages < 500):
            for username in usernames:
                # get data from tweets of the user

                # if we left off on a page, then jump to the next page of tweet results
                try:
                    nextToken = tokenDict[username]
                    df_sub,tokenD = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies, paginationToken=nextToken)
                except: 
                    df_sub,tokenD = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies)

                tokenDict[list(tokenD.keys())[0]] = list(tokenD.values())[0]
                if not firstIteration:
                    df_tweets = pd.concat([df_tweets, df_sub])
                else:
                    df_tweets = df_sub
                    firstIteration = False
                if print_status: 
                    num_tweets = len(df_sub)
                    print(f"Retrieved {num_tweets} tweets for: {username}")
            numImages = len(df_tweets[df_tweets.url_image != ""])
            if print_status: print(f"{loopCount} iterations through while loop. {numImages} images retrieved.")
            if loopCount % 3 == 0: # if it is a multiple of 3 
                # we have a limit of 900 requests per 15 minute window.
                print("waiting 13 minutes...")
                time.sleep(60*13) # wait 13 minutes


            loopCount += 1
            if loopCount == 5: 
                return df_tweets, df_user
    else:
        df_tweets = None
        firstIteration = True
        for username in usernames:
            # get data from tweets of the user
            df_sub = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies)[0]
            if not firstIteration:
                df_tweets = pd.concat([df_tweets, df_sub])
            else:
                df_tweets = df_sub
                firstIteration = False
            if print_status: 
                num_tweets = len(df_sub)
                print(f"Retrieved {num_tweets} tweets for: {username}")
        
    

    return df_tweets, df_user



In [41]:
df = pd.read_csv('fast-food-chains - Sheet1.csv')

df.drop(df.columns[0], axis=1, inplace=True)
df.loc[df.shape[0]] = df.columns
df.columns = ["handle"]
twitter_handles = df[df.columns[0]].tolist()
print(len(twitter_handles))

96


In [46]:
reply_tweetDf, userDf = get_api_data(twitter_handles, replies = True, numTweets = 80, tweetsPerPage = 80, print_status=True)
noreply_tweetDf, userDf = get_api_data(twitter_handles, replies = False, numTweets = 300, tweetsPerPage = 100,print_status=True)

Retrieved 80 tweets for: Schlotzskys
Retrieved 80 tweets for: AuntieAnnes
Retrieved 80 tweets for: SaltgrassSteak
Retrieved 80 tweets for: redlobster
Retrieved 80 tweets for: Hardees
Retrieved 80 tweets for: RuthsChris
Retrieved 80 tweets for: LongHornSteaks
Retrieved 80 tweets for: FiveGuys
Retrieved 80 tweets for: Applebees
Retrieved 80 tweets for: DelTaco
Retrieved 80 tweets for: PFChangs
Retrieved 80 tweets for: BonefishGrill
Retrieved 80 tweets for: Charleys
Retrieved 80 tweets for: EinsteinBros
Retrieved 80 tweets for: qdoba
Retrieved 80 tweets for: torchystacos
Retrieved 80 tweets for: raisingcanes
Retrieved 80 tweets for: Cheesecake
Retrieved 80 tweets for: WaffleHouse
Retrieved 80 tweets for: CheckersRallys
Retrieved 80 tweets for: SmoothieKing
Retrieved 80 tweets for: CaptainDs
Retrieved 80 tweets for: WhiteCastle
Retrieved 80 tweets for: papamurphys
Retrieved 80 tweets for: caferio
Retrieved 80 tweets for: ChuysRestaurant
Retrieved 80 tweets for: ChurchsChicken
Retrieved 80 

In [51]:
df = noreply_tweetDf
df['duplicated'] = df.duplicated()

In [55]:
pd.set_option("display.max_rows", 90)
df[df['duplicated'] == True]['handle'].value_counts()

Schlotzskys        600
LongHornSteaks     600
calpizzakitchen    600
CapitalGrille      600
RuthsChris         599
rubytuesday        599
TGIFridays         599
habitburger        599
EatAtPerkins       599
CaptainDs          599
ElPolloLoco        598
SaltgrassSteak     598
TSmoothieCafe      598
JambaJuice         597
Outback            597
Maggianos          597
IHOP               597
ChuysRestaurant    597
RoundTablePizza    597
qdoba              597
Hardees            597
McAlistersDeli     597
caferio            596
SteaknShake        596
bjsrestaurants     596
jasonsdeli         596
CarlsJr            596
tacojohns          596
MellowMushroom     596
OCharleys          595
ColdStone          595
DelTaco            595
MillersAleHouse    595
torchystacos       595
EinsteinBros       595
MarcosPizza        594
MODPizza           594
pollotropical      594
eatatjacks         593
noodlescompany     593
olivegarden        593
BaskinRobbins      593
JetsPizza          593
ZoesKitchen

In [47]:
noreply_tweetDf.to_csv('noReplies.csv')
reply_tweetDf.to_csv('replies.csv')
userDf.to_csv('userData.csv')


In [None]:
pd.set_option("display.max_rows", 50)
# noreply_tweetDf.head(180)

In [None]:
# dfUser = apiData['userData']
# dfUser.to_csv('userData.csv')

userDf.to_csv("userData")

251

In [44]:
# dfUser = apiData['userData']
# for key, value in apiData:
#   value.to_csv(f'{key}.csv')

reply_tweetDf.to_csv("replies.csv")

Unnamed: 0,id,handle,followers,text,lang,possibly_sensitive,retweet_count,reply_count,like_count,quote_count,...,contains_quote,is_reply,num_referenced_tweets,url_image,num_hashtags,text_first_hashtag,num_mentions,num_cashtags,num_polls,interaction_score
0,1470424336033013766,McDonalds,4394661,"okay, it happened. everyone gets a free Big Ma...",en,False,509,343,5331,172,...,False,True,1,,0,,0,0,0,0.001189
1,1470423474846027782,McDonalds,4394661,if @mariahcarey retweets this everyone gets a ...,en,False,1695,700,10658,221,...,False,False,0,,0,,1,0,0,0.002730
2,1469332042232049665,McDonalds,4394661,just got Mariah Carey’s rider and i’ve got thr...,en,False,444,242,2848,109,...,False,False,0,,0,,0,0,0,0.000781
3,1467971503987769347,McDonalds,4394661,ab_ _efghijkl_nopqrstuvwxyz,cs,False,50337,6972,240364,15336,...,False,False,0,,0,,0,0,0,0.077255
4,1467882876737884162,McDonalds,4394661,RT @MariahCarey: Check out my new @McDonalds c...,en,False,865,0,0,0,...,False,False,1,https://pbs.twimg.com/news_img/147041483433651...,0,,2,0,0,0.000590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,1339994618952359936,McDonalds,4394661,"nice try, you still get medium fries free in o...",en,False,44,73,958,4,...,False,True,1,,0,,0,0,0,0.000159
236,1339994617937342466,McDonalds,4394661,say free french fry friday three times fast,en,False,141,368,2895,49,...,False,False,0,,0,,0,0,0,0.000554
237,1339670130754793473,McDonalds,4394661,what’s something that ISN’T a McDonald’s Sprit...,en,False,207,1343,4346,378,...,False,False,0,,0,,0,0,0,0.001285
238,1338846744780693515,McDonalds,4394661,we did it. get your free Big Mac on our App\n\...,en,False,147,138,2050,39,...,False,True,1,,0,,0,0,0,0.000400
